-http://dlc.sun.com/osol/on/downloads/b117/on-src.tar.bz2
+http://dlc.sun.com/osol/on/downloads/b121/on-src.tar.bz2
(u_longlong_t)ds->ds_snapnames_zapobj);
(void) printf("\t\tnum_children = %llu\n",
(u_longlong_t)ds->ds_num_children);
+ (void) printf("\t\tuserrefs_obj = %llu\n",
+ (u_longlong_t)ds->ds_userrefs_obj);
(void) printf("\t\tcreation_time = %s", ctime(&crtime));
(void) printf("\t\tcreation_txg = %llu\n",
(u_longlong_t)ds->ds_creation_txg);
dump_zap, /* DSL scrub queue */
dump_zap, /* ZFS user/group used */
dump_zap, /* ZFS user/group quota */
+ dump_zap, /* snapshot refcount tags */
};
static void
cb.cb_types = types;
cb.cb_depth_limit = limit;
/*
- * If cb_proplist is provided then in the zfs_handles created we
+ * If cb_proplist is provided then in the zfs_handles created we
* retain only those properties listed in cb_proplist and sortcol.
* The rest are pruned. So, the caller should make sure that no other
* properties other than those listed in cb_proplist/sortcol are
static int zfs_do_promote(int argc, char **argv);
static int zfs_do_userspace(int argc, char **argv);
static int zfs_do_python(int argc, char **argv);
+static int zfs_do_hold(int argc, char **argv);
+static int zfs_do_release(int argc, char **argv);
/*
* Enable a reasonable set of defaults for libumem debugging on DEBUG builds.
HELP_ALLOW,
HELP_UNALLOW,
HELP_USERSPACE,
- HELP_GROUPSPACE
+ HELP_GROUPSPACE,
+ HELP_HOLD,
+ HELP_HOLDS,
+ HELP_RELEASE
} zfs_help_t;
typedef struct zfs_command {
{ "allow", zfs_do_python, HELP_ALLOW },
{ NULL },
{ "unallow", zfs_do_python, HELP_UNALLOW },
+ { NULL },
+ { "hold", zfs_do_hold, HELP_HOLD },
+ { "holds", zfs_do_python, HELP_HOLDS },
+ { "release", zfs_do_release, HELP_RELEASE },
};
#define NCOMMAND (sizeof (command_table) / sizeof (command_table[0]))
"-V <size> <volume>\n"));
case HELP_DESTROY:
return (gettext("\tdestroy [-rRf] "
- "<filesystem|volume|snapshot>\n"));
+ "<filesystem|volume|snapshot>\n"
+ "\tdestroy -d [-r] <filesystem|volume|snapshot>\n"));
case HELP_GET:
return (gettext("\tget [-rHp] [-d max] "
"[-o field[,...]] [-s source[,...]]\n"
return (gettext("\tunmount [-f] "
"<-a | filesystem|mountpoint>\n"));
case HELP_UNSHARE:
- return (gettext("\tunshare [-f] "
+ return (gettext("\tunshare "
"<-a | filesystem|mountpoint>\n"));
case HELP_ALLOW:
return (gettext("\tallow <filesystem|volume>\n"
return (gettext("\tgroupspace [-hniHpU] [-o field[,...]] "
"[-sS field] ... [-t type[,...]]\n"
"\t <filesystem|snapshot>\n"));
+ case HELP_HOLD:
+ return (gettext("\thold [-r] <tag> <snapshot> ...\n"));
+ case HELP_HOLDS:
+ return (gettext("\tholds [-r] <snapshot> ...\n"));
+ case HELP_RELEASE:
+ return (gettext("\trelease [-r] <tag> <snapshot> ...\n"));
}
abort();
}
/*
- * zfs destroy [-rf] <fs, snap, vol>
+ * zfs destroy [-rRf] <fs, snap, vol>
+ * zfs destroy -d [-r] <fs, snap, vol>
*
* -r Recursively destroy all children
* -R Recursively destroy all dependents, including clones
* -f Force unmounting of any dependents
+ * -d If we can't destroy now, mark for deferred destruction
*
* Destroys the given dataset. By default, it will unmount any filesystems,
* and refuse to destroy a dataset that has any dependents. A dependent can
boolean_t cb_closezhp;
zfs_handle_t *cb_target;
char *cb_snapname;
+ boolean_t cb_defer_destroy;
} destroy_cbdata_t;
/*
* Bail out on the first error.
*/
if (zfs_unmount(zhp, NULL, cbp->cb_force ? MS_FORCE : 0) != 0 ||
- zfs_destroy(zhp) != 0) {
+ zfs_destroy(zhp, cbp->cb_defer_destroy) != 0) {
zfs_close(zhp);
return (-1);
}
char *cp;
/* check options */
- while ((c = getopt(argc, argv, "frR")) != -1) {
+ while ((c = getopt(argc, argv, "dfrR")) != -1) {
switch (c) {
+ case 'd':
+ cb.cb_defer_destroy = B_TRUE;
+ break;
case 'f':
cb.cb_force = 1;
break;
usage(B_FALSE);
}
+ if (cb.cb_defer_destroy && cb.cb_doclones)
+ usage(B_FALSE);
+
/*
* If we are doing recursive destroy of a snapshot, then the
* named snapshot may not exist. Go straight to libzfs.
}
}
- ret = zfs_destroy_snaps(zhp, cp);
+ ret = zfs_destroy_snaps(zhp, cp, cb.cb_defer_destroy);
zfs_close(zhp);
if (ret) {
(void) fprintf(stderr,
return (ret != 0);
}
-
/* Open the given dataset */
if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL)
return (1);
* Check for any dependents and/or clones.
*/
cb.cb_first = B_TRUE;
- if (!cb.cb_doclones &&
+ if (!cb.cb_doclones && !cb.cb_defer_destroy &&
zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent,
&cb) != 0) {
zfs_close(zhp);
return (1);
}
- if (cb.cb_error ||
- zfs_iter_dependents(zhp, B_FALSE, destroy_callback, &cb) != 0) {
+ if (cb.cb_error || (!cb.cb_defer_destroy &&
+ (zfs_iter_dependents(zhp, B_FALSE, destroy_callback, &cb) != 0))) {
zfs_close(zhp);
return (1);
}
if (destroy_callback(zhp, &cb) != 0)
return (1);
-
return (0);
}
(void) printf(gettext(" 1 Initial ZFS filesystem version\n"));
(void) printf(gettext(" 2 Enhanced directory entries\n"));
(void) printf(gettext(" 3 Case insensitive and File system "
- "unique identifer (FUID)\n"));
+ "unique identifier (FUID)\n"));
(void) printf(gettext(" 4 userquota, groupquota "
"properties\n"));
(void) printf(gettext("\nFor more information on a particular "
return (err != 0);
}
+static int
+zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding)
+{
+ int errors = 0;
+ int i;
+ const char *tag;
+ boolean_t recursive = B_FALSE;
+ int c;
+ int (*func)(zfs_handle_t *, const char *, const char *, boolean_t);
+
+ /* check options */
+ while ((c = getopt(argc, argv, "r")) != -1) {
+ switch (c) {
+ case 'r':
+ recursive = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* check number of arguments */
+ if (argc < 2)
+ usage(B_FALSE);
+
+ tag = argv[0];
+ --argc;
+ ++argv;
+
+ if (holding) {
+ if (tag[0] == '.') {
+ /* tags starting with '.' are reserved for libzfs */
+ (void) fprintf(stderr,
+ gettext("tag may not start with '.'\n"));
+ usage(B_FALSE);
+ }
+ func = zfs_hold;
+ } else {
+ func = zfs_release;
+ }
+
+ for (i = 0; i < argc; ++i) {
+ zfs_handle_t *zhp;
+ char parent[ZFS_MAXNAMELEN];
+ const char *delim;
+ char *path = argv[i];
+
+ delim = strchr(path, '@');
+ if (delim == NULL) {
+ (void) fprintf(stderr,
+ gettext("'%s' is not a snapshot\n"), path);
+ ++errors;
+ continue;
+ }
+ (void) strncpy(parent, path, delim - path);
+ parent[delim - path] = '\0';
+
+ zhp = zfs_open(g_zfs, parent,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+ if (zhp == NULL) {
+ ++errors;
+ continue;
+ }
+ if (func(zhp, delim+1, tag, recursive) != 0)
+ ++errors;
+ zfs_close(zhp);
+ }
+
+ return (errors != 0);
+}
+
+/*
+ * zfs hold [-r] <tag> <snap> ...
+ *
+ * -r Recursively hold
+ *
+ * Apply a user-hold with the given tag to the list of snapshots.
+ */
+static int
+zfs_do_hold(int argc, char **argv)
+{
+ return (zfs_do_hold_rele_impl(argc, argv, B_TRUE));
+}
+
+/*
+ * zfs release [-r] <tag> <snap> ...
+ *
+ * -r Recursively release
+ *
+ * Release a user-hold with the given tag from the list of snapshots.
+ */
+static int
+zfs_do_release(int argc, char **argv)
+{
+ return (zfs_do_hold_rele_impl(argc, argv, B_FALSE));
+}
+
typedef struct get_all_cbdata {
zfs_handle_t **cb_handles;
size_t cb_alloc;
*/
static int
do_import(nvlist_t *config, const char *newname, const char *mntopts,
- int force, nvlist_t *props, boolean_t allowfaulted)
+ int force, nvlist_t *props, boolean_t do_verbatim)
{
zpool_handle_t *zhp;
char *name;
}
}
- if (zpool_import_props(g_zfs, config, newname, props,
- allowfaulted) != 0)
+ if (zpool_import_props(g_zfs, config, newname, props, do_verbatim) != 0)
return (1);
if (newname != NULL)
name = (char *)newname;
- verify((zhp = zpool_open_canfail(g_zfs, name)) != NULL);
+ if ((zhp = zpool_open_canfail(g_zfs, name)) == NULL)
+ return (1);
if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL &&
zpool_enable_datasets(zhp, mntopts, 0) != 0) {
* -F Import even in the presence of faulted vdevs. This is an
* intentionally undocumented option for testing purposes, and
* treats the pool configuration as complete, leaving any bad
- * vdevs in the FAULTED state.
+ * vdevs in the FAULTED state. In other words, it does verbatim
+ * import.
*
* -a Import all pools found.
*
nvlist_t *found_config;
nvlist_t *props = NULL;
boolean_t first;
- boolean_t allow_faulted = B_FALSE;
+ boolean_t do_verbatim = B_FALSE;
uint64_t pool_state;
char *cachefile = NULL;
do_force = B_TRUE;
break;
case 'F':
- allow_faulted = B_TRUE;
+ do_verbatim = B_TRUE;
break;
case 'o':
if ((propval = strchr(optarg, '=')) != NULL) {
if (do_all)
err |= do_import(config, NULL, mntopts,
- do_force, props, allow_faulted);
+ do_force, props, do_verbatim);
else
show_import(config);
} else if (searchname != NULL) {
err = B_TRUE;
} else {
err |= do_import(found_config, argc == 1 ? NULL :
- argv[1], mntopts, do_force, props, allow_faulted);
+ argv[1], mntopts, do_force, props, do_verbatim);
}
}
"replace'.\n"));
break;
+ case ZPOOL_STATUS_REMOVED_DEV:
+ (void) printf(gettext("status: One or more devices has "
+ "been removed by the administrator.\n\tSufficient "
+ "replicas exist for the pool to continue functioning in "
+ "a\n\tdegraded state.\n"));
+ (void) printf(gettext("action: Online the device using "
+ "'zpool online' or replace the device with\n\t'zpool "
+ "replace'.\n"));
+ break;
+
+
case ZPOOL_STATUS_RESILVERING:
(void) printf(gettext("status: One or more devices is "
"currently being resilvered. The pool will\n\tcontinue "
(void) printf(gettext(" 14 passthrough-x aclinherit\n"));
(void) printf(gettext(" 15 user/group space accounting\n"));
(void) printf(gettext(" 16 stmf property support\n"));
+ (void) printf(gettext(" 17 Triple-parity RAID-Z\n"));
+ (void) printf(gettext(" 18 snapshot user holds\n"));
(void) printf(gettext("For more information on a particular "
"version, including supported releases, see:\n\n"));
(void) printf("http://www.opensolaris.org/os/community/zfs/"
"refquota set",
"refreservation set",
"pool scrub done",
+ "user hold",
+ "user release",
};
/*
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <libdiskmgt.h>
#include <libintl.h>
#include <libnvpair.h>
+#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
}
static const char *
-is_grouping(const char *type, int *mindev)
+is_grouping(const char *type, int *mindev, int *maxdev)
{
- if (strcmp(type, "raidz") == 0 || strcmp(type, "raidz1") == 0) {
- if (mindev != NULL)
- *mindev = 2;
- return (VDEV_TYPE_RAIDZ);
- }
+ if (strncmp(type, "raidz", 5) == 0) {
+ const char *p = type + 5;
+ char *end;
+ long nparity;
+
+ if (*p == '\0') {
+ nparity = 1;
+ } else if (*p == '0') {
+ return (NULL); /* no zero prefixes allowed */
+ } else {
+ errno = 0;
+ nparity = strtol(p, &end, 10);
+ if (errno != 0 || nparity < 1 || nparity >= 255 ||
+ *end != '\0')
+ return (NULL);
+ }
- if (strcmp(type, "raidz2") == 0) {
if (mindev != NULL)
- *mindev = 3;
+ *mindev = nparity + 1;
+ if (maxdev != NULL)
+ *maxdev = 255;
return (VDEV_TYPE_RAIDZ);
}
+ if (maxdev != NULL)
+ *maxdev = INT_MAX;
+
if (strcmp(type, "mirror") == 0) {
if (mindev != NULL)
*mindev = 2;
construct_spec(int argc, char **argv)
{
nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
- int t, toplevels, mindev, nspares, nlogs, nl2cache;
+ int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
const char *type;
uint64_t is_log;
boolean_t seen_logs;
* If it's a mirror or raidz, the subsequent arguments are
* its leaves -- until we encounter the next mirror or raidz.
*/
- if ((type = is_grouping(argv[0], &mindev)) != NULL) {
+ if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) {
nvlist_t **child = NULL;
int c, children = 0;
}
for (c = 1; c < argc; c++) {
- if (is_grouping(argv[c], NULL) != NULL)
+ if (is_grouping(argv[c], NULL, NULL) != NULL)
break;
children++;
child = realloc(child,
return (NULL);
}
+ if (children > maxdev) {
+ (void) fprintf(stderr, gettext("invalid vdev "
+ "specification: %s supports no more than "
+ "%d devices\n"), argv[0], maxdev);
+ return (NULL);
+ }
+
argc -= c;
argv += c;
zopt_raidz = MAX(1, value);
break;
case 'R':
- zopt_raidz_parity = MIN(MAX(value, 1), 2);
+ zopt_raidz_parity = MIN(MAX(value, 1), 3);
break;
case 'd':
zopt_datasets = MAX(1, value);
/*
* Destroy the dataset.
*/
- error = dmu_objset_destroy(name);
+ error = dmu_objset_destroy(name, B_FALSE);
if (error) {
(void) dmu_objset_open(name, DMU_OST_OTHER,
DS_MODE_USER | DS_MODE_READONLY, &os);
zil_close(zilog);
dmu_objset_close(os);
- error = dmu_objset_destroy(name);
+ error = dmu_objset_destroy(name, B_FALSE);
if (error)
fatal(0, "dmu_objset_destroy(%s) = %d", name, error);
(void) snprintf(snapname, 100, "%s@%llu", osname,
(u_longlong_t)za->za_instance);
- error = dmu_objset_destroy(snapname);
+ error = dmu_objset_destroy(snapname, B_FALSE);
if (error != 0 && error != ENOENT)
fatal(0, "dmu_objset_destroy() = %d", error);
error = dmu_objset_snapshot(osname, strchr(snapname, '@')+1,
(void) snprintf(clone2name, 100, "%s/c2_%llu", osname, curval);
(void) snprintf(snap3name, 100, "%s@s3_%llu", clone1name, curval);
- error = dmu_objset_destroy(clone2name);
+ error = dmu_objset_destroy(clone2name, B_FALSE);
if (error && error != ENOENT)
fatal(0, "dmu_objset_destroy(%s) = %d", clone2name, error);
- error = dmu_objset_destroy(snap3name);
+ error = dmu_objset_destroy(snap3name, B_FALSE);
if (error && error != ENOENT)
fatal(0, "dmu_objset_destroy(%s) = %d", snap3name, error);
- error = dmu_objset_destroy(snap2name);
+ error = dmu_objset_destroy(snap2name, B_FALSE);
if (error && error != ENOENT)
fatal(0, "dmu_objset_destroy(%s) = %d", snap2name, error);
- error = dmu_objset_destroy(clone1name);
+ error = dmu_objset_destroy(clone1name, B_FALSE);
if (error && error != ENOENT)
fatal(0, "dmu_objset_destroy(%s) = %d", clone1name, error);
- error = dmu_objset_destroy(snap1name);
+ error = dmu_objset_destroy(snap1name, B_FALSE);
if (error && error != ENOENT)
fatal(0, "dmu_objset_destroy(%s) = %d", snap1name, error);
}
EZFS_NOTSUP, /* ops not supported on this dataset */
EZFS_ACTIVE_SPARE, /* pool has active shared spare devices */
EZFS_UNPLAYED_LOGS, /* log device has unplayed logs */
+ EZFS_REFTAG_RELE, /* snapshot release: tag not found */
+ EZFS_REFTAG_HOLD, /* snapshot hold: tag already exists */
EZFS_UNKNOWN
};
ZPOOL_STATUS_VERSION_OLDER, /* older on-disk version */
ZPOOL_STATUS_RESILVERING, /* device being resilvered */
ZPOOL_STATUS_OFFLINE_DEV, /* device online */
+ ZPOOL_STATUS_REMOVED_DEV, /* removed device */
/*
* Finally, the following indicates a healthy pool.
extern int zfs_create(libzfs_handle_t *, const char *, zfs_type_t,
nvlist_t *);
extern int zfs_create_ancestors(libzfs_handle_t *, const char *);
-extern int zfs_destroy(zfs_handle_t *);
-extern int zfs_destroy_snaps(zfs_handle_t *, char *);
+extern int zfs_destroy(zfs_handle_t *, boolean_t);
+extern int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t);
extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *);
extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *);
extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t);
extern int zfs_send(zfs_handle_t *, const char *, const char *,
boolean_t, boolean_t, boolean_t, boolean_t, int);
extern int zfs_promote(zfs_handle_t *);
+extern int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t);
+extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t);
typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain,
uid_t rid, uint64_t space);
&idx);
uu_list_insert(clp->cl_list, cn, idx);
} else {
+ /*
+ * Add this child to beginning of the list. Children
+ * below this one in the hierarchy will get added above
+ * this one in the list. This produces a list in
+ * reverse dataset name order.
+ * This is necessary when the original mountpoint
+ * is legacy or none.
+ */
ASSERT(!clp->cl_alldependents);
verify(uu_list_insert_before(clp->cl_list,
uu_list_first(clp->cl_list), cn) == 0);
zfs_handle_t *temp;
char property[ZFS_MAXPROPLEN];
uu_compare_fn_t *compare = NULL;
+ boolean_t legacy = B_FALSE;
if ((clp = zfs_alloc(zhp->zfs_hdl, sizeof (prop_changelist_t))) == NULL)
return (NULL);
if (prop == ZFS_PROP_NAME || prop == ZFS_PROP_ZONED ||
prop == ZFS_PROP_MOUNTPOINT || prop == ZFS_PROP_SHARENFS ||
prop == ZFS_PROP_SHARESMB) {
- compare = compare_mountpoints;
- clp->cl_sorted = B_TRUE;
+
+ if (zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT,
+ property, sizeof (property),
+ NULL, NULL, 0, B_FALSE) == 0 &&
+ (strcmp(property, "legacy") == 0 ||
+ strcmp(property, "none") == 0)) {
+
+ legacy = B_TRUE;
+ }
+ if (!legacy) {
+ compare = compare_mountpoints;
+ clp->cl_sorted = B_TRUE;
+ }
}
clp->cl_pool = uu_list_pool_create("changelist_pool",
(void) uu_list_find(clp->cl_list, cn, NULL, &idx);
uu_list_insert(clp->cl_list, cn, idx);
} else {
+ /*
+ * Add the target dataset to the end of the list.
+ * The list is not really unsorted. The list will be
+ * in reverse dataset name order. This is necessary
+ * when the original mountpoint is legacy or none.
+ */
verify(uu_list_insert_after(clp->cl_list,
uu_list_last(clp->cl_list), cn) == 0);
}
* If the mountpoint property was previously 'legacy', or 'none',
* record it as the behavior of changelist_postfix() will be different.
*/
- if ((clp->cl_prop == ZFS_PROP_MOUNTPOINT) &&
- (zfs_prop_get(zhp, prop, property, sizeof (property),
- NULL, NULL, 0, B_FALSE) == 0 &&
- (strcmp(property, "legacy") == 0 ||
- strcmp(property, "none") == 0))) {
+ if ((clp->cl_prop == ZFS_PROP_MOUNTPOINT) && legacy) {
/*
* do not automatically mount ex-legacy datasets if
* we specifically set canmount to noauto
#include <ucred.h>
#include <idmap.h>
#include <aclutils.h>
+#include <directory.h>
#include <sys/spa.h>
#include <sys/zap.h>
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_ZPLPROPS, &zc)) {
zcmd_free_nvlists(&zc);
- zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
- "unable to get %s property"),
- zfs_prop_to_name(prop));
- return (zfs_error(zhp->zfs_hdl, EZFS_BADVERSION,
- dgettext(TEXT_DOMAIN, "internal error")));
+ return (-1);
}
if (zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &zplprops) != 0 ||
nvlist_lookup_uint64(zplprops, zfs_prop_to_name(prop),
val) != 0) {
zcmd_free_nvlists(&zc);
- zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
- "unable to get %s property"),
- zfs_prop_to_name(prop));
- return (zfs_error(zhp->zfs_hdl, EZFS_NOMEM,
- dgettext(TEXT_DOMAIN, "internal error")));
+ return (-1);
}
if (zplprops)
nvlist_free(zplprops);
{
zfs_userquota_prop_t type;
char *cp, *end;
+ char *numericsid = NULL;
boolean_t isuser;
domain[0] = '\0';
if (strchr(cp, '@')) {
/*
* It's a SID name (eg "user@domain") that needs to be
- * turned into S-1-domainID-RID. There should be a
- * better way to do this, but for now just translate it
- * to the (possibly ephemeral) uid and then back to the
- * SID. This is like getsidname(noresolve=TRUE).
+ * turned into S-1-domainID-RID.
*/
- uid_t id;
- idmap_rid_t rid;
- char *mapdomain;
-
+ directory_error_t e;
if (zoned && getzoneid() == GLOBAL_ZONEID)
return (ENOENT);
- if (sid_to_id(cp, isuser, &id) != 0)
+ if (isuser) {
+ e = directory_sid_from_user_name(NULL,
+ cp, &numericsid);
+ } else {
+ e = directory_sid_from_group_name(NULL,
+ cp, &numericsid);
+ }
+ if (e != NULL) {
+ directory_error_free(e);
return (ENOENT);
- if (idmap_id_to_numeric_domain_rid(id, isuser,
- &mapdomain, &rid) != 0)
+ }
+ if (numericsid == NULL)
return (ENOENT);
- (void) strlcpy(domain, mapdomain, domainlen);
- *ridp = rid;
- } else if (strncmp(cp, "S-1-", 4) == 0) {
+ cp = numericsid;
+ /* will be further decoded below */
+ }
+
+ if (strncmp(cp, "S-1-", 4) == 0) {
/* It's a numeric SID (eg "S-1-234-567-89") */
- (void) strcpy(domain, cp);
+ (void) strlcpy(domain, cp, domainlen);
cp = strrchr(domain, '-');
*cp = '\0';
cp++;
errno = 0;
*ridp = strtoull(cp, &end, 10);
+ if (numericsid) {
+ free(numericsid);
+ numericsid = NULL;
+ }
if (errno != 0 || *end != '\0')
return (EINVAL);
} else if (!isdigit(*cp)) {
if (idmap_id_to_numeric_domain_rid(id, isuser,
&mapdomain, &rid) != 0)
return (ENOENT);
- (void) strcpy(domain, mapdomain);
+ (void) strlcpy(domain, mapdomain, domainlen);
*ridp = rid;
} else {
*ridp = id;
}
}
+ ASSERT3P(numericsid, ==, NULL);
return (0);
}
* isn't mounted, and that there are no active dependents.
*/
int
-zfs_destroy(zfs_handle_t *zhp)
+zfs_destroy(zfs_handle_t *zhp, boolean_t defer)
{
zfs_cmd_t zc = { 0 };
zc.zc_objset_type = DMU_OST_ZFS;
}
+ zc.zc_defer_destroy = defer;
if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY, &zc) != 0) {
return (zfs_standard_error_fmt(zhp->zfs_hdl, errno,
dgettext(TEXT_DOMAIN, "cannot destroy '%s'"),
* Destroys all snapshots with the given name in zhp & descendants.
*/
int
-zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname)
+zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer)
{
zfs_cmd_t zc = { 0 };
int ret;
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
(void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
+ zc.zc_defer_destroy = defer;
ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY_SNAPS, &zc);
if (ret != 0) {
logstr = zhp->zfs_hdl->libzfs_log_str;
zhp->zfs_hdl->libzfs_log_str = NULL;
- cbp->cb_error |= zfs_destroy(zhp);
+ cbp->cb_error |= zfs_destroy(zhp, B_FALSE);
zhp->zfs_hdl->libzfs_log_str = logstr;
}
} else {
zfs_close(zhp);
return (0);
}
- if (zfs_destroy(zhp) != 0)
+ if (zfs_destroy(zhp, B_FALSE) != 0)
cbp->cb_error = B_TRUE;
else
changelist_remove(clp, zhp->zfs_name);
return (error);
}
+
+int
+zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag,
+ boolean_t recursive)
+{
+ zfs_cmd_t zc = { 0 };
+ libzfs_handle_t *hdl = zhp->zfs_hdl;
+
+ (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+ (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
+ (void) strlcpy(zc.zc_string, tag, sizeof (zc.zc_string));
+ zc.zc_cookie = recursive;
+
+ if (zfs_ioctl(hdl, ZFS_IOC_HOLD, &zc) != 0) {
+ char errbuf[ZFS_MAXNAMELEN+32];
+
+ /*
+ * if it was recursive, the one that actually failed will be in
+ * zc.zc_name.
+ */
+ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
+ "cannot hold '%s@%s'"), zc.zc_name, snapname);
+ switch (errno) {
+ case ENOTSUP:
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "pool must be upgraded"));
+ return (zfs_error(hdl, EZFS_BADVERSION, errbuf));
+ case EINVAL:
+ return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
+ case EEXIST:
+ return (zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf));
+ default:
+ return (zfs_standard_error_fmt(hdl, errno, errbuf));
+ }
+ }
+
+ return (0);
+}
+
+int
+zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag,
+ boolean_t recursive)
+{
+ zfs_cmd_t zc = { 0 };
+ libzfs_handle_t *hdl = zhp->zfs_hdl;
+
+ (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+ (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
+ (void) strlcpy(zc.zc_string, tag, sizeof (zc.zc_string));
+ zc.zc_cookie = recursive;
+
+ if (zfs_ioctl(hdl, ZFS_IOC_RELEASE, &zc) != 0) {
+ char errbuf[ZFS_MAXNAMELEN+32];
+
+ /*
+ * if it was recursive, the one that actually failed will be in
+ * zc.zc_name.
+ */
+ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
+ "cannot release '%s@%s'"), zc.zc_name, snapname);
+ switch (errno) {
+ case ESRCH:
+ return (zfs_error(hdl, EZFS_REFTAG_RELE, errbuf));
+ case ENOTSUP:
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "pool must be upgraded"));
+ return (zfs_error(hdl, EZFS_BADVERSION, errbuf));
+ case EINVAL:
+ return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
+ default:
+ return (zfs_standard_error_fmt(hdl, errno, errbuf));
+ }
+ }
+
+ return (0);
+}
free(avl);
}
+/*
+ * Given an nvlist, produce an avl tree of snapshots, ordered by guid
+ */
static avl_tree_t *
fsavl_create(nvlist_t *fss)
{
continue;
verify(nvpair_value_nvlist(elem, &propnv) == 0);
- if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION) {
+ if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION ||
+ prop == ZFS_PROP_REFQUOTA ||
+ prop == ZFS_PROP_REFRESERVATION) {
/* these guys are modifyable, but have no source */
uint64_t value;
verify(nvlist_lookup_uint64(propnv,
}
}
+/*
+ * recursively generate nvlists describing datasets. See comment
+ * for the data structure send_data_t above for description of contents
+ * of the nvlist.
+ */
static int
send_iterate_fs(zfs_handle_t *zhp, void *arg)
{
}
/*
- * Dumps a backup of tosnap, incremental from fromsnap if it isn't NULL.
- * If 'doall', dump all intermediate snaps.
- * If 'replicate', dump special header and do recursively.
+ * Generate a send stream for the dataset identified by the argument zhp.
+ *
+ * The content of the send stream is the snapshot identified by
+ * 'tosnap'. Incremental streams are requested in two ways:
+ * - from the snapshot identified by "fromsnap" (if non-null) or
+ * - from the origin of the dataset identified by zhp, which must
+ * be a clone. In this case, "fromsnap" is null and "fromorigin"
+ * is TRUE.
+ *
+ * The send stream is recursive (i.e. dumps a hierarchy of snapshots) and
+ * uses a special header (with a version field of DMU_BACKUP_HEADER_VERSION)
+ * if "replicate" is set. If "doall" is set, dump all the intermediate
+ * snapshots. The DMU_BACKUP_HEADER_VERSION header is used in the "doall"
+ * case too.
*/
int
zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
if (err)
return (err);
+ zc.zc_objset_type = DMU_OST_ZFS;
+ (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
+
if (tryname) {
(void) strcpy(newname, tryname);
- zc.zc_objset_type = DMU_OST_ZFS;
- (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
(void) strlcpy(zc.zc_value, tryname, sizeof (zc.zc_value));
if (flags.verbose) {
int err = 0;
prop_changelist_t *clp;
zfs_handle_t *zhp;
+ boolean_t defer = B_FALSE;
+ int spa_version;
zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET);
if (zhp == NULL)
return (-1);
clp = changelist_gather(zhp, ZFS_PROP_NAME, 0,
flags.force ? MS_FORCE : 0);
+ if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT &&
+ zfs_spa_version(zhp, &spa_version) == 0 &&
+ spa_version >= SPA_VERSION_USERREFS)
+ defer = B_TRUE;
zfs_close(zhp);
if (clp == NULL)
return (-1);
return (err);
zc.zc_objset_type = DMU_OST_ZFS;
+ zc.zc_defer_destroy = defer;
(void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
if (flags.verbose)
(void) printf("attempting destroy %s\n", zc.zc_name);
err = ioctl(hdl->libzfs_fd, ZFS_IOC_DESTROY, &zc);
-
if (err == 0) {
if (flags.verbose)
(void) printf("success\n");
(void) changelist_postfix(clp);
changelist_free(clp);
- if (err != 0)
+ /*
+ * Deferred destroy should always succeed. Since we can't tell
+ * if it destroyed the dataset or just marked it for deferred
+ * destroy, always do the rename just in case.
+ */
+ if (err != 0 || defer)
err = recv_rename(hdl, name, NULL, baselen, newname, flags);
return (err);
/* We can't do online recv in this case */
clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0);
if (clp == NULL) {
+ zfs_close(zhp);
zcmd_free_nvlists(&zc);
return (-1);
}
if (changelist_prefix(clp) != 0) {
changelist_free(clp);
+ zfs_close(zhp);
zcmd_free_nvlists(&zc);
return (-1);
}
* (if created, or if we tore them down to do an incremental
* restore), and the /dev links for the new snapshot (if
* created). Also mount any children of the target filesystem
- * if we did an incremental receive.
+ * if we did a replication receive (indicated by stream_avl
+ * being non-NULL).
*/
cp = strchr(zc.zc_value, '@');
if (cp && (ioctl_err == 0 || !newfs)) {
if (err == 0 && ioctl_err == 0)
err = zvol_create_link(hdl,
zc.zc_value);
- } else if (newfs) {
+ } else if (newfs || stream_avl) {
/*
* Track the first/top of hierarchy fs,
* for mounting and sharing later.
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
return (state == VDEV_STATE_OFFLINE);
}
+/* ARGSUSED */
+static int
+vdev_removed(uint64_t state, uint64_t aux, uint64_t errs)
+{
+ return (state == VDEV_STATE_REMOVED);
+}
+
/*
* Detect if any leaf devices that have seen errors or could not be opened.
*/
if (find_vdev_problem(nvroot, vdev_offlined))
return (ZPOOL_STATUS_OFFLINE_DEV);
+ /*
+ * Removed device
+ */
+ if (find_vdev_problem(nvroot, vdev_removed))
+ return (ZPOOL_STATUS_REMOVED_DEV);
+
/*
* Currently resilvering
*/
case EZFS_UNPLAYED_LOGS:
return (dgettext(TEXT_DOMAIN, "log device has unplayed intent "
"logs"));
+ case EZFS_REFTAG_RELE:
+ return (dgettext(TEXT_DOMAIN, "no such tag on this dataset"));
+ case EZFS_REFTAG_HOLD:
+ return (dgettext(TEXT_DOMAIN, "tag already exists on this "
+ "dataset"));
case EZFS_UNKNOWN:
return (dgettext(TEXT_DOMAIN, "unknown error"));
default:
*
* CDDL HEADER END
*/
+
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
#ifndef _SYS_FS_ZFS_H
#define _SYS_FS_ZFS_H
+#include <sys/time.h>
+
#ifdef __cplusplus
extern "C" {
#endif
ZFS_PROP_USEDREFRESERV,
ZFS_PROP_USERACCOUNTING, /* not exposed to the user */
ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */
+ ZFS_PROP_DEFER_DESTROY,
+ ZFS_PROP_USERREFS,
ZFS_NUM_PROPS
} zfs_prop_t;
#define SPA_VERSION_14 14ULL
#define SPA_VERSION_15 15ULL
#define SPA_VERSION_16 16ULL
+#define SPA_VERSION_17 17ULL
+#define SPA_VERSION_18 18ULL
/*
* When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
* format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
* and do the appropriate changes. Also bump the version number in
* usr/src/grub/capability.
*/
-#define SPA_VERSION SPA_VERSION_16
-#define SPA_VERSION_STRING "16"
+#define SPA_VERSION SPA_VERSION_18
+#define SPA_VERSION_STRING "18"
/*
* Symbolic names for the changes that caused a SPA_VERSION switch.
#define SPA_VERSION_INITIAL SPA_VERSION_1
#define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2
#define SPA_VERSION_SPARES SPA_VERSION_3
-#define SPA_VERSION_RAID6 SPA_VERSION_3
+#define SPA_VERSION_RAIDZ2 SPA_VERSION_3
#define SPA_VERSION_BPLIST_ACCOUNT SPA_VERSION_3
#define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3
#define SPA_VERSION_DNODE_BYTES SPA_VERSION_3
#define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14
#define SPA_VERSION_USERSPACE SPA_VERSION_15
#define SPA_VERSION_STMF_PROP SPA_VERSION_16
+#define SPA_VERSION_RAIDZ3 SPA_VERSION_17
+#define SPA_VERSION_USERREFS SPA_VERSION_18
/*
* ZPL version - rev'd whenever an incompatible on-disk format change
ZFS_IOC_SMB_ACL,
ZFS_IOC_USERSPACE_ONE,
ZFS_IOC_USERSPACE_MANY,
- ZFS_IOC_USERSPACE_UPGRADE
+ ZFS_IOC_USERSPACE_UPGRADE,
+ ZFS_IOC_HOLD,
+ ZFS_IOC_RELEASE,
+ ZFS_IOC_GET_HOLDS
} zfs_ioc_t;
/*
LOG_DS_REFQUOTA,
LOG_DS_REFRESERV,
LOG_POOL_SCRUB_DONE,
+ LOG_DS_USER_HOLD,
+ LOG_DS_USER_RELEASE,
LOG_END
} history_internal_events_t;
ZFS_DELEG_NOTE_GROUPQUOTA,
ZFS_DELEG_NOTE_USERUSED,
ZFS_DELEG_NOTE_GROUPUSED,
+ ZFS_DELEG_NOTE_HOLD,
+ ZFS_DELEG_NOTE_RELEASE,
ZFS_DELEG_NOTE_NONE
} zfs_deleg_note_t;
{ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA },
{ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED },
{ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED },
+ {ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD },
+ {ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE },
{NULL, ZFS_DELEG_NOTE_NONE }
};
* Snapshot names must be made up of alphanumeric characters plus the following
* characters:
*
- * [-_.:]
+ * [-_.: ]
*/
int
snapshot_namecheck(const char *path, namecheck_err_t *why, char *what)
/* readonly index (boolean) properties */
register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY,
ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table);
+ register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0,
+ PROP_READONLY, ZFS_TYPE_SNAPSHOT, "yes | no", "DEFER_DESTROY",
+ boolean_table);
/* set once index properties */
register_index(ZFS_PROP_NORMALIZE, "normalization", 0,
register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0,
PROP_READONLY,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDREFRESERV");
+ register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY,
+ ZFS_TYPE_SNAPSHOT, "<count>", "USERREFS");
/* default number properties */
register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT,
{ zap_byteswap, TRUE, "scrub work queue" },
{ zap_byteswap, TRUE, "ZFS user/group used" },
{ zap_byteswap, TRUE, "ZFS user/group quota" },
+ { zap_byteswap, TRUE, "snapshot refcount tags"},
};
int
ASSERT(length <= DMU_MAX_ACCESS);
- dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT;
+ dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)
dbuf_flags |= DB_RF_NOPREFETCH;
os_dsl_dataset->ds_object,
(longlong_t)dn->dn_object, dn->dn_datablksz,
(longlong_t)offset, (longlong_t)length);
+ rw_exit(&dn->dn_struct_rwlock);
return (EIO);
}
nblks = 1;
}
/* initiate async i/o */
if (read) {
- rw_exit(&dn->dn_struct_rwlock);
(void) dbuf_read(db, zio, dbuf_flags);
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
}
dbp[i] = &db->db;
}
dnode_rele(dn, FTAG);
}
+/*
+ * Get the next "chunk" of file data to free. We traverse the file from
+ * the end so that the file gets shorter over time (if we crashes in the
+ * middle, this will leave us in a better state). We find allocated file
+ * data by simply searching the allocated level 1 indirects.
+ */
static int
-get_next_chunk(dnode_t *dn, uint64_t *offset, uint64_t limit)
+get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit)
{
- uint64_t len = *offset - limit;
- uint64_t chunk_len = dn->dn_datablksz * DMU_MAX_DELETEBLKCNT;
- uint64_t subchunk =
+ uint64_t len = *start - limit;
+ uint64_t blkcnt = 0;
+ uint64_t maxblks = DMU_MAX_ACCESS / (1ULL << (dn->dn_indblkshift + 1));
+ uint64_t iblkrange =
dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
- ASSERT(limit <= *offset);
+ ASSERT(limit <= *start);
- if (len <= chunk_len) {
- *offset = limit;
+ if (len <= iblkrange * maxblks) {
+ *start = limit;
return (0);
}
+ ASSERT(ISP2(iblkrange));
- ASSERT(ISP2(subchunk));
-
- while (*offset > limit) {
- uint64_t initial_offset = P2ROUNDUP(*offset, subchunk);
- uint64_t delta;
+ while (*start > limit && blkcnt < maxblks) {
int err;
- /* skip over allocated data */
+ /* find next allocated L1 indirect */
err = dnode_next_offset(dn,
- DNODE_FIND_HOLE|DNODE_FIND_BACKWARDS, offset, 1, 1, 0);
- if (err == ESRCH)
- *offset = limit;
- else if (err)
- return (err);
+ DNODE_FIND_BACKWARDS, start, 2, 1, 0);
- ASSERT3U(*offset, <=, initial_offset);
- *offset = P2ALIGN(*offset, subchunk);
- delta = initial_offset - *offset;
- if (delta >= chunk_len) {
- *offset += delta - chunk_len;
+ /* if there are no more, then we are done */
+ if (err == ESRCH) {
+ *start = limit;
return (0);
- }
- chunk_len -= delta;
-
- /* skip over unallocated data */
- err = dnode_next_offset(dn,
- DNODE_FIND_BACKWARDS, offset, 1, 1, 0);
- if (err == ESRCH)
- *offset = limit;
- else if (err)
+ } else if (err) {
return (err);
+ }
+ blkcnt += 1;
- if (*offset < limit)
- *offset = limit;
- ASSERT3U(*offset, <, initial_offset);
+ /* reset offset to end of "next" block back */
+ *start = P2ALIGN(*start, iblkrange);
+ if (*start <= limit)
+ *start = limit;
+ else
+ *start -= 1;
}
return (0);
}
{
dnode_t *dn;
dmu_buf_t **dbp;
- int numbufs, i, err;
+ int numbufs, err;
err = dnode_hold(os->os, object, FTAG, &dn);
if (err)
* block. If we ever do the tail block optimization, we will need to
* handle that here as well.
*/
- if (dn->dn_datablkshift == 0) {
+ if (dn->dn_maxblkid == 0) {
int newsz = offset > dn->dn_datablksz ? 0 :
MIN(size, dn->dn_datablksz - offset);
bzero((char *)buf + newsz, size - newsz);
while (size > 0) {
uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
+ int i;
/*
* NB: we could do this block-at-a-time, but it's nice
if (tocpy == db->db_size)
dmu_buf_fill_done(db, tx);
- if (err)
- break;
-
offset += tocpy;
size -= tocpy;
}
}
int
-dmu_objset_destroy(const char *name)
+dmu_objset_destroy(const char *name, boolean_t defer)
{
objset_t *os;
int error;
dsl_dataset_t *ds = os->os->os_dsl_dataset;
zil_destroy(dmu_objset_zil(os), B_FALSE);
- error = dsl_dataset_destroy(ds, os);
+ error = dsl_dataset_destroy(ds, os, defer);
/*
* dsl_dataset_destroy() closes the ds.
*/
*/
for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
- dmu_tx_t *tx = dmu_tx_create(os);
+ dmu_tx_t *tx;
dmu_buf_t *db;
int objerr;
objerr = dmu_bonus_hold(os, obj, FTAG, &db);
if (objerr)
continue;
+ tx = dmu_tx_create(os);
dmu_tx_hold_bonus(tx, obj);
objerr = dmu_tx_assign(tx, TXG_WAIT);
if (objerr) {
dsl_dataset_t *ds = arg1;
struct recvbeginsyncarg *rbsa = arg2;
int err;
+ struct dsl_ds_destroyarg dsda = {0};
/* must be a head ds */
if (ds->ds_phys->ds_next_snap_obj != 0)
if (dsl_dir_is_clone(ds->ds_dir))
return (EINVAL);
- err = dsl_dataset_destroy_check(ds, rbsa->tag, tx);
+ dsda.ds = ds;
+ err = dsl_dataset_destroy_check(&dsda, rbsa->tag, tx);
if (err)
return (err);
dsl_dir_t *dd = ds->ds_dir;
uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
uint64_t dsobj;
+ struct dsl_ds_destroyarg dsda = {0};
/*
* NB: caller must provide an extra hold on the dsl_dir_t, so it
* won't go away when dsl_dataset_destroy_sync() closes the
* dataset.
*/
- dsl_dataset_destroy_sync(ds, rbsa->tag, cr, tx);
+ dsda.ds = ds;
+ dsl_dataset_destroy_sync(&dsda, rbsa->tag, cr, tx);
+ ASSERT3P(dsda.rm_origin, ==, NULL);
dsobj = dsl_dataset_create_sync_dd(dd, rbsa->origin, flags, tx);
/* ARGSUSED */
static void
-recv_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+recv_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
dsl_dataset_t *ohds = arg1;
struct recvbeginsyncarg *rbsa = arg2;
dp->dp_spa, tx, cr, "dataset = %lld", dsobj);
}
-/* ARGSUSED */
-static void
-recv_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
-
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
-
- spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC,
- ds->ds_dir->dd_pool->dp_spa, tx, cr, "dataset = %lld",
- ds->ds_object);
-}
-
/*
* NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
* succeeds; otherwise we will leak the holds on the datasets.
*/
int
dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
- boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *drc)
+ boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc)
{
int err = 0;
boolean_t byteswap;
/*
* Process the begin in syncing context.
*/
- if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE) && !online) {
- /* offline incremental receive */
- err = dsl_dataset_own(tofs, 0, dmu_recv_tag, &ds);
- if (err)
- return (err);
-
- /*
- * Only do the rollback if the most recent snapshot
- * matches the incremental source
- */
- if (force) {
- if (ds->ds_prev == NULL ||
- ds->ds_prev->ds_phys->ds_guid !=
- rbsa.fromguid) {
- dsl_dataset_disown(ds, dmu_recv_tag);
- return (ENODEV);
- }
- (void) dsl_dataset_rollback(ds, DMU_OST_NONE);
- }
- rbsa.force = B_FALSE;
- err = dsl_sync_task_do(ds->ds_dir->dd_pool,
- recv_incremental_check,
- recv_offline_incremental_sync, ds, &rbsa, 1);
- if (err) {
- dsl_dataset_disown(ds, dmu_recv_tag);
- return (err);
- }
- drc->drc_logical_ds = drc->drc_real_ds = ds;
- } else if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) {
- /* online incremental receive */
+ if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) {
+ /* incremental receive */
/* tmp clone name is: tofs/%tosnap" */
(void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname),
if (err)
return (err);
+ /* must not have an incremental recv already in progress */
+ if (!mutex_tryenter(&ds->ds_recvlock)) {
+ dsl_dataset_rele(ds, dmu_recv_tag);
+ return (EBUSY);
+ }
+
rbsa.force = force;
err = dsl_sync_task_do(ds->ds_dir->dd_pool,
recv_incremental_check,
- recv_online_incremental_sync, ds, &rbsa, 5);
+ recv_incremental_sync, ds, &rbsa, 5);
if (err) {
+ mutex_exit(&ds->ds_recvlock);
dsl_dataset_rele(ds, dmu_recv_tag);
return (err);
}
return (err);
}
-void
-dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc)
-{
- if (drc->drc_newfs || drc->drc_real_ds != drc->drc_logical_ds) {
- /*
- * online incremental or new fs: destroy the fs (which
- * may be a clone) that we created
- */
- (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag);
- if (drc->drc_real_ds != drc->drc_logical_ds)
- dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag);
- } else {
- /*
- * offline incremental: rollback to most recent snapshot.
- */
- (void) dsl_dataset_rollback(drc->drc_real_ds, DMU_OST_NONE);
- dsl_dataset_disown(drc->drc_real_ds, dmu_recv_tag);
- }
-}
-
/*
* NB: callers *must* call dmu_recv_end() if this succeeds.
*/
if (ra.err != 0) {
/*
- * rollback or destroy what we created, so we don't
- * leave it in the restoring state.
+ * destroy what we created, so we don't leave it in the
+ * inconsistent restoring state.
*/
txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0);
- dmu_recv_abort_cleanup(drc);
+
+ (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
+ B_FALSE);
+ if (drc->drc_real_ds != drc->drc_logical_ds) {
+ mutex_exit(&drc->drc_logical_ds->ds_recvlock);
+ dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag);
+ }
}
kmem_free(ra.buf, ra.bufsize);
dsl_dataset_rele(ds, dmu_recv_tag);
}
/* dsl_dataset_destroy() will disown the ds */
- (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag);
+ (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
+ B_FALSE);
+ mutex_exit(&drc->drc_logical_ds->ds_recvlock);
if (err)
return (err);
}
if (err) {
if (drc->drc_newfs) {
ASSERT(ds == drc->drc_real_ds);
- (void) dsl_dataset_destroy(ds, dmu_recv_tag);
+ (void) dsl_dataset_destroy(ds, dmu_recv_tag,
+ B_FALSE);
return (err);
} else {
(void) dsl_dataset_rollback(ds, DMU_OST_NONE);
}
err = zap_count_write(&dn->dn_objset->os, dn->dn_object, name, add,
- &txh->txh_space_towrite, &txh->txh_space_tooverwrite,
- txh->txh_dnode->dn_datablkshift);
+ &txh->txh_space_towrite, &txh->txh_space_tooverwrite);
/*
* If the modified blocks are scattered to the four winds,
dmu_tx_willuse_space(tx, space);
}
+/*
+ * This function scans a block at the indicated "level" looking for
+ * a hole or data (depending on 'flags'). If level > 0, then we are
+ * scanning an indirect block looking at its pointers. If level == 0,
+ * then we are looking at a block of dnodes. If we don't find what we
+ * are looking for in the block, we return ESRCH. Otherwise, return
+ * with *offset pointing to the beginning (if searching forwards) or
+ * end (if searching backwards) of the range covered by the block
+ * pointer we matched on (or dnode).
+ *
+ * The basic search algorithm used below by dnode_next_offset() is to
+ * use this function to search up the block tree (widen the search) until
+ * we find something (i.e., we don't return ESRCH) and then search back
+ * down the tree (narrow the search) until we reach our original search
+ * level.
+ */
static int
dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
int lvl, uint64_t blkfill, uint64_t txg)
error = ESRCH;
} else {
blkptr_t *bp = data;
+ uint64_t start = *offset;
span = (lvl - 1) * epbs + dn->dn_datablkshift;
minfill = 0;
maxfill = blkfill << ((lvl - 1) * epbs);
else
minfill++;
- for (i = (*offset >> span) & ((1ULL << epbs) - 1);
+ *offset = *offset >> span;
+ for (i = BF64_GET(*offset, 0, epbs);
i >= 0 && i < epb; i += inc) {
if (bp[i].blk_fill >= minfill &&
bp[i].blk_fill <= maxfill &&
(hole || bp[i].blk_birth > txg))
break;
- if (inc < 0 && *offset < (1ULL << span))
- *offset = 0;
- else
- *offset += (1ULL << span) * inc;
+ if (inc > 0 || *offset > 0)
+ *offset += inc;
+ }
+ *offset = *offset << span;
+ if (inc < 0) {
+ /* traversing backwards; position offset at the end */
+ ASSERT3U(*offset, <=, start);
+ *offset = MIN(*offset + (1ULL << span) - 1, start);
+ } else if (*offset < start) {
+ *offset = start;
}
- if (i < 0 || i == epb)
+ if (i < 0 || i >= epb)
error = ESRCH;
}
#include <sys/spa.h>
#include <sys/zfs_znode.h>
#include <sys/sunddi.h>
+#include <sys/zvol.h>
static char *dsl_reaper = "the grim reaper";
ASSERT(!list_link_active(&ds->ds_synced_link));
mutex_destroy(&ds->ds_lock);
+ mutex_destroy(&ds->ds_recvlock);
mutex_destroy(&ds->ds_opening_lock);
mutex_destroy(&ds->ds_deadlist.bpl_lock);
rw_destroy(&ds->ds_rwlock);
ds->ds_phys = dbuf->db_data;
mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT,
NULL);
* just opened it.
*/
mutex_destroy(&ds->ds_lock);
+ mutex_destroy(&ds->ds_recvlock);
mutex_destroy(&ds->ds_opening_lock);
mutex_destroy(&ds->ds_deadlist.bpl_lock);
rw_destroy(&ds->ds_rwlock);
dsl_dataset_rele(origin, FTAG);
}
}
- } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
- err = dsl_dataset_get_snapname(ds);
+ } else {
+ if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
+ err = dsl_dataset_get_snapname(ds);
+ if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) {
+ err = zap_count(
+ ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_phys->ds_userrefs_obj,
+ &ds->ds_userrefs);
+ }
}
if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
dsl_dataset_drop_ref(ds->ds_prev, ds);
dsl_dir_close(ds->ds_dir, ds);
mutex_destroy(&ds->ds_lock);
+ mutex_destroy(&ds->ds_recvlock);
mutex_destroy(&ds->ds_opening_lock);
mutex_destroy(&ds->ds_deadlist.bpl_lock);
rw_destroy(&ds->ds_rwlock);
dsl_sync_task_group_t *dstg;
char *snapname;
char *failed;
+ boolean_t defer;
};
static int
{
struct destroyarg *da = arg;
dsl_dataset_t *ds;
- char *cp;
int err;
-
- (void) strcat(name, "@");
- (void) strcat(name, da->snapname);
- err = dsl_dataset_own(name, DS_MODE_READONLY | DS_MODE_INCONSISTENT,
+ char *dsname;
+ size_t buflen;
+
+ /* alloc a buffer to hold name@snapname, plus the terminating NULL */
+ buflen = strlen(name) + strlen(da->snapname) + 2;
+ dsname = kmem_alloc(buflen, KM_SLEEP);
+ (void) snprintf(dsname, buflen, "%s@%s", name, da->snapname);
+ err = dsl_dataset_own(dsname, DS_MODE_READONLY | DS_MODE_INCONSISTENT,
da->dstg, &ds);
- cp = strchr(name, '@');
- *cp = '\0';
+ kmem_free(dsname, buflen);
if (err == 0) {
+ struct dsl_ds_destroyarg *dsda;
+
dsl_dataset_make_exclusive(ds, da->dstg);
if (ds->ds_user_ptr) {
ds->ds_user_evict_func(ds, ds->ds_user_ptr);
ds->ds_user_ptr = NULL;
}
+ dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), KM_SLEEP);
+ dsda->ds = ds;
+ dsda->defer = da->defer;
dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check,
- dsl_dataset_destroy_sync, ds, da->dstg, 0);
+ dsl_dataset_destroy_sync, dsda, da->dstg, 0);
} else if (err == ENOENT) {
err = 0;
} else {
*/
#pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy
int
-dsl_snapshots_destroy(char *fsname, char *snapname)
+dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer)
{
int err;
struct destroyarg da;
da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
da.snapname = snapname;
da.failed = fsname;
+ da.defer = defer;
err = dmu_objset_find(fsname,
dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN);
for (dst = list_head(&da.dstg->dstg_tasks); dst;
dst = list_next(&da.dstg->dstg_tasks, dst)) {
- dsl_dataset_t *ds = dst->dst_arg1;
+ struct dsl_ds_destroyarg *dsda = dst->dst_arg1;
+ dsl_dataset_t *ds = dsda->ds;
+
/*
* Return the file system name that triggered the error
*/
dsl_dataset_name(ds, fsname);
*strchr(fsname, '@') = '\0';
}
+ ASSERT3P(dsda->rm_origin, ==, NULL);
dsl_dataset_disown(ds, da.dstg);
+ kmem_free(dsda, sizeof (struct dsl_ds_destroyarg));
}
dsl_sync_task_group_destroy(da.dstg);
return (err);
}
+static boolean_t
+dsl_dataset_might_destroy_origin(dsl_dataset_t *ds)
+{
+ boolean_t might_destroy = B_FALSE;
+
+ mutex_enter(&ds->ds_lock);
+ if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 &&
+ DS_IS_DEFER_DESTROY(ds))
+ might_destroy = B_TRUE;
+ mutex_exit(&ds->ds_lock);
+
+ return (might_destroy);
+}
+
+#ifdef _KERNEL
+static int
+dsl_dataset_zvol_cleanup(dsl_dataset_t *ds, const char *name)
+{
+ int error;
+ objset_t *os;
+
+ error = dmu_objset_open_ds(ds, DMU_OST_ANY, &os);
+ if (error)
+ return (error);
+
+ if (dmu_objset_type(os) == DMU_OST_ZVOL)
+ error = zvol_remove_minor(name);
+ dmu_objset_close(os);
+
+ return (error);
+}
+#endif
+
+/*
+ * If we're removing a clone, and these three conditions are true:
+ * 1) the clone's origin has no other children
+ * 2) the clone's origin has no user references
+ * 3) the clone's origin has been marked for deferred destruction
+ * Then, prepare to remove the origin as part of this sync task group.
+ */
+static int
+dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag)
+{
+ dsl_dataset_t *ds = dsda->ds;
+ dsl_dataset_t *origin = ds->ds_prev;
+
+ if (dsl_dataset_might_destroy_origin(origin)) {
+ char *name;
+ int namelen;
+ int error;
+
+ namelen = dsl_dataset_namelen(origin) + 1;
+ name = kmem_alloc(namelen, KM_SLEEP);
+ dsl_dataset_name(origin, name);
+#ifdef _KERNEL
+ error = zfs_unmount_snap(name, NULL);
+ if (error) {
+ kmem_free(name, namelen);
+ return (error);
+ }
+ error = dsl_dataset_zvol_cleanup(origin, name);
+ if (error) {
+ kmem_free(name, namelen);
+ return (error);
+ }
+#endif
+ error = dsl_dataset_own(name,
+ DS_MODE_READONLY | DS_MODE_INCONSISTENT,
+ tag, &origin);
+ kmem_free(name, namelen);
+ if (error)
+ return (error);
+ dsda->rm_origin = origin;
+ dsl_dataset_make_exclusive(origin, tag);
+ }
+
+ return (0);
+}
+
/*
* ds must be opened as OWNER. On return (whether successful or not),
* ds will be closed and caller can no longer dereference it.
*/
int
-dsl_dataset_destroy(dsl_dataset_t *ds, void *tag)
+dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
{
int err;
dsl_sync_task_group_t *dstg;
objset_t *os;
dsl_dir_t *dd;
uint64_t obj;
+ struct dsl_ds_destroyarg dsda = {0};
+
+ dsda.ds = ds;
if (dsl_dataset_is_snapshot(ds)) {
/* Destroying a snapshot is simpler */
ds->ds_user_evict_func(ds, ds->ds_user_ptr);
ds->ds_user_ptr = NULL;
}
+ /* NOTE: defer is always B_FALSE for non-snapshots */
+ dsda.defer = defer;
err = dsl_sync_task_do(ds->ds_dir->dd_pool,
dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
- ds, tag, 0);
+ &dsda, tag, 0);
+ ASSERT3P(dsda.rm_origin, ==, NULL);
goto out;
}
ds->ds_user_evict_func(ds, ds->ds_user_ptr);
ds->ds_user_ptr = NULL;
}
- dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
- dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
- dsl_dataset_destroy_sync, ds, tag, 0);
- dsl_sync_task_create(dstg, dsl_dir_destroy_check,
- dsl_dir_destroy_sync, dd, FTAG, 0);
- err = dsl_sync_task_group_wait(dstg);
- dsl_sync_task_group_destroy(dstg);
+
+ /*
+ * If we're removing a clone, we might also need to remove its
+ * origin.
+ */
+ do {
+ dsda.need_prep = B_FALSE;
+ if (dsl_dir_is_clone(dd)) {
+ err = dsl_dataset_origin_rm_prep(&dsda, tag);
+ if (err) {
+ dsl_dir_close(dd, FTAG);
+ goto out;
+ }
+ }
+
+ dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
+ dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
+ dsl_dataset_destroy_sync, &dsda, tag, 0);
+ dsl_sync_task_create(dstg, dsl_dir_destroy_check,
+ dsl_dir_destroy_sync, dd, FTAG, 0);
+ err = dsl_sync_task_group_wait(dstg);
+ dsl_sync_task_group_destroy(dstg);
+
+ /*
+ * We could be racing against 'zfs release' or 'zfs destroy -d'
+ * on the origin snap, in which case we can get EBUSY if we
+ * needed to destroy the origin snap but were not ready to
+ * do so.
+ */
+ if (dsda.need_prep) {
+ ASSERT(err == EBUSY);
+ ASSERT(dsl_dir_is_clone(dd));
+ ASSERT(dsda.rm_origin == NULL);
+ }
+ } while (dsda.need_prep);
+
+ if (dsda.rm_origin != NULL)
+ dsl_dataset_disown(dsda.rm_origin, tag);
+
/* if it is successful, dsl_dir_destroy_sync will close the dd */
if (err)
dsl_dir_close(dd, FTAG);
/*
* We can only roll back to emptyness if it is a ZPL objset.
*/
- if (*ost != DMU_OST_ZFS && ds->ds_phys->ds_prev_snap_txg == 0)
+ if (*ost != DMU_OST_ZFS &&
+ ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL)
return (EINVAL);
/*
} else {
objset_impl_t *osi;
+ ASSERT(*ost != DMU_OST_ZVOL);
ASSERT3U(ds->ds_phys->ds_used_bytes, ==, 0);
ASSERT3U(ds->ds_phys->ds_compressed_bytes, ==, 0);
ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, ==, 0);
cr, "dataset = %llu", ds->ds_object);
}
+static int
+dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag,
+ dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = dsda->ds;
+ dsl_dataset_t *ds_prev = ds->ds_prev;
+
+ if (dsl_dataset_might_destroy_origin(ds_prev)) {
+ struct dsl_ds_destroyarg ndsda = {0};
+
+ /*
+ * If we're not prepared to remove the origin, don't remove
+ * the clone either.
+ */
+ if (dsda->rm_origin == NULL) {
+ dsda->need_prep = B_TRUE;
+ return (EBUSY);
+ }
+
+ ndsda.ds = ds_prev;
+ ndsda.is_origin_rm = B_TRUE;
+ return (dsl_dataset_destroy_check(&ndsda, tag, tx));
+ }
+
+ /*
+ * If we're not going to remove the origin after all,
+ * undo the open context setup.
+ */
+ if (dsda->rm_origin != NULL) {
+ dsl_dataset_disown(dsda->rm_origin, tag);
+ dsda->rm_origin = NULL;
+ }
+
+ return (0);
+}
+
/* ARGSUSED */
int
dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
- dsl_dataset_t *ds = arg1;
+ struct dsl_ds_destroyarg *dsda = arg1;
+ dsl_dataset_t *ds = dsda->ds;
/* we have an owner hold, so noone else can destroy us */
ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
- /* Can't delete a branch point. */
- if (ds->ds_phys->ds_num_children > 1)
- return (EEXIST);
+ /*
+ * Only allow deferred destroy on pools that support it.
+ * NOTE: deferred destroy is only supported on snapshots.
+ */
+ if (dsda->defer) {
+ if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
+ SPA_VERSION_USERREFS)
+ return (ENOTSUP);
+ ASSERT(dsl_dataset_is_snapshot(ds));
+ return (0);
+ }
/*
* Can't delete a head dataset if there are snapshots of it.
if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
return (EAGAIN);
+ if (dsl_dataset_is_snapshot(ds)) {
+ /*
+ * If this snapshot has an elevated user reference count,
+ * we can't destroy it yet.
+ */
+ if (ds->ds_userrefs > 0 && !dsda->releasing)
+ return (EBUSY);
+
+ mutex_enter(&ds->ds_lock);
+ /*
+ * Can't delete a branch point. However, if we're destroying
+ * a clone and removing its origin due to it having a user
+ * hold count of 0 and having been marked for deferred destroy,
+ * it's OK for the origin to have a single clone.
+ */
+ if (ds->ds_phys->ds_num_children >
+ (dsda->is_origin_rm ? 2 : 1)) {
+ mutex_exit(&ds->ds_lock);
+ return (EEXIST);
+ }
+ mutex_exit(&ds->ds_lock);
+ } else if (dsl_dir_is_clone(ds->ds_dir)) {
+ return (dsl_dataset_origin_check(dsda, arg2, tx));
+ }
+
/* XXX we should do some i/o error checking... */
return (0);
}
void
dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
{
- dsl_dataset_t *ds = arg1;
+ struct dsl_ds_destroyarg *dsda = arg1;
+ dsl_dataset_t *ds = dsda->ds;
zio_t *zio;
int err;
int after_branch_point = FALSE;
uint64_t obj;
ASSERT(ds->ds_owner);
- ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+ ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1);
ASSERT(ds->ds_prev == NULL ||
ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
+ if (dsda->defer) {
+ ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
+ if (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1) {
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
+ return;
+ }
+ }
+
/* signal any waiters that this dataset is going away */
mutex_enter(&ds->ds_lock);
ds->ds_owner = dsl_reaper;
/* This clone is toast. */
ASSERT(ds_prev->ds_phys->ds_num_children > 1);
ds_prev->ds_phys->ds_num_children--;
+
+ /*
+ * If the clone's origin has no other clones, no
+ * user holds, and has been marked for deferred
+ * deletion, then we should have done the necessary
+ * destroy setup for it.
+ */
+ if (ds_prev->ds_phys->ds_num_children == 1 &&
+ ds_prev->ds_userrefs == 0 &&
+ DS_IS_DEFER_DESTROY(ds_prev)) {
+ ASSERT3P(dsda->rm_origin, !=, NULL);
+ } else {
+ ASSERT3P(dsda->rm_origin, ==, NULL);
+ }
} else if (!after_branch_point) {
ds_prev->ds_phys->ds_next_snap_obj =
ds->ds_phys->ds_next_snap_obj;
}
if (ds->ds_phys->ds_props_obj != 0)
VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
+ if (ds->ds_phys->ds_userrefs_obj != 0)
+ VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
dsl_dir_close(ds->ds_dir, ds);
ds->ds_dir = NULL;
dsl_dataset_drain_refs(ds, tag);
VERIFY(0 == dmu_object_free(mos, obj, tx));
+
+ if (dsda->rm_origin) {
+ /*
+ * Remove the origin of the clone we just destroyed.
+ */
+ dsl_dataset_t *origin = ds->ds_prev;
+ struct dsl_ds_destroyarg ndsda = {0};
+
+ ASSERT3P(origin, ==, dsda->rm_origin);
+ if (origin->ds_user_ptr) {
+ origin->ds_user_evict_func(origin, origin->ds_user_ptr);
+ origin->ds_user_ptr = NULL;
+ }
+
+ dsl_dataset_rele(origin, tag);
+ ds->ds_prev = NULL;
+
+ ndsda.ds = origin;
+ dsl_dataset_destroy_sync(&ndsda, tag, cr, tx);
+ }
}
static int
ds->ds_reserved);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
ds->ds_phys->ds_guid);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, ds->ds_userrefs);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
+ DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
if (ds->ds_phys->ds_next_snap_obj) {
/*
ds->ds_quota = new_quota;
- dsl_prop_set_uint64_sync(ds->ds_dir, "refquota", new_quota, cr, tx);
+ dsl_dir_prop_set_uint64_sync(ds->ds_dir, "refquota", new_quota, cr, tx);
spa_history_internal_log(LOG_DS_REFQUOTA, ds->ds_dir->dd_pool->dp_spa,
tx, cr, "%lld dataset = %llu ",
dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
mutex_exit(&ds->ds_dir->dd_lock);
- dsl_prop_set_uint64_sync(ds->ds_dir, "refreservation",
+ dsl_dir_prop_set_uint64_sync(ds->ds_dir, "refreservation",
new_reservation, cr, tx);
spa_history_internal_log(LOG_DS_REFRESERV,
dsl_dataset_rele(ds, FTAG);
return (err);
}
+
+static int
+dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ char *htag = arg2;
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ int error = 0;
+
+ if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
+ return (ENOTSUP);
+
+ if (!dsl_dataset_is_snapshot(ds))
+ return (EINVAL);
+
+ if (strlen(htag) >= ZAP_MAXNAMELEN)
+ return (ENAMETOOLONG);
+
+ /* tags must be unique */
+ mutex_enter(&ds->ds_lock);
+ if (ds->ds_phys->ds_userrefs_obj) {
+ error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag,
+ 8, 1, tx);
+ if (error == 0)
+ error = EEXIST;
+ else if (error == ENOENT)
+ error = 0;
+ }
+ mutex_exit(&ds->ds_lock);
+
+ return (error);
+}
+
+static void
+dsl_dataset_user_hold_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ char *htag = arg2;
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ time_t now = gethrestime_sec();
+ uint64_t zapobj;
+
+ mutex_enter(&ds->ds_lock);
+ if (ds->ds_phys->ds_userrefs_obj == 0) {
+ /*
+ * This is the first user hold for this dataset. Create
+ * the userrefs zap object.
+ */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ zapobj = ds->ds_phys->ds_userrefs_obj =
+ zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
+ } else {
+ zapobj = ds->ds_phys->ds_userrefs_obj;
+ }
+ ds->ds_userrefs++;
+ mutex_exit(&ds->ds_lock);
+
+ VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx));
+
+ spa_history_internal_log(LOG_DS_USER_HOLD,
+ ds->ds_dir->dd_pool->dp_spa, tx, cr, "<%s> dataset = %llu",
+ htag, ds->ds_object);
+}
+
+struct dsl_ds_holdarg {
+ dsl_sync_task_group_t *dstg;
+ char *htag;
+ char *snapname;
+ boolean_t recursive;
+ char failed[MAXPATHLEN];
+};
+
+static int
+dsl_dataset_user_hold_one(char *dsname, void *arg)
+{
+ struct dsl_ds_holdarg *ha = arg;
+ dsl_dataset_t *ds;
+ int error;
+ char *name;
+ size_t buflen;
+
+ /* alloc a buffer to hold dsname@snapname plus terminating NULL */
+ buflen = strlen(dsname) + strlen(ha->snapname) + 2;
+ name = kmem_alloc(buflen, KM_SLEEP);
+ (void) snprintf(name, buflen, "%s@%s", dsname, ha->snapname);
+ error = dsl_dataset_hold(name, ha->dstg, &ds);
+ kmem_free(name, buflen);
+ if (error == 0) {
+ dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check,
+ dsl_dataset_user_hold_sync, ds, ha->htag, 0);
+ } else if (error == ENOENT && ha->recursive) {
+ error = 0;
+ } else {
+ (void) strcpy(ha->failed, dsname);
+ }
+ return (error);
+}
+
+int
+dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
+ boolean_t recursive)
+{
+ struct dsl_ds_holdarg *ha;
+ dsl_sync_task_t *dst;
+ spa_t *spa;
+ int error;
+
+ ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
+
+ (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
+
+ error = spa_open(dsname, &spa, FTAG);
+ if (error) {
+ kmem_free(ha, sizeof (struct dsl_ds_holdarg));
+ return (error);
+ }
+
+ ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
+ ha->htag = htag;
+ ha->snapname = snapname;
+ ha->recursive = recursive;
+ if (recursive) {
+ error = dmu_objset_find(dsname, dsl_dataset_user_hold_one,
+ ha, DS_FIND_CHILDREN);
+ } else {
+ error = dsl_dataset_user_hold_one(dsname, ha);
+ }
+ if (error == 0)
+ error = dsl_sync_task_group_wait(ha->dstg);
+
+ for (dst = list_head(&ha->dstg->dstg_tasks); dst;
+ dst = list_next(&ha->dstg->dstg_tasks, dst)) {
+ dsl_dataset_t *ds = dst->dst_arg1;
+
+ if (dst->dst_err) {
+ dsl_dataset_name(ds, ha->failed);
+ *strchr(ha->failed, '@') = '\0';
+ }
+ dsl_dataset_rele(ds, ha->dstg);
+ }
+
+ if (error)
+ (void) strcpy(dsname, ha->failed);
+
+ dsl_sync_task_group_destroy(ha->dstg);
+ kmem_free(ha, sizeof (struct dsl_ds_holdarg));
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+struct dsl_ds_releasearg {
+ dsl_dataset_t *ds;
+ const char *htag;
+ boolean_t own; /* do we own or just hold ds? */
+};
+
+static int
+dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag,
+ boolean_t *might_destroy)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t zapobj;
+ uint64_t tmp;
+ int error;
+
+ *might_destroy = B_FALSE;
+
+ mutex_enter(&ds->ds_lock);
+ zapobj = ds->ds_phys->ds_userrefs_obj;
+ if (zapobj == 0) {
+ /* The tag can't possibly exist */
+ mutex_exit(&ds->ds_lock);
+ return (ESRCH);
+ }
+
+ /* Make sure the tag exists */
+ error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp);
+ if (error) {
+ mutex_exit(&ds->ds_lock);
+ if (error == ENOENT)
+ error = ESRCH;
+ return (error);
+ }
+
+ if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 &&
+ DS_IS_DEFER_DESTROY(ds))
+ *might_destroy = B_TRUE;
+
+ mutex_exit(&ds->ds_lock);
+ return (0);
+}
+
+static int
+dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx)
+{
+ struct dsl_ds_releasearg *ra = arg1;
+ dsl_dataset_t *ds = ra->ds;
+ boolean_t might_destroy;
+ int error;
+
+ if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
+ return (ENOTSUP);
+
+ error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy);
+ if (error)
+ return (error);
+
+ if (might_destroy) {
+ struct dsl_ds_destroyarg dsda = {0};
+
+ if (dmu_tx_is_syncing(tx)) {
+ /*
+ * If we're not prepared to remove the snapshot,
+ * we can't allow the release to happen right now.
+ */
+ if (!ra->own)
+ return (EBUSY);
+ if (ds->ds_user_ptr) {
+ ds->ds_user_evict_func(ds, ds->ds_user_ptr);
+ ds->ds_user_ptr = NULL;
+ }
+ }
+ dsda.ds = ds;
+ dsda.releasing = B_TRUE;
+ return (dsl_dataset_destroy_check(&dsda, tag, tx));
+ }
+
+ return (0);
+}
+
+static void
+dsl_dataset_user_release_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
+{
+ struct dsl_ds_releasearg *ra = arg1;
+ dsl_dataset_t *ds = ra->ds;
+ spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t zapobj;
+ uint64_t dsobj = ds->ds_object;
+ uint64_t refs;
+
+ mutex_enter(&ds->ds_lock);
+ ds->ds_userrefs--;
+ refs = ds->ds_userrefs;
+ mutex_exit(&ds->ds_lock);
+ zapobj = ds->ds_phys->ds_userrefs_obj;
+ VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx));
+ if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 &&
+ DS_IS_DEFER_DESTROY(ds)) {
+ struct dsl_ds_destroyarg dsda = {0};
+
+ ASSERT(ra->own);
+ dsda.ds = ds;
+ dsda.releasing = B_TRUE;
+ /* We already did the destroy_check */
+ dsl_dataset_destroy_sync(&dsda, tag, cr, tx);
+ }
+
+ spa_history_internal_log(LOG_DS_USER_RELEASE,
+ spa, tx, cr, "<%s> %lld dataset = %llu",
+ ra->htag, (longlong_t)refs, dsobj);
+}
+
+static int
+dsl_dataset_user_release_one(char *dsname, void *arg)
+{
+ struct dsl_ds_holdarg *ha = arg;
+ struct dsl_ds_releasearg *ra;
+ dsl_dataset_t *ds;
+ int error;
+ void *dtag = ha->dstg;
+ char *name;
+ size_t buflen;
+ boolean_t own = B_FALSE;
+ boolean_t might_destroy;
+
+ if (strlen(ha->htag) >= ZAP_MAXNAMELEN)
+ return (ENAMETOOLONG);
+
+ /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */
+ buflen = strlen(dsname) + strlen(ha->snapname) + 2;
+ name = kmem_alloc(buflen, KM_SLEEP);
+ (void) snprintf(name, buflen, "%s@%s", dsname, ha->snapname);
+ error = dsl_dataset_hold(name, dtag, &ds);
+ kmem_free(name, buflen);
+ if (error == ENOENT && ha->recursive)
+ return (0);
+ (void) strcpy(ha->failed, dsname);
+ if (error)
+ return (error);
+
+ ASSERT(dsl_dataset_is_snapshot(ds));
+
+ error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy);
+ if (error) {
+ dsl_dataset_rele(ds, dtag);
+ return (error);
+ }
+
+ if (might_destroy) {
+#ifdef _KERNEL
+ error = zfs_unmount_snap(name, NULL);
+ if (error) {
+ dsl_dataset_rele(ds, dtag);
+ return (error);
+ }
+ error = dsl_dataset_zvol_cleanup(ds, name);
+ if (error) {
+ dsl_dataset_rele(ds, dtag);
+ return (error);
+ }
+#endif
+ if (!dsl_dataset_tryown(ds,
+ DS_MODE_READONLY | DS_MODE_INCONSISTENT, dtag)) {
+ dsl_dataset_rele(ds, dtag);
+ return (EBUSY);
+ } else {
+ own = B_TRUE;
+ dsl_dataset_make_exclusive(ds, dtag);
+ }
+ }
+
+ ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP);
+ ra->ds = ds;
+ ra->htag = ha->htag;
+ ra->own = own;
+ dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check,
+ dsl_dataset_user_release_sync, ra, dtag, 0);
+
+ return (0);
+}
+
+int
+dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
+ boolean_t recursive)
+{
+ struct dsl_ds_holdarg *ha;
+ dsl_sync_task_t *dst;
+ spa_t *spa;
+ int error;
+
+ ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
+
+ (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
+
+ error = spa_open(dsname, &spa, FTAG);
+ if (error) {
+ kmem_free(ha, sizeof (struct dsl_ds_holdarg));
+ return (error);
+ }
+
+ ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
+ ha->htag = htag;
+ ha->snapname = snapname;
+ ha->recursive = recursive;
+ if (recursive) {
+ error = dmu_objset_find(dsname, dsl_dataset_user_release_one,
+ ha, DS_FIND_CHILDREN);
+ } else {
+ error = dsl_dataset_user_release_one(dsname, ha);
+ }
+ if (error == 0)
+ error = dsl_sync_task_group_wait(ha->dstg);
+
+ for (dst = list_head(&ha->dstg->dstg_tasks); dst;
+ dst = list_next(&ha->dstg->dstg_tasks, dst)) {
+ struct dsl_ds_releasearg *ra = dst->dst_arg1;
+ dsl_dataset_t *ds = ra->ds;
+
+ if (dst->dst_err)
+ dsl_dataset_name(ds, ha->failed);
+
+ if (ra->own)
+ dsl_dataset_disown(ds, ha->dstg);
+ else
+ dsl_dataset_rele(ds, ha->dstg);
+
+ kmem_free(ra, sizeof (struct dsl_ds_releasearg));
+ }
+
+ if (error)
+ (void) strcpy(dsname, ha->failed);
+
+ dsl_sync_task_group_destroy(ha->dstg);
+ kmem_free(ha, sizeof (struct dsl_ds_holdarg));
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+int
+dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp)
+{
+ dsl_dataset_t *ds;
+ int err;
+
+ err = dsl_dataset_hold(dsname, FTAG, &ds);
+ if (err)
+ return (err);
+
+ VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP));
+ if (ds->ds_phys->ds_userrefs_obj != 0) {
+ zap_attribute_t *za;
+ zap_cursor_t zc;
+
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+ for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_phys->ds_userrefs_obj);
+ zap_cursor_retrieve(&zc, za) == 0;
+ zap_cursor_advance(&zc)) {
+ VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name,
+ za->za_first_integer));
+ }
+ zap_cursor_fini(&zc);
+ kmem_free(za, sizeof (zap_attribute_t));
+ }
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
* The ZAP OBJ is referred to as the jump object.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
dsl_pool_t *dp;
void *cookie;
int error;
- char checkflag = ZFS_DELEG_LOCAL;
+ char checkflag;
objset_t *mos;
avl_tree_t permsets;
perm_set_t *setnode;
return (EPERM);
}
+ if (dsl_dataset_is_snapshot(ds)) {
+ /*
+ * Snapshots are treated as descendents only,
+ * local permissions do not apply.
+ */
+ checkflag = ZFS_DELEG_DESCENDENT;
+ } else {
+ checkflag = ZFS_DELEG_LOCAL;
+ }
+
avl_create(&permsets, perm_set_compare, sizeof (perm_set_t),
offsetof(perm_set_t, p_node));
#endif
if (dd == NULL) {
dsl_dir_t *winner;
- int err;
dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
dd->dd_object = ddobj;
}
void
-dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
+dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
cred_t *cr, dmu_tx_t *tx)
{
objset_t *mos = dd->dd_pool->dp_meta_objset;
int
dsl_pool_scrub_clean(dsl_pool_t *dp)
{
+ spa_t *spa = dp->dp_spa;
+
/*
* Purge all vdev caches. We do this here rather than in sync
* context because this requires a writer lock on the spa_config
* spa_scrub_reopen flag indicates that vdev_open() should not
* attempt to start another scrub.
*/
- spa_config_enter(dp->dp_spa, SCL_ALL, FTAG, RW_WRITER);
- dp->dp_spa->spa_scrub_reopen = B_TRUE;
- vdev_reopen(dp->dp_spa->spa_root_vdev);
- dp->dp_spa->spa_scrub_reopen = B_FALSE;
- spa_config_exit(dp->dp_spa, SCL_ALL, FTAG);
+ spa_vdev_state_enter(spa);
+ spa->spa_scrub_reopen = B_TRUE;
+ vdev_reopen(spa->spa_root_vdev);
+ spa->spa_scrub_reopen = B_FALSE;
+ (void) spa_vdev_state_exit(spa, NULL, 0);
return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN));
}
DMU_OT_SCRUB_QUEUE, /* ZAP */
DMU_OT_USERGROUP_USED, /* ZAP */
DMU_OT_USERGROUP_QUOTA, /* ZAP */
+ DMU_OT_USERREFS, /* ZAP */
DMU_OT_NUMTYPES
} dmu_object_type_t;
int dmu_objset_create(const char *name, dmu_objset_type_t type,
objset_t *clone_parent, uint64_t flags,
void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
-int dmu_objset_destroy(const char *name);
-int dmu_snapshots_destroy(char *fsname, char *snapname);
+int dmu_objset_destroy(const char *name, boolean_t defer);
+int dmu_snapshots_destroy(char *fsname, char *snapname, boolean_t defer);
int dmu_objset_rollback(objset_t *os);
int dmu_objset_snapshot(char *fsname, char *snapname, struct nvlist *props,
boolean_t recursive);
} dmu_recv_cookie_t;
int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *,
- boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *);
+ boolean_t force, objset_t *origin, dmu_recv_cookie_t *);
int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp);
int dmu_recv_end(dmu_recv_cookie_t *drc);
-void dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc);
/* CRC64 table */
#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
* ds_lock
* protects:
* ds_user_ptr
- * ds_user_evice_func
+ * ds_user_evict_func
* ds_open_refcount
* ds_snapname
* ds_phys accounting
+ * ds_phys userrefs zapobj
* ds_reserved
* held from:
* dsl_dataset_*
int dmu_objset_create(const char *name, dmu_objset_type_t type,
objset_t *clone_parent, uint64_t flags,
void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
-int dmu_objset_destroy(const char *name);
+int dmu_objset_destroy(const char *name, boolean_t defer);
int dmu_objset_rollback(objset_t *os);
int dmu_objset_snapshot(char *fsname, char *snapname, nvlist_t *props,
boolean_t recursive);
*/
#define DS_FLAG_UNIQUE_ACCURATE (1ULL<<2)
+/*
+ * DS_FLAG_DEFER_DESTROY is set after 'zfs destroy -d' has been called
+ * on a dataset. This allows the dataset to be destroyed using 'zfs release'.
+ */
+#define DS_FLAG_DEFER_DESTROY (1ULL<<3)
+#define DS_IS_DEFER_DESTROY(ds) \
+ ((ds)->ds_phys->ds_flags & DS_FLAG_DEFER_DESTROY)
+
/*
* DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
* name lookups should be performed case-insensitively.
blkptr_t ds_bp;
uint64_t ds_next_clones_obj; /* DMU_OT_DSL_CLONES */
uint64_t ds_props_obj; /* DMU_OT_DSL_PROPS for snaps */
- uint64_t ds_pad[6]; /* pad out to 320 bytes for good measure */
+ uint64_t ds_userrefs_obj; /* DMU_OT_USERREFS */
+ uint64_t ds_pad[5]; /* pad out to 320 bytes for good measure */
} dsl_dataset_phys_t;
typedef struct dsl_dataset {
/* has internal locking: */
bplist_t ds_deadlist;
+ /* to protect against multiple concurrent incremental recv */
+ kmutex_t ds_recvlock;
+
/* protected by lock on pool's dp_dirty_datasets list */
txg_node_t ds_dirty_link;
list_node_t ds_synced_link;
kmutex_t ds_lock;
void *ds_user_ptr;
dsl_dataset_evict_func_t *ds_user_evict_func;
+ uint64_t ds_userrefs;
/*
* ds_owner is protected by the ds_rwlock and the ds_lock
char ds_snapname[MAXNAMELEN];
} dsl_dataset_t;
+struct dsl_ds_destroyarg {
+ dsl_dataset_t *ds; /* ds to destroy */
+ dsl_dataset_t *rm_origin; /* also remove our origin? */
+ boolean_t is_origin_rm; /* set if removing origin snap */
+ boolean_t defer; /* destroy -d requested? */
+ boolean_t releasing; /* destroying due to release? */
+ boolean_t need_prep; /* do we need to retry due to EBUSY? */
+};
+
#define dsl_dataset_is_snapshot(ds) \
((ds)->ds_phys->ds_num_children != 0)
dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *);
uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
uint64_t flags, dmu_tx_t *tx);
-int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag);
-int dsl_snapshots_destroy(char *fsname, char *snapname);
+int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer);
+int dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer);
dsl_checkfunc_t dsl_dataset_destroy_check;
dsl_syncfunc_t dsl_dataset_destroy_sync;
dsl_checkfunc_t dsl_dataset_snapshot_check;
int dsl_dataset_promote(const char *name);
int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
boolean_t force);
+int dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
+ boolean_t recursive);
+int dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
+ boolean_t recursive);
+int dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp);
void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
void *p, dsl_dataset_evict_func_t func);
#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota"
#define ZFS_DELEG_PERM_USERUSED "userused"
#define ZFS_DELEG_PERM_GROUPUSED "groupused"
+#define ZFS_DELEG_PERM_HOLD "hold"
+#define ZFS_DELEG_PERM_RELEASE "release"
/*
* Note: the names of properties that are marked delegatable are also
int dsl_prop_set(const char *ddname, const char *propname,
int intsz, int numints, const void *buf);
int dsl_props_set(const char *dsname, nvlist_t *nvl);
-void dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
+void dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
cred_t *cr, dmu_tx_t *tx);
void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value);
char *his_buf);
extern int spa_history_log(spa_t *spa, const char *his_buf,
history_log_type_t what);
-void spa_history_internal_log(history_internal_events_t event, spa_t *spa,
- dmu_tx_t *tx, cred_t *cr, const char *fmt, ...);
+extern void spa_history_internal_log(history_internal_events_t event,
+ spa_t *spa, dmu_tx_t *tx, cred_t *cr, const char *fmt, ...);
+extern void spa_history_log_version(spa_t *spa, history_internal_events_t evt);
/* error handling */
struct zbookmark;
int spa_inject_ref; /* injection references */
uint8_t spa_sync_on; /* sync threads are running */
spa_load_state_t spa_load_state; /* current load operation */
+ boolean_t spa_load_verbatim; /* load the given config? */
taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
dsl_pool_t *spa_dsl_pool;
metaslab_class_t *spa_normal_class; /* normal data class */
extern boolean_t zfs_nocacheflush;
extern int vdev_open(vdev_t *);
+extern void vdev_open_children(vdev_t *vd);
extern int vdev_validate(vdev_t *);
extern void vdev_close(vdev_t *);
extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
space_map_t vdev_dtl[DTL_TYPES]; /* in-core dirty time logs */
vdev_stat_t vdev_stat; /* virtual device statistics */
boolean_t vdev_expanding; /* expand the vdev? */
+ int vdev_open_error; /* error on last open */
+ kthread_t *vdev_open_thread; /* thread opening children */
/*
* Top-level vdev state.
boolean_t *normalization_conflictp);
int zap_count_write(objset_t *os, uint64_t zapobj, const char *name,
- int add, uint64_t *towrite, uint64_t *tooverwrite,
- uint64_t dn_datablkshift);
+ int add, uint64_t *towrite, uint64_t *tooverwrite);
/*
* Create an attribute with the given name and value.
void zfs_ace_byteswap(void *, size_t, boolean_t);
extern boolean_t zfs_has_access(struct znode *zp, cred_t *cr);
extern int zfs_zaccess(struct znode *, int, int, boolean_t, cred_t *);
+int zfs_fastaccesschk_execute(struct znode *, cred_t *);
extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *);
extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *);
extern int zfs_acl_access(struct znode *, int, cred_t *);
dmu_objset_stats_t zc_objset_stats;
struct drr_begin zc_begin_record;
zinject_record_t zc_inject_record;
+ boolean_t zc_defer_destroy;
} zfs_cmd_t;
typedef struct zfs_useracct {
boolean_t z_vscan; /* virus scan on/off */
boolean_t z_use_fuids; /* version allows fuids */
boolean_t z_replay; /* set during ZIL replay */
- kmutex_t z_online_recv_lock; /* held while recv in progress */
uint64_t z_version; /* ZPL version */
uint64_t z_shares_dir; /* hidden shares dir */
kmutex_t z_lock;
#define ZFS_ACL_DEFAULTED 0x20 /* ACL should be defaulted */
#define ZFS_ACL_AUTO_INHERIT 0x40 /* ACL should be inherited */
#define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */
+#define ZFS_NO_EXECS_DENIED 0x100 /* exec was given to everyone */
/*
* Is ID ephemeral?
uint64_t z_gen; /* generation (same as zp_gen) */
uint32_t z_sync_cnt; /* synchronous open count */
kmutex_t z_acl_lock; /* acl data lock */
+ zfs_acl_t *z_acl_cached; /* cached acl */
list_node_t z_link_node; /* all znodes in fs link */
/*
* These are dmu managed fields.
#define ZIO_FLAG_GODFATHER 0x080000
#define ZIO_FLAG_TRYHARD 0x100000
+#define ZIO_FLAG_NODATA 0x200000
+#define ZIO_FLAG_OPTIONAL 0x400000
#define ZIO_FLAG_GANG_INHERIT \
(ZIO_FLAG_CANFAIL | \
ZIO_FLAG_IO_REPAIR | \
ZIO_FLAG_IO_RETRY | \
ZIO_FLAG_PROBE | \
- ZIO_FLAG_TRYHARD)
+ ZIO_FLAG_TRYHARD | \
+ ZIO_FLAG_NODATA | \
+ ZIO_FLAG_OPTIONAL)
#define ZIO_FLAG_AGG_INHERIT \
(ZIO_FLAG_DONT_AGGREGATE | \
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/refcount.h>
#include <sys/rrwlock.h>
rrw_node_t *prev = NULL;
if (refcount_count(&rrl->rr_linked_rcount) == 0)
- return (NULL);
+ return (B_FALSE);
for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
if (rn->rn_rrl == rrl) {
rrw_enter_read(rrwlock_t *rrl, void *tag)
{
mutex_enter(&rrl->rr_lock);
+#if !defined(DEBUG) && defined(_KERNEL)
+ if (!rrl->rr_writer && !rrl->rr_writer_wanted) {
+ rrl->rr_anon_rcount.rc_count++;
+ mutex_exit(&rrl->rr_lock);
+ return;
+ }
+ DTRACE_PROBE(zfs__rrwfastpath__rdmiss);
+#endif
ASSERT(rrl->rr_writer != curthread);
ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);
rrw_exit(rrwlock_t *rrl, void *tag)
{
mutex_enter(&rrl->rr_lock);
+#if !defined(DEBUG) && defined(_KERNEL)
+ if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) {
+ rrl->rr_anon_rcount.rc_count--;
+ if (rrl->rr_anon_rcount.rc_count == 0)
+ cv_broadcast(&rrl->rr_cv);
+ mutex_exit(&rrl->rr_lock);
+ return;
+ }
+ DTRACE_PROBE(zfs__rrwfastpath__exitmiss);
+#endif
ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) ||
!refcount_is_zero(&rrl->rr_linked_rcount) ||
rrl->rr_writer != NULL);
if (rrl->rr_writer == NULL) {
- if (rrn_find_and_remove(rrl)) {
- if (refcount_remove(&rrl->rr_linked_rcount, tag) == 0)
- cv_broadcast(&rrl->rr_cv);
-
- } else {
- if (refcount_remove(&rrl->rr_anon_rcount, tag) == 0)
- cv_broadcast(&rrl->rr_cv);
- }
+ int64_t count;
+ if (rrn_find_and_remove(rrl))
+ count = refcount_remove(&rrl->rr_linked_rcount, tag);
+ else
+ count = refcount_remove(&rrl->rr_anon_rcount, tag);
+ if (count == 0)
+ cv_broadcast(&rrl->rr_cv);
} else {
ASSERT(rrl->rr_writer == curthread);
ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) &&
/*
* If the config cache is stale, or we have uninitialized
* metaslabs (see spa_vdev_add()), then update the config.
+ *
+ * If spa_load_verbatim is true, trust the current
+ * in-core spa_config and update the disk labels.
*/
if (config_cache_txg != spa->spa_config_txg ||
- state == SPA_LOAD_IMPORT)
+ state == SPA_LOAD_IMPORT || spa->spa_load_verbatim)
need_update = B_TRUE;
for (int c = 0; c < rvd->vdev_children; c++)
if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
(void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
+ spa_history_log_version(spa, LOG_POOL_CREATE);
spa->spa_minref = refcount_count(&spa->spa_refcount);
spa = spa_add(pname, NULL);
spa->spa_is_root = B_TRUE;
+ spa->spa_load_verbatim = B_TRUE;
/*
* Build up a vdev tree based on the boot device's label config.
VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
error = 0;
+ spa_history_log_version(spa, LOG_POOL_IMPORT);
out:
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
vdev_free(rvd);
zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
spa = spa_add(pool, altroot);
+ spa->spa_load_verbatim = B_TRUE;
+
VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
if (props != NULL)
spa_config_sync(spa, B_FALSE, B_TRUE);
mutex_exit(&spa_namespace_lock);
+ spa_history_log_version(spa, LOG_POOL_IMPORT);
return (0);
}
/*
* Update the config cache to include the newly-imported pool.
*/
- spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, B_FALSE);
+ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
}
/*
spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
mutex_exit(&spa_namespace_lock);
+ spa_history_log_version(spa, LOG_POOL_IMPORT);
return (0);
}
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
vdev_ops_t *pvops;
- dmu_tx_t *tx;
char *oldvdpath, *newvdpath;
int newvd_isspare;
int error;
(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
- tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
- if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
- spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, tx,
- CRED(), "%s vdev=%s %s vdev=%s",
- replacing && newvd_isspare ? "spare in" :
- replacing ? "replace" : "attach", newvdpath,
- replacing ? "for" : "to", oldvdpath);
- dmu_tx_commit(tx);
- } else {
- dmu_tx_abort(tx);
- }
+ spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, NULL,
+ CRED(), "%s vdev=%s %s vdev=%s",
+ replacing && newvd_isspare ? "spare in" :
+ replacing ? "replace" : "attach", newvdpath,
+ replacing ? "for" : "to", oldvdpath);
spa_strfree(oldvdpath);
spa_strfree(newvdpath);
* then log an internal history event.
*/
if (space_update) {
- dmu_tx_t *tx;
-
- tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
- if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
- spa_history_internal_log(LOG_POOL_VDEV_ONLINE,
- spa, tx, CRED(),
- "pool '%s' size: %llu(+%llu)",
- spa_name(spa), spa_get_space(spa),
- space_update);
- dmu_tx_commit(tx);
- } else {
- dmu_tx_abort(tx);
- }
+ spa_history_internal_log(LOG_POOL_VDEV_ONLINE,
+ spa, NULL, CRED(),
+ "pool '%s' size: %llu(+%llu)",
+ spa_name(spa), spa_get_space(spa),
+ space_update);
}
}
ASSERT(MUTEX_HELD(&spa_namespace_lock));
- if (rootdir == NULL)
+ if (rootdir == NULL || !(spa_mode_global & FWRITE))
return;
/*
return (config);
}
-/*
- * For a pool that's not currently a booting rootpool, update all disk labels,
- * generate a fresh config based on the current in-core state, and sync the
- * global config cache.
- */
-void
-spa_config_update(spa_t *spa, int what)
-{
- spa_config_update_common(spa, what, FALSE);
-}
-
/*
* Update all disk labels, generate a fresh config based on the current
* in-core state, and sync the global config cache (do not sync the config
* cache if this is a booting rootpool).
*/
void
-spa_config_update_common(spa_t *spa, int what, boolean_t isroot)
+spa_config_update(spa_t *spa, int what)
{
vdev_t *rvd = spa->spa_root_vdev;
uint64_t txg;
/*
* Update the global config cache to reflect the new mosconfig.
*/
- if (!isroot)
+ if (!spa->spa_is_root)
spa_config_sync(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL);
if (what == SPA_CONFIG_UPDATE_POOL)
- spa_config_update_common(spa, SPA_CONFIG_UPDATE_VDEVS, isroot);
+ spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS);
}
return (err);
}
-void
-spa_history_internal_log(history_internal_events_t event, spa_t *spa,
- dmu_tx_t *tx, cred_t *cr, const char *fmt, ...)
+static void
+log_internal(history_internal_events_t event, spa_t *spa,
+ dmu_tx_t *tx, cred_t *cr, const char *fmt, va_list adx)
{
history_arg_t *hap;
char *str;
- va_list adx;
/*
* If this is part of creating a pool, not everything is
hap = kmem_alloc(sizeof (history_arg_t), KM_SLEEP);
str = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP);
- va_start(adx, fmt);
(void) vsnprintf(str, HIS_MAX_RECORD_LEN, fmt, adx);
- va_end(adx);
hap->ha_log_type = LOG_INTERNAL;
hap->ha_history_str = str;
}
/* spa_history_log_sync() will free hap and str */
}
+
+void
+spa_history_internal_log(history_internal_events_t event, spa_t *spa,
+ dmu_tx_t *tx, cred_t *cr, const char *fmt, ...)
+{
+ dmu_tx_t *htx = tx;
+ va_list adx;
+
+ /* create a tx if we didn't get one */
+ if (tx == NULL) {
+ htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ if (dmu_tx_assign(htx, TXG_WAIT) != 0) {
+ dmu_tx_abort(htx);
+ return;
+ }
+ }
+
+ va_start(adx, fmt);
+ log_internal(event, spa, htx, cr, fmt, adx);
+ va_end(adx);
+
+ /* if we didn't get a tx from the caller, commit the one we made */
+ if (tx == NULL)
+ dmu_tx_commit(htx);
+}
+
+void
+spa_history_log_version(spa_t *spa, history_internal_events_t event)
+{
+#ifdef _KERNEL
+ uint64_t current_vers = spa_version(spa);
+
+ if (current_vers >= SPA_VERSION_ZPOOL_HISTORY) {
+ spa_history_internal_log(event, spa, NULL, CRED(),
+ "pool spa %llu; zfs spa %llu; zpl %d; uts %s %s %s %s",
+ (u_longlong_t)current_vers, SPA_VERSION, ZPL_VERSION,
+ utsname.nodename, utsname.release, utsname.version,
+ utsname.machine);
+ }
+ cmn_err(CE_CONT, "!%s version %llu pool %s using %llu",
+ event == LOG_POOL_IMPORT ? "imported" :
+ event == LOG_POOL_CREATE ? "created" : "accessed",
+ (u_longlong_t)current_vers, spa_name(spa), SPA_VERSION);
+#endif
+}
void
spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw)
{
+ int wlocks_held = 0;
+
for (int i = 0; i < SCL_LOCKS; i++) {
spa_config_lock_t *scl = &spa->spa_config_lock[i];
+ if (scl->scl_writer == curthread)
+ wlocks_held |= (1 << i);
if (!(locks & (1 << i)))
continue;
mutex_enter(&scl->scl_lock);
(void) refcount_add(&scl->scl_count, tag);
mutex_exit(&scl->scl_lock);
}
+ ASSERT(wlocks_held <= locks);
}
void
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
&nparity) == 0) {
/*
- * Currently, we can only support 2 parity devices.
+ * Currently, we can only support 3 parity devices.
*/
- if (nparity == 0 || nparity > 2)
+ if (nparity == 0 || nparity > 3)
return (EINVAL);
/*
- * Older versions can only support 1 parity device.
+ * Previous versions could only support 1 or 2 parity
+ * device.
*/
- if (nparity == 2 &&
- spa_version(spa) < SPA_VERSION_RAID6)
+ if (nparity > 1 &&
+ spa_version(spa) < SPA_VERSION_RAIDZ2)
+ return (ENOTSUP);
+ if (nparity > 2 &&
+ spa_version(spa) < SPA_VERSION_RAIDZ3)
return (ENOTSUP);
} else {
/*
* We require the parity to be specified for SPAs that
* support multiple parity levels.
*/
- if (spa_version(spa) >= SPA_VERSION_RAID6)
+ if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
return (EINVAL);
/*
* Otherwise, we default to 1 parity device for RAID-Z.
return (NULL);
}
+static void
+vdev_open_child(void *arg)
+{
+ vdev_t *vd = arg;
+
+ vd->vdev_open_thread = curthread;
+ vd->vdev_open_error = vdev_open(vd);
+ vd->vdev_open_thread = NULL;
+}
+
+void
+vdev_open_children(vdev_t *vd)
+{
+ taskq_t *tq;
+ int children = vd->vdev_children;
+
+ tq = taskq_create("vdev_open", children, minclsyspri,
+ children, children, TASKQ_PREPOPULATE);
+
+ for (int c = 0; c < children; c++)
+ VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
+ TQ_SLEEP) != NULL);
+
+ taskq_destroy(tq);
+}
+
/*
* Prepare a virtual device for access.
*/
uint64_t asize, psize;
uint64_t ashift = 0;
- ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
-
+ ASSERT(vd->vdev_open_thread == curthread ||
+ spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
vd->vdev_state == VDEV_STATE_CANT_OPEN ||
vd->vdev_state == VDEV_STATE_OFFLINE);
nvlist_free(label);
- if (spa->spa_load_state == SPA_LOAD_OPEN &&
+ /*
+ * If spa->spa_load_verbatim is true, no need to check the
+ * state of the pool.
+ */
+ if (!spa->spa_load_verbatim &&
+ spa->spa_load_state == SPA_LOAD_OPEN &&
state != POOL_STATE_ACTIVE)
return (EBADF);
* into a crufty old storage pool.
*/
ASSERT(vd->vdev_nparity == 1 ||
- (vd->vdev_nparity == 2 &&
- spa_version(spa) >= SPA_VERSION_RAID6));
+ (vd->vdev_nparity <= 2 &&
+ spa_version(spa) >= SPA_VERSION_RAIDZ2) ||
+ (vd->vdev_nparity <= 3 &&
+ spa_version(spa) >= SPA_VERSION_RAIDZ3));
/*
* Note that we'll add the nparity tag even on storage pools
/*
* Initialize uberblock template.
*/
- ub = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd));
- bzero(ub, VDEV_UBERBLOCK_SIZE(vd));
+ ub = zio_buf_alloc(VDEV_UBERBLOCK_RING);
+ bzero(ub, VDEV_UBERBLOCK_RING);
*ub = spa->spa_uberblock;
ub->ub_txg = 0;
offsetof(vdev_label_t, vl_pad2),
VDEV_PAD_SIZE, NULL, NULL, flags);
- for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
- vdev_label_write(zio, vd, l, ub,
- VDEV_UBERBLOCK_OFFSET(vd, n),
- VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags);
- }
+ vdev_label_write(zio, vd, l, ub,
+ offsetof(vdev_label_t, vl_uberblock),
+ VDEV_UBERBLOCK_RING, NULL, NULL, flags);
}
error = zio_wait(zio);
nvlist_free(label);
zio_buf_free(pad2, VDEV_PAD_SIZE);
- zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd));
+ zio_buf_free(ub, VDEV_UBERBLOCK_RING);
zio_buf_free(vp, sizeof (vdev_phys_t));
/*
static int
vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
{
- vdev_t *cvd;
- uint64_t c;
int numerrors = 0;
- int ret, lasterror = 0;
+ int lasterror = 0;
if (vd->vdev_children == 0) {
vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
return (EINVAL);
}
- for (c = 0; c < vd->vdev_children; c++) {
- cvd = vd->vdev_child[c];
+ vdev_open_children(vd);
- if ((ret = vdev_open(cvd)) != 0) {
- lasterror = ret;
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (cvd->vdev_open_error) {
+ lasterror = cvd->vdev_open_error;
numerrors++;
continue;
}
static void
vdev_mirror_close(vdev_t *vd)
{
- uint64_t c;
-
- for (c = 0; c < vd->vdev_children; c++)
+ for (int c = 0; c < vd->vdev_children; c++)
vdev_close(vd->vdev_child[c]);
}
*/
#include <sys/zfs_context.h>
-#include <sys/spa.h>
+#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/avl.h>
int zfs_vdev_ramp_rate = 2;
/*
- * To reduce IOPs, we aggregate small adjacent i/os into one large i/o.
- * For read i/os, we also aggregate across small adjacency gaps.
+ * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
+ * For read I/Os, we also aggregate across small adjacency gaps; for writes
+ * we include spans of optional I/Os to aid aggregation at the disk even when
+ * they aren't able to help us aggregate at this level.
*/
int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
int zfs_vdev_read_gap_limit = 32 << 10;
+int zfs_vdev_write_gap_limit = 4 << 10;
/*
* Virtual device vector for disk I/O scheduling.
static zio_t *
vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
{
- zio_t *fio, *lio, *aio, *dio, *nio;
+ zio_t *fio, *lio, *aio, *dio, *nio, *mio;
avl_tree_t *t;
int flags;
uint64_t maxspan = zfs_vdev_aggregation_limit;
uint64_t maxgap;
+ int stretch;
+again:
ASSERT(MUTEX_HELD(&vq->vq_lock));
if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
/*
- * We can aggregate I/Os that are adjacent and of the
- * same flavor, as expressed by the AGG_INHERIT flags.
- * The latter is necessary so that certain attributes
- * of the I/O, such as whether it's a normal I/O or a
- * scrub/resilver, can be preserved in the aggregate.
+ * We can aggregate I/Os that are sufficiently adjacent and of
+ * the same flavor, as expressed by the AGG_INHERIT flags.
+ * The latter requirement is necessary so that certain
+ * attributes of the I/O, such as whether it's a normal I/O
+ * or a scrub/resilver, can be preserved in the aggregate.
+ * We can include optional I/Os, but don't allow them
+ * to begin a range as they add no benefit in that situation.
+ */
+
+ /*
+ * We keep track of the last non-optional I/O.
+ */
+ mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio;
+
+ /*
+ * Walk backwards through sufficiently contiguous I/Os
+ * recording the last non-option I/O.
*/
while ((dio = AVL_PREV(t, fio)) != NULL &&
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
- IO_SPAN(dio, lio) <= maxspan && IO_GAP(dio, fio) <= maxgap)
+ IO_SPAN(dio, lio) <= maxspan &&
+ IO_GAP(dio, fio) <= maxgap) {
fio = dio;
+ if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL))
+ mio = fio;
+ }
+ /*
+ * Skip any initial optional I/Os.
+ */
+ while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) {
+ fio = AVL_NEXT(t, fio);
+ ASSERT(fio != NULL);
+ }
+
+ /*
+ * Walk forward through sufficiently contiguous I/Os.
+ */
while ((dio = AVL_NEXT(t, lio)) != NULL &&
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
- IO_SPAN(fio, dio) <= maxspan && IO_GAP(lio, dio) <= maxgap)
+ IO_SPAN(fio, dio) <= maxspan &&
+ IO_GAP(lio, dio) <= maxgap) {
lio = dio;
+ if (!(lio->io_flags & ZIO_FLAG_OPTIONAL))
+ mio = lio;
+ }
+
+ /*
+ * Now that we've established the range of the I/O aggregation
+ * we must decide what to do with trailing optional I/Os.
+ * For reads, there's nothing to do. While we are unable to
+ * aggregate further, it's possible that a trailing optional
+ * I/O would allow the underlying device to aggregate with
+ * subsequent I/Os. We must therefore determine if the next
+ * non-optional I/O is close enough to make aggregation
+ * worthwhile.
+ */
+ stretch = B_FALSE;
+ if (t != &vq->vq_read_tree && mio != NULL) {
+ nio = lio;
+ while ((dio = AVL_NEXT(t, nio)) != NULL &&
+ IO_GAP(nio, dio) == 0 &&
+ IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) {
+ nio = dio;
+ if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
+ stretch = B_TRUE;
+ break;
+ }
+ }
+ }
+
+ if (stretch) {
+ /* This may be a no-op. */
+ VERIFY((dio = AVL_NEXT(t, lio)) != NULL);
+ dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
+ } else {
+ while (lio != mio && lio != fio) {
+ ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL);
+ lio = AVL_PREV(t, lio);
+ ASSERT(lio != NULL);
+ }
+ }
}
if (fio != lio) {
ASSERT(dio->io_type == aio->io_type);
ASSERT(dio->io_vdev_tree == t);
- if (dio->io_type == ZIO_TYPE_WRITE)
+ if (dio->io_flags & ZIO_FLAG_NODATA) {
+ ASSERT(dio->io_type == ZIO_TYPE_WRITE);
+ bzero((char *)aio->io_data + (dio->io_offset -
+ aio->io_offset), dio->io_size);
+ } else if (dio->io_type == ZIO_TYPE_WRITE) {
bcopy(dio->io_data, (char *)aio->io_data +
(dio->io_offset - aio->io_offset),
dio->io_size);
+ }
zio_add_child(dio, aio);
vdev_queue_io_remove(vq, dio);
ASSERT(fio->io_vdev_tree == t);
vdev_queue_io_remove(vq, fio);
+ /*
+ * If the I/O is or was optional and therefore has no data, we need to
+ * simply discard it. We need to drop the vdev queue's lock to avoid a
+ * deadlock that we could encounter since this I/O will complete
+ * immediately.
+ */
+ if (fio->io_flags & ZIO_FLAG_NODATA) {
+ mutex_exit(&vq->vq_lock);
+ zio_vdev_io_bypass(fio);
+ zio_execute(fio);
+ mutex_enter(&vq->vq_lock);
+ goto again;
+ }
+
avl_add(&vq->vq_pending_tree, fio);
return (fio);
/*
* Virtual device vector for RAID-Z.
*
- * This vdev supports both single and double parity. For single parity, we
- * use a simple XOR of all the data columns. For double parity, we use both
- * the simple XOR as well as a technique described in "The mathematics of
- * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
- * over the integers expressable in a single byte. Briefly, the operations on
- * the field are defined as follows:
+ * This vdev supports single, double, and triple parity. For single parity,
+ * we use a simple XOR of all the data columns. For double or triple parity,
+ * we use a special case of Reed-Solomon coding. This extends the
+ * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
+ * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
+ * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
+ * former is also based. The latter is designed to provide higher performance
+ * for writes.
+ *
+ * Note that the Plank paper claimed to support arbitrary N+M, but was then
+ * amended six years later identifying a critical flaw that invalidates its
+ * claims. Nevertheless, the technique can be adapted to work for up to
+ * triple parity. For additional parity, the amendment "Note: Correction to
+ * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
+ * is viable, but the additional complexity means that write performance will
+ * suffer.
+ *
+ * All of the methods above operate on a Galois field, defined over the
+ * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
+ * can be expressed with a single byte. Briefly, the operations on the
+ * field are defined as follows:
*
* o addition (+) is represented by a bitwise XOR
* o subtraction (-) is therefore identical to addition: A + B = A - B
* (A * 2)_0 = A_7
*
* In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
+ * As an aside, this multiplication is derived from the error correcting
+ * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
*
* Observe that any number in the field (except for 0) can be expressed as a
* power of 2 -- a generator for the field. We store a table of the powers of
* 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
* be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
- * than field addition). The inverse of a field element A (A^-1) is A^254.
+ * than field addition). The inverse of a field element A (A^-1) is therefore
+ * A ^ (255 - 1) = A^254.
*
- * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
- * can be expressed by field operations:
+ * The up-to-three parity columns, P, Q, R over several data columns,
+ * D_0, ... D_n-1, can be expressed by field operations:
*
* P = D_0 + D_1 + ... + D_n-2 + D_n-1
* Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
* = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
+ * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
+ * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
*
- * See the reconstruction code below for how P and Q can used individually or
- * in concert to recover missing data columns.
+ * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
+ * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
+ * independent coefficients. (There are no additional coefficients that have
+ * this property which is why the uncorrected Plank method breaks down.)
+ *
+ * See the reconstruction code below for how P, Q and R can used individually
+ * or in concert to recover missing data columns.
*/
typedef struct raidz_col {
} raidz_col_t;
typedef struct raidz_map {
- uint64_t rm_cols; /* Column count */
+ uint64_t rm_cols; /* Regular column count */
+ uint64_t rm_scols; /* Count including skipped columns */
uint64_t rm_bigcols; /* Number of oversized columns */
uint64_t rm_asize; /* Actual total I/O size */
uint64_t rm_missingdata; /* Count of missing data devices */
uint64_t rm_missingparity; /* Count of missing parity devices */
uint64_t rm_firstdatacol; /* First data column/parity count */
+ uint64_t rm_skipped; /* Skipped sectors for padding */
raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
} raidz_map_t;
#define VDEV_RAIDZ_P 0
#define VDEV_RAIDZ_Q 1
+#define VDEV_RAIDZ_R 2
+#define VDEV_RAIDZ_MAXPARITY 3
+
+#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
+#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
+
+/*
+ * We provide a mechanism to perform the field multiplication operation on a
+ * 64-bit value all at once rather than a byte at a time. This works by
+ * creating a mask from the top bit in each byte and using that to
+ * conditionally apply the XOR of 0x1d.
+ */
+#define VDEV_RAIDZ_64MUL_2(x, mask) \
+{ \
+ (mask) = (x) & 0x8080808080808080ULL; \
+ (mask) = ((mask) << 1) - ((mask) >> 7); \
+ (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
+ ((mask) & 0x1d1d1d1d1d1d1d1d); \
+}
-#define VDEV_RAIDZ_MAXPARITY 2
+#define VDEV_RAIDZ_64MUL_4(x, mask) \
+{ \
+ VDEV_RAIDZ_64MUL_2((x), mask); \
+ VDEV_RAIDZ_64MUL_2((x), mask); \
+}
-#define VDEV_RAIDZ_MUL_2(a) (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0))
+/*
+ * Force reconstruction to use the general purpose method.
+ */
+int vdev_raidz_default_to_general;
/*
* These two tables represent powers and logs of 2 in the Galois field defined
for (c = 0; c < rm->rm_firstdatacol; c++)
zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
- kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
+ kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}
static raidz_map_t *
uint64_t s = zio->io_size >> unit_shift;
uint64_t f = b % dcols;
uint64_t o = (b / dcols) << unit_shift;
- uint64_t q, r, c, bc, col, acols, coff, devidx;
+ uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
q = s / (dcols - nparity);
r = s - q * (dcols - nparity);
bc = (r == 0 ? 0 : r + nparity);
+ tot = s + nparity * (q + (r == 0 ? 0 : 1));
+
+ if (q == 0) {
+ acols = bc;
+ scols = MIN(dcols, roundup(bc, nparity + 1));
+ } else {
+ acols = dcols;
+ scols = dcols;
+ }
- acols = (q == 0 ? bc : dcols);
+ ASSERT3U(acols, <=, scols);
- rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
+ rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
rm->rm_cols = acols;
+ rm->rm_scols = scols;
rm->rm_bigcols = bc;
- rm->rm_asize = 0;
rm->rm_missingdata = 0;
rm->rm_missingparity = 0;
rm->rm_firstdatacol = nparity;
- for (c = 0; c < acols; c++) {
+ asize = 0;
+
+ for (c = 0; c < scols; c++) {
col = f + c;
coff = o;
if (col >= dcols) {
}
rm->rm_col[c].rc_devidx = col;
rm->rm_col[c].rc_offset = coff;
- rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
rm->rm_col[c].rc_data = NULL;
rm->rm_col[c].rc_error = 0;
rm->rm_col[c].rc_tried = 0;
rm->rm_col[c].rc_skipped = 0;
- rm->rm_asize += rm->rm_col[c].rc_size;
+
+ if (c >= acols)
+ rm->rm_col[c].rc_size = 0;
+ else if (c < bc)
+ rm->rm_col[c].rc_size = (q + 1) << unit_shift;
+ else
+ rm->rm_col[c].rc_size = q << unit_shift;
+
+ asize += rm->rm_col[c].rc_size;
}
- rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
+ ASSERT3U(asize, ==, tot << unit_shift);
+ rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
+ rm->rm_skipped = roundup(tot, nparity + 1) - tot;
+ ASSERT3U(rm->rm_asize - asize, ==, rm->rm_skipped << unit_shift);
+ ASSERT3U(rm->rm_skipped, <=, nparity);
for (c = 0; c < rm->rm_firstdatacol; c++)
rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
if (c == rm->rm_firstdatacol) {
ASSERT(ccount == pcount);
- for (i = 0; i < ccount; i++, p++, src++) {
+ for (i = 0; i < ccount; i++, src++, p++) {
*p = *src;
}
} else {
ASSERT(ccount <= pcount);
- for (i = 0; i < ccount; i++, p++, src++) {
+ for (i = 0; i < ccount; i++, src++, p++) {
*p ^= *src;
}
}
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
- uint64_t *q, *p, *src, pcount, ccount, mask, i;
+ uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
int c;
- pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+ pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
rm->rm_col[VDEV_RAIDZ_Q].rc_size);
src = rm->rm_col[c].rc_data;
p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
- ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
+
+ ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
if (c == rm->rm_firstdatacol) {
- ASSERT(ccount == pcount || ccount == 0);
- for (i = 0; i < ccount; i++, p++, q++, src++) {
- *q = *src;
+ ASSERT(ccnt == pcnt || ccnt == 0);
+ for (i = 0; i < ccnt; i++, src++, p++, q++) {
*p = *src;
+ *q = *src;
}
- for (; i < pcount; i++, p++, q++, src++) {
- *q = 0;
+ for (; i < pcnt; i++, src++, p++, q++) {
*p = 0;
+ *q = 0;
}
} else {
- ASSERT(ccount <= pcount);
+ ASSERT(ccnt <= pcnt);
/*
- * Rather than multiplying each byte individually (as
- * described above), we are able to handle 8 at once
- * by generating a mask based on the high bit in each
- * byte and using that to conditionally XOR in 0x1d.
+ * Apply the algorithm described above by multiplying
+ * the previous result and adding in the new value.
*/
- for (i = 0; i < ccount; i++, p++, q++, src++) {
- mask = *q & 0x8080808080808080ULL;
- mask = (mask << 1) - (mask >> 7);
- *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
- (mask & 0x1d1d1d1d1d1d1d1dULL);
+ for (i = 0; i < ccnt; i++, src++, p++, q++) {
+ *p ^= *src;
+
+ VDEV_RAIDZ_64MUL_2(*q, mask);
*q ^= *src;
+ }
+
+ /*
+ * Treat short columns as though they are full of 0s.
+ * Note that there's therefore nothing needed for P.
+ */
+ for (; i < pcnt; i++, q++) {
+ VDEV_RAIDZ_64MUL_2(*q, mask);
+ }
+ }
+ }
+}
+
+static void
+vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
+{
+ uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
+ int c;
+
+ pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+ ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
+ rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+ ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
+ rm->rm_col[VDEV_RAIDZ_R].rc_size);
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ src = rm->rm_col[c].rc_data;
+ p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+ q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+ r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
+
+ ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
+
+ if (c == rm->rm_firstdatacol) {
+ ASSERT(ccnt == pcnt || ccnt == 0);
+ for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
+ *p = *src;
+ *q = *src;
+ *r = *src;
+ }
+ for (; i < pcnt; i++, src++, p++, q++, r++) {
+ *p = 0;
+ *q = 0;
+ *r = 0;
+ }
+ } else {
+ ASSERT(ccnt <= pcnt);
+
+ /*
+ * Apply the algorithm described above by multiplying
+ * the previous result and adding in the new value.
+ */
+ for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
*p ^= *src;
+
+ VDEV_RAIDZ_64MUL_2(*q, mask);
+ *q ^= *src;
+
+ VDEV_RAIDZ_64MUL_4(*r, mask);
+ *r ^= *src;
}
/*
* Treat short columns as though they are full of 0s.
+ * Note that there's therefore nothing needed for P.
*/
- for (; i < pcount; i++, q++) {
- mask = *q & 0x8080808080808080ULL;
- mask = (mask << 1) - (mask >> 7);
- *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
- (mask & 0x1d1d1d1d1d1d1d1dULL);
+ for (; i < pcnt; i++, q++, r++) {
+ VDEV_RAIDZ_64MUL_2(*q, mask);
+ VDEV_RAIDZ_64MUL_4(*r, mask);
}
}
}
}
+/*
+ * Generate RAID parity in the first virtual columns according to the number of
+ * parity columns available.
+ */
static void
-vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
+vdev_raidz_generate_parity(raidz_map_t *rm)
+{
+ switch (rm->rm_firstdatacol) {
+ case 1:
+ vdev_raidz_generate_parity_p(rm);
+ break;
+ case 2:
+ vdev_raidz_generate_parity_pq(rm);
+ break;
+ case 3:
+ vdev_raidz_generate_parity_pqr(rm);
+ break;
+ default:
+ cmn_err(CE_PANIC, "invalid RAID-Z configuration");
+ }
+}
+
+static int
+vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
uint64_t *dst, *src, xcount, ccount, count, i;
+ int x = tgts[0];
int c;
+ ASSERT(ntgts == 1);
+ ASSERT(x >= rm->rm_firstdatacol);
+ ASSERT(x < rm->rm_cols);
+
xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
ASSERT(xcount > 0);
*dst ^= *src;
}
}
+
+ return (1 << VDEV_RAIDZ_P);
}
-static void
-vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
+static int
+vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
uint64_t *dst, *src, xcount, ccount, count, mask, i;
uint8_t *b;
+ int x = tgts[0];
int c, j, exp;
+ ASSERT(ntgts == 1);
+
xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
}
} else {
- /*
- * For an explanation of this, see the comment in
- * vdev_raidz_generate_parity_pq() above.
- */
for (i = 0; i < count; i++, dst++, src++) {
- mask = *dst & 0x8080808080808080ULL;
- mask = (mask << 1) - (mask >> 7);
- *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
- (mask & 0x1d1d1d1d1d1d1d1dULL);
+ VDEV_RAIDZ_64MUL_2(*dst, mask);
*dst ^= *src;
}
for (; i < xcount; i++, dst++) {
- mask = *dst & 0x8080808080808080ULL;
- mask = (mask << 1) - (mask >> 7);
- *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
- (mask & 0x1d1d1d1d1d1d1d1dULL);
+ VDEV_RAIDZ_64MUL_2(*dst, mask);
}
}
}
*b = vdev_raidz_exp2(*b, exp);
}
}
+
+ return (1 << VDEV_RAIDZ_Q);
}
-static void
-vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
+static int
+vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
void *pdata, *qdata;
uint64_t xsize, ysize, i;
+ int x = tgts[0];
+ int y = tgts[1];
+ ASSERT(ntgts == 2);
ASSERT(x < y);
ASSERT(x >= rm->rm_firstdatacol);
ASSERT(y < rm->rm_cols);
*/
rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
+
+ return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
+}
+
+/* BEGIN CSTYLED */
+/*
+ * In the general case of reconstruction, we must solve the system of linear
+ * equations defined by the coeffecients used to generate parity as well as
+ * the contents of the data and parity disks. This can be expressed with
+ * vectors for the original data (D) and the actual data (d) and parity (p)
+ * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
+ *
+ * __ __ __ __
+ * | | __ __ | p_0 |
+ * | V | | D_0 | | p_m-1 |
+ * | | x | : | = | d_0 |
+ * | I | | D_n-1 | | : |
+ * | | ~~ ~~ | d_n-1 |
+ * ~~ ~~ ~~ ~~
+ *
+ * I is simply a square identity matrix of size n, and V is a vandermonde
+ * matrix defined by the coeffecients we chose for the various parity columns
+ * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
+ * computation as well as linear separability.
+ *
+ * __ __ __ __
+ * | 1 .. 1 1 1 | | p_0 |
+ * | 2^n-1 .. 4 2 1 | __ __ | : |
+ * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
+ * | 1 .. 0 0 0 | | D_1 | | d_0 |
+ * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
+ * | : : : : | | : | | d_2 |
+ * | 0 .. 1 0 0 | | D_n-1 | | : |
+ * | 0 .. 0 1 0 | ~~ ~~ | : |
+ * | 0 .. 0 0 1 | | d_n-1 |
+ * ~~ ~~ ~~ ~~
+ *
+ * Note that I, V, d, and p are known. To compute D, we must invert the
+ * matrix and use the known data and parity values to reconstruct the unknown
+ * data values. We begin by removing the rows in V|I and d|p that correspond
+ * to failed or missing columns; we then make V|I square (n x n) and d|p
+ * sized n by removing rows corresponding to unused parity from the bottom up
+ * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
+ * using Gauss-Jordan elimination. In the example below we use m=3 parity
+ * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
+ * __ __
+ * | 1 1 1 1 1 1 1 1 |
+ * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
+ * | 19 205 116 29 64 16 4 1 | / /
+ * | 1 0 0 0 0 0 0 0 | / /
+ * | 0 1 0 0 0 0 0 0 | <--' /
+ * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
+ * | 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 1 1 1 1 1 1 1 |
+ * | 128 64 32 16 8 4 2 1 |
+ * | 19 205 116 29 64 16 4 1 |
+ * | 1 0 0 0 0 0 0 0 |
+ * | 0 1 0 0 0 0 0 0 |
+ * (V|I)' = | 0 0 1 0 0 0 0 0 |
+ * | 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ *
+ * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
+ * have carefully chosen the seed values 1, 2, and 4 to ensure that this
+ * matrix is not singular.
+ * __ __
+ * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
+ * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
+ * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
+ * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
+ * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
+ * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
+ * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 0 0 1 0 0 0 0 0 |
+ * | 167 100 5 41 159 169 217 208 |
+ * | 166 100 4 40 158 168 216 209 |
+ * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ *
+ * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
+ * of the missing data.
+ *
+ * As is apparent from the example above, the only non-trivial rows in the
+ * inverse matrix correspond to the data disks that we're trying to
+ * reconstruct. Indeed, those are the only rows we need as the others would
+ * only be useful for reconstructing data known or assumed to be valid. For
+ * that reason, we only build the coefficients in the rows that correspond to
+ * targeted columns.
+ */
+/* END CSTYLED */
+
+static void
+vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
+ uint8_t **rows)
+{
+ int i, j;
+ int pow;
+
+ ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
+
+ /*
+ * Fill in the missing rows of interest.
+ */
+ for (i = 0; i < nmap; i++) {
+ ASSERT3S(0, <=, map[i]);
+ ASSERT3S(map[i], <=, 2);
+
+ pow = map[i] * n;
+ if (pow > 255)
+ pow -= 255;
+ ASSERT(pow <= 255);
+
+ for (j = 0; j < n; j++) {
+ pow -= map[i];
+ if (pow < 0)
+ pow += 255;
+ rows[i][j] = vdev_raidz_pow2[pow];
+ }
+ }
+}
+
+static void
+vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
+ uint8_t **rows, uint8_t **invrows, const uint8_t *used)
+{
+ int i, j, ii, jj;
+ uint8_t log;
+
+ /*
+ * Assert that the first nmissing entries from the array of used
+ * columns correspond to parity columns and that subsequent entries
+ * correspond to data columns.
+ */
+ for (i = 0; i < nmissing; i++) {
+ ASSERT3S(used[i], <, rm->rm_firstdatacol);
+ }
+ for (; i < n; i++) {
+ ASSERT3S(used[i], >=, rm->rm_firstdatacol);
+ }
+
+ /*
+ * First initialize the storage where we'll compute the inverse rows.
+ */
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < n; j++) {
+ invrows[i][j] = (i == j) ? 1 : 0;
+ }
+ }
+
+ /*
+ * Subtract all trivial rows from the rows of consequence.
+ */
+ for (i = 0; i < nmissing; i++) {
+ for (j = nmissing; j < n; j++) {
+ ASSERT3U(used[j], >=, rm->rm_firstdatacol);
+ jj = used[j] - rm->rm_firstdatacol;
+ ASSERT3S(jj, <, n);
+ invrows[i][j] = rows[i][jj];
+ rows[i][jj] = 0;
+ }
+ }
+
+ /*
+ * For each of the rows of interest, we must normalize it and subtract
+ * a multiple of it from the other rows.
+ */
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < missing[i]; j++) {
+ ASSERT3U(rows[i][j], ==, 0);
+ }
+ ASSERT3U(rows[i][missing[i]], !=, 0);
+
+ /*
+ * Compute the inverse of the first element and multiply each
+ * element in the row by that value.
+ */
+ log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
+
+ for (j = 0; j < n; j++) {
+ rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
+ invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
+ }
+
+ for (ii = 0; ii < nmissing; ii++) {
+ if (i == ii)
+ continue;
+
+ ASSERT3U(rows[ii][missing[i]], !=, 0);
+
+ log = vdev_raidz_log2[rows[ii][missing[i]]];
+
+ for (j = 0; j < n; j++) {
+ rows[ii][j] ^=
+ vdev_raidz_exp2(rows[i][j], log);
+ invrows[ii][j] ^=
+ vdev_raidz_exp2(invrows[i][j], log);
+ }
+ }
+ }
+
+ /*
+ * Verify that the data that is left in the rows are properly part of
+ * an identity matrix.
+ */
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < n; j++) {
+ if (j == missing[i]) {
+ ASSERT3U(rows[i][j], ==, 1);
+ } else {
+ ASSERT3U(rows[i][j], ==, 0);
+ }
+ }
+ }
}
+static void
+vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
+ int *missing, uint8_t **invrows, const uint8_t *used)
+{
+ int i, j, x, cc, c;
+ uint8_t *src;
+ uint64_t ccount;
+ uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
+ uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
+ uint8_t log, val;
+ int ll;
+ uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
+ uint8_t *p, *pp;
+ size_t psize;
+
+ psize = sizeof (invlog[0][0]) * n * nmissing;
+ p = kmem_alloc(psize, KM_SLEEP);
+
+ for (pp = p, i = 0; i < nmissing; i++) {
+ invlog[i] = pp;
+ pp += n;
+ }
+
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < n; j++) {
+ ASSERT3U(invrows[i][j], !=, 0);
+ invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
+ }
+ }
+
+ for (i = 0; i < n; i++) {
+ c = used[i];
+ ASSERT3U(c, <, rm->rm_cols);
+
+ src = rm->rm_col[c].rc_data;
+ ccount = rm->rm_col[c].rc_size;
+ for (j = 0; j < nmissing; j++) {
+ cc = missing[j] + rm->rm_firstdatacol;
+ ASSERT3U(cc, >=, rm->rm_firstdatacol);
+ ASSERT3U(cc, <, rm->rm_cols);
+ ASSERT3U(cc, !=, c);
+
+ dst[j] = rm->rm_col[cc].rc_data;
+ dcount[j] = rm->rm_col[cc].rc_size;
+ }
+
+ ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
+
+ for (x = 0; x < ccount; x++, src++) {
+ if (*src != 0)
+ log = vdev_raidz_log2[*src];
+
+ for (cc = 0; cc < nmissing; cc++) {
+ if (x >= dcount[cc])
+ continue;
+
+ if (*src == 0) {
+ val = 0;
+ } else {
+ if ((ll = log + invlog[cc][i]) >= 255)
+ ll -= 255;
+ val = vdev_raidz_pow2[ll];
+ }
+
+ if (i == 0)
+ dst[cc][x] = val;
+ else
+ dst[cc][x] ^= val;
+ }
+ }
+ }
+
+ kmem_free(p, psize);
+}
+
+static int
+vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
+{
+ int n, i, c, t, tt;
+ int nmissing_rows;
+ int missing_rows[VDEV_RAIDZ_MAXPARITY];
+ int parity_map[VDEV_RAIDZ_MAXPARITY];
+
+ uint8_t *p, *pp;
+ size_t psize;
+
+ uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
+ uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
+ uint8_t *used;
+
+ int code = 0;
+
+
+ n = rm->rm_cols - rm->rm_firstdatacol;
+
+ /*
+ * Figure out which data columns are missing.
+ */
+ nmissing_rows = 0;
+ for (t = 0; t < ntgts; t++) {
+ if (tgts[t] >= rm->rm_firstdatacol) {
+ missing_rows[nmissing_rows++] =
+ tgts[t] - rm->rm_firstdatacol;
+ }
+ }
+
+ /*
+ * Figure out which parity columns to use to help generate the missing
+ * data columns.
+ */
+ for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
+ ASSERT(tt < ntgts);
+ ASSERT(c < rm->rm_firstdatacol);
+
+ /*
+ * Skip any targeted parity columns.
+ */
+ if (c == tgts[tt]) {
+ tt++;
+ continue;
+ }
+
+ code |= 1 << c;
+
+ parity_map[i] = c;
+ i++;
+ }
+
+ ASSERT(code != 0);
+ ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
+
+ psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
+ nmissing_rows * n + sizeof (used[0]) * n;
+ p = kmem_alloc(psize, KM_SLEEP);
+
+ for (pp = p, i = 0; i < nmissing_rows; i++) {
+ rows[i] = pp;
+ pp += n;
+ invrows[i] = pp;
+ pp += n;
+ }
+ used = pp;
+
+ for (i = 0; i < nmissing_rows; i++) {
+ used[i] = parity_map[i];
+ }
+
+ for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ if (tt < nmissing_rows &&
+ c == missing_rows[tt] + rm->rm_firstdatacol) {
+ tt++;
+ continue;
+ }
+
+ ASSERT3S(i, <, n);
+ used[i] = c;
+ i++;
+ }
+
+ /*
+ * Initialize the interesting rows of the matrix.
+ */
+ vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
+
+ /*
+ * Invert the matrix.
+ */
+ vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
+ invrows, used);
+
+ /*
+ * Reconstruct the missing data using the generated matrix.
+ */
+ vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
+ invrows, used);
+
+ kmem_free(p, psize);
+
+ return (code);
+}
+
+static int
+vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
+{
+ int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
+ int ntgts;
+ int i, c;
+ int code;
+ int nbadparity, nbaddata;
+ int parity_valid[VDEV_RAIDZ_MAXPARITY];
+
+ /*
+ * The tgts list must already be sorted.
+ */
+ for (i = 1; i < nt; i++) {
+ ASSERT(t[i] > t[i - 1]);
+ }
+
+ nbadparity = rm->rm_firstdatacol;
+ nbaddata = rm->rm_cols - nbadparity;
+ ntgts = 0;
+ for (i = 0, c = 0; c < rm->rm_cols; c++) {
+ if (c < rm->rm_firstdatacol)
+ parity_valid[c] = B_FALSE;
+
+ if (i < nt && c == t[i]) {
+ tgts[ntgts++] = c;
+ i++;
+ } else if (rm->rm_col[c].rc_error != 0) {
+ tgts[ntgts++] = c;
+ } else if (c >= rm->rm_firstdatacol) {
+ nbaddata--;
+ } else {
+ parity_valid[c] = B_TRUE;
+ nbadparity--;
+ }
+ }
+
+ ASSERT(ntgts >= nt);
+ ASSERT(nbaddata >= 0);
+ ASSERT(nbaddata + nbadparity == ntgts);
+
+ dt = &tgts[nbadparity];
+
+ /*
+ * See if we can use any of our optimized reconstruction routines.
+ */
+ if (!vdev_raidz_default_to_general) {
+ switch (nbaddata) {
+ case 1:
+ if (parity_valid[VDEV_RAIDZ_P])
+ return (vdev_raidz_reconstruct_p(rm, dt, 1));
+
+ ASSERT(rm->rm_firstdatacol > 1);
+
+ if (parity_valid[VDEV_RAIDZ_Q])
+ return (vdev_raidz_reconstruct_q(rm, dt, 1));
+
+ ASSERT(rm->rm_firstdatacol > 2);
+ break;
+
+ case 2:
+ ASSERT(rm->rm_firstdatacol > 1);
+
+ if (parity_valid[VDEV_RAIDZ_P] &&
+ parity_valid[VDEV_RAIDZ_Q])
+ return (vdev_raidz_reconstruct_pq(rm, dt, 2));
+
+ ASSERT(rm->rm_firstdatacol > 2);
+
+ break;
+ }
+ }
+
+ code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
+ ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
+ ASSERT(code > 0);
+ return (code);
+}
static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
{
vdev_t *cvd;
uint64_t nparity = vd->vdev_nparity;
- int c, error;
+ int c;
int lasterror = 0;
int numerrors = 0;
return (EINVAL);
}
+ vdev_open_children(vd);
+
for (c = 0; c < vd->vdev_children; c++) {
cvd = vd->vdev_child[c];
- if ((error = vdev_open(cvd)) != 0) {
- lasterror = error;
+ if (cvd->vdev_open_error != 0) {
+ lasterror = cvd->vdev_open_error;
numerrors++;
continue;
}
blkptr_t *bp = zio->io_bp;
raidz_map_t *rm;
raidz_col_t *rc;
- int c;
+ int c, i;
rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
vd->vdev_nparity);
ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
if (zio->io_type == ZIO_TYPE_WRITE) {
- /*
- * Generate RAID parity in the first virtual columns.
- */
- if (rm->rm_firstdatacol == 1)
- vdev_raidz_generate_parity_p(rm);
- else
- vdev_raidz_generate_parity_pq(rm);
+ vdev_raidz_generate_parity(rm);
for (c = 0; c < rm->rm_cols; c++) {
rc = &rm->rm_col[c];
vdev_raidz_child_done, rc));
}
+ /*
+ * Generate optional I/Os for any skipped sectors to improve
+ * aggregation contiguity.
+ */
+ for (c = rm->rm_bigcols, i = 0; i < rm->rm_skipped; c++, i++) {
+ ASSERT(c <= rm->rm_scols);
+ if (c == rm->rm_scols)
+ c = 0;
+ rc = &rm->rm_col[c];
+ cvd = vd->vdev_child[rc->rc_devidx];
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset + rc->rc_size, NULL,
+ 1 << tvd->vdev_ashift,
+ zio->io_type, zio->io_priority,
+ ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
+ }
+
return (ZIO_PIPELINE_CONTINUE);
}
/*
* Iterate over the columns in reverse order so that we hit the parity
- * last -- any errors along the way will force us to read the parity
- * data.
+ * last -- any errors along the way will force us to read the parity.
*/
for (c = rm->rm_cols - 1; c >= 0; c--) {
rc = &rm->rm_col[c];
bcopy(rc->rc_data, orig[c], rc->rc_size);
}
- if (rm->rm_firstdatacol == 1)
- vdev_raidz_generate_parity_p(rm);
- else
- vdev_raidz_generate_parity_pq(rm);
+ vdev_raidz_generate_parity(rm);
for (c = 0; c < rm->rm_firstdatacol; c++) {
rc = &rm->rm_col[c];
return (ret);
}
-static uint64_t raidz_corrected_p;
-static uint64_t raidz_corrected_q;
-static uint64_t raidz_corrected_pq;
+/*
+ * Keep statistics on all the ways that we used parity to correct data.
+ */
+static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
static int
vdev_raidz_worst_error(raidz_map_t *rm)
return (error);
}
+/*
+ * Iterate over all combinations of bad data and attempt a reconstruction.
+ * Note that the algorithm below is non-optimal because it doesn't take into
+ * account how reconstruction is actually performed. For example, with
+ * triple-parity RAID-Z the reconstruction procedure is the same if column 4
+ * is targeted as invalid as if columns 1 and 4 are targeted since in both
+ * cases we'd only use parity information in column 0.
+ */
+static int
+vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
+{
+ raidz_map_t *rm = zio->io_vsd;
+ raidz_col_t *rc;
+ void *orig[VDEV_RAIDZ_MAXPARITY];
+ int tstore[VDEV_RAIDZ_MAXPARITY + 2];
+ int *tgts = &tstore[1];
+ int current, next, i, c, n;
+ int code, ret = 0;
+
+ ASSERT(total_errors < rm->rm_firstdatacol);
+
+ /*
+ * This simplifies one edge condition.
+ */
+ tgts[-1] = -1;
+
+ for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
+ /*
+ * Initialize the targets array by finding the first n columns
+ * that contain no error.
+ *
+ * If there were no data errors, we need to ensure that we're
+ * always explicitly attempting to reconstruct at least one
+ * data column. To do this, we simply push the highest target
+ * up into the data columns.
+ */
+ for (c = 0, i = 0; i < n; i++) {
+ if (i == n - 1 && data_errors == 0 &&
+ c < rm->rm_firstdatacol) {
+ c = rm->rm_firstdatacol;
+ }
+
+ while (rm->rm_col[c].rc_error != 0) {
+ c++;
+ ASSERT3S(c, <, rm->rm_cols);
+ }
+
+ tgts[i] = c++;
+ }
+
+ /*
+ * Setting tgts[n] simplifies the other edge condition.
+ */
+ tgts[n] = rm->rm_cols;
+
+ /*
+ * These buffers were allocated in previous iterations.
+ */
+ for (i = 0; i < n - 1; i++) {
+ ASSERT(orig[i] != NULL);
+ }
+
+ orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
+
+ current = 0;
+ next = tgts[current];
+
+ while (current != n) {
+ tgts[current] = next;
+ current = 0;
+
+ /*
+ * Save off the original data that we're going to
+ * attempt to reconstruct.
+ */
+ for (i = 0; i < n; i++) {
+ ASSERT(orig[i] != NULL);
+ c = tgts[i];
+ ASSERT3S(c, >=, 0);
+ ASSERT3S(c, <, rm->rm_cols);
+ rc = &rm->rm_col[c];
+ bcopy(rc->rc_data, orig[i], rc->rc_size);
+ }
+
+ /*
+ * Attempt a reconstruction and exit the outer loop on
+ * success.
+ */
+ code = vdev_raidz_reconstruct(rm, tgts, n);
+ if (zio_checksum_error(zio) == 0) {
+ atomic_inc_64(&raidz_corrected[code]);
+
+ for (i = 0; i < n; i++) {
+ c = tgts[i];
+ rc = &rm->rm_col[c];
+ ASSERT(rc->rc_error == 0);
+ if (rc->rc_tried)
+ raidz_checksum_error(zio, rc);
+ rc->rc_error = ECKSUM;
+ }
+
+ ret = code;
+ goto done;
+ }
+
+ /*
+ * Restore the original data.
+ */
+ for (i = 0; i < n; i++) {
+ c = tgts[i];
+ rc = &rm->rm_col[c];
+ bcopy(orig[i], rc->rc_data, rc->rc_size);
+ }
+
+ do {
+ /*
+ * Find the next valid column after the current
+ * position..
+ */
+ for (next = tgts[current] + 1;
+ next < rm->rm_cols &&
+ rm->rm_col[next].rc_error != 0; next++)
+ continue;
+
+ ASSERT(next <= tgts[current + 1]);
+
+ /*
+ * If that spot is available, we're done here.
+ */
+ if (next != tgts[current + 1])
+ break;
+
+ /*
+ * Otherwise, find the next valid column after
+ * the previous position.
+ */
+ for (c = tgts[current - 1] + 1;
+ rm->rm_col[c].rc_error != 0; c++)
+ continue;
+
+ tgts[current] = c;
+ current++;
+
+ } while (current != n);
+ }
+ }
+ n--;
+done:
+ for (i = 0; i < n; i++) {
+ zio_buf_free(orig[i], rm->rm_col[0].rc_size);
+ }
+
+ return (ret);
+}
+
static void
vdev_raidz_io_done(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
vdev_t *cvd;
raidz_map_t *rm = zio->io_vsd;
- raidz_col_t *rc, *rc1;
+ raidz_col_t *rc;
int unexpected_errors = 0;
int parity_errors = 0;
int parity_untried = 0;
int data_errors = 0;
int total_errors = 0;
- int n, c, c1;
+ int n, c;
+ int tgts[VDEV_RAIDZ_MAXPARITY];
+ int code;
ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
* any errors.
*/
if (total_errors <= rm->rm_firstdatacol - parity_untried) {
- switch (data_errors) {
- case 0:
+ if (data_errors == 0) {
if (zio_checksum_error(zio) == 0) {
/*
* If we read parity information (unnecessarily
}
goto done;
}
- break;
-
- case 1:
+ } else {
/*
* We either attempt to read all the parity columns or
* none of them. If we didn't try to read parity, we
ASSERT(parity_errors < rm->rm_firstdatacol);
/*
- * Find the column that reported the error.
+ * Identify the data columns that reported an error.
*/
+ n = 0;
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
rc = &rm->rm_col[c];
- if (rc->rc_error != 0)
- break;
+ if (rc->rc_error != 0) {
+ ASSERT(n < VDEV_RAIDZ_MAXPARITY);
+ tgts[n++] = c;
+ }
}
- ASSERT(c != rm->rm_cols);
- ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
- rc->rc_error == ESTALE);
- if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
- vdev_raidz_reconstruct_p(rm, c);
- } else {
- ASSERT(rm->rm_firstdatacol > 1);
- vdev_raidz_reconstruct_q(rm, c);
- }
+ ASSERT(rm->rm_firstdatacol >= n);
+
+ code = vdev_raidz_reconstruct(rm, tgts, n);
if (zio_checksum_error(zio) == 0) {
- if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
- atomic_inc_64(&raidz_corrected_p);
- else
- atomic_inc_64(&raidz_corrected_q);
+ atomic_inc_64(&raidz_corrected[code]);
/*
- * If there's more than one parity disk that
- * was successfully read, confirm that the
- * other parity disk produced the correct data.
- * This routine is suboptimal in that it
- * regenerates both the parity we wish to test
- * as well as the parity we just used to
- * perform the reconstruction, but this should
- * be a relatively uncommon case, and can be
- * optimized if it becomes a problem.
- * We also regenerate parity when resilvering
- * so we can write it out to the failed device
- * later.
+ * If we read more parity disks than were used
+ * for reconstruction, confirm that the other
+ * parity disks produced correct data. This
+ * routine is suboptimal in that it regenerates
+ * the parity that we already used in addition
+ * to the parity that we're attempting to
+ * verify, but this should be a relatively
+ * uncommon case, and can be optimized if it
+ * becomes a problem. Note that we regenerate
+ * parity when resilvering so we can write it
+ * out to failed devices later.
*/
- if (parity_errors < rm->rm_firstdatacol - 1 ||
+ if (parity_errors < rm->rm_firstdatacol - n ||
(zio->io_flags & ZIO_FLAG_RESILVER)) {
n = raidz_parity_verify(zio, rm);
unexpected_errors += n;
goto done;
}
- break;
-
- case 2:
- /*
- * Two data column errors require double parity.
- */
- ASSERT(rm->rm_firstdatacol == 2);
-
- /*
- * Find the two columns that reported errors.
- */
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- if (rc->rc_error != 0)
- break;
- }
- ASSERT(c != rm->rm_cols);
- ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
- rc->rc_error == ESTALE);
-
- for (c1 = c++; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- if (rc->rc_error != 0)
- break;
- }
- ASSERT(c != rm->rm_cols);
- ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
- rc->rc_error == ESTALE);
-
- vdev_raidz_reconstruct_pq(rm, c1, c);
-
- if (zio_checksum_error(zio) == 0) {
- atomic_inc_64(&raidz_corrected_pq);
- goto done;
- }
- break;
-
- default:
- ASSERT(rm->rm_firstdatacol <= 2);
- ASSERT(0);
}
}
* errors we detected, and we've attempted to read all columns. There
* must, therefore, be one or more additional problems -- silent errors
* resulting in invalid data rather than explicit I/O errors resulting
- * in absent data. Before we attempt combinatorial reconstruction make
- * sure we have a chance of coming up with the right answer.
+ * in absent data. We check if there is enough additional data to
+ * possibly reconstruct the data and then perform combinatorial
+ * reconstruction over all possible combinations. If that fails,
+ * we're cooked.
*/
if (total_errors >= rm->rm_firstdatacol) {
zio->io_error = vdev_raidz_worst_error(rm);
*/
if (total_errors == rm->rm_firstdatacol)
zio->io_error = zio_worst_error(zio->io_error, ECKSUM);
- goto done;
- }
- if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
+ } else if ((code = vdev_raidz_combrec(zio, total_errors,
+ data_errors)) != 0) {
/*
- * Attempt to reconstruct the data from parity P.
+ * If we didn't use all the available parity for the
+ * combinatorial reconstruction, verify that the remaining
+ * parity is correct.
*/
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- void *orig;
- rc = &rm->rm_col[c];
-
- orig = zio_buf_alloc(rc->rc_size);
- bcopy(rc->rc_data, orig, rc->rc_size);
- vdev_raidz_reconstruct_p(rm, c);
-
- if (zio_checksum_error(zio) == 0) {
- zio_buf_free(orig, rc->rc_size);
- atomic_inc_64(&raidz_corrected_p);
-
- /*
- * If this child didn't know that it returned
- * bad data, inform it.
- */
- if (rc->rc_tried && rc->rc_error == 0)
- raidz_checksum_error(zio, rc);
- rc->rc_error = ECKSUM;
- goto done;
- }
-
- bcopy(orig, rc->rc_data, rc->rc_size);
- zio_buf_free(orig, rc->rc_size);
- }
- }
-
- if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
+ if (code != (1 << rm->rm_firstdatacol) - 1)
+ (void) raidz_parity_verify(zio, rm);
+ } else {
/*
- * Attempt to reconstruct the data from parity Q.
+ * All combinations failed to checksum. Generate checksum
+ * ereports for all children.
*/
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- void *orig;
- rc = &rm->rm_col[c];
-
- orig = zio_buf_alloc(rc->rc_size);
- bcopy(rc->rc_data, orig, rc->rc_size);
- vdev_raidz_reconstruct_q(rm, c);
-
- if (zio_checksum_error(zio) == 0) {
- zio_buf_free(orig, rc->rc_size);
- atomic_inc_64(&raidz_corrected_q);
-
- /*
- * If this child didn't know that it returned
- * bad data, inform it.
- */
- if (rc->rc_tried && rc->rc_error == 0)
- raidz_checksum_error(zio, rc);
- rc->rc_error = ECKSUM;
- goto done;
- }
-
- bcopy(orig, rc->rc_data, rc->rc_size);
- zio_buf_free(orig, rc->rc_size);
- }
- }
+ zio->io_error = ECKSUM;
- if (rm->rm_firstdatacol > 1 &&
- rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
- rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
- /*
- * Attempt to reconstruct the data from both P and Q.
- */
- for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
- void *orig, *orig1;
- rc = &rm->rm_col[c];
-
- orig = zio_buf_alloc(rc->rc_size);
- bcopy(rc->rc_data, orig, rc->rc_size);
-
- for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
- rc1 = &rm->rm_col[c1];
-
- orig1 = zio_buf_alloc(rc1->rc_size);
- bcopy(rc1->rc_data, orig1, rc1->rc_size);
-
- vdev_raidz_reconstruct_pq(rm, c, c1);
-
- if (zio_checksum_error(zio) == 0) {
- zio_buf_free(orig, rc->rc_size);
- zio_buf_free(orig1, rc1->rc_size);
- atomic_inc_64(&raidz_corrected_pq);
-
- /*
- * If these children didn't know they
- * returned bad data, inform them.
- */
- if (rc->rc_tried && rc->rc_error == 0)
- raidz_checksum_error(zio, rc);
- if (rc1->rc_tried && rc1->rc_error == 0)
- raidz_checksum_error(zio, rc1);
-
- rc->rc_error = ECKSUM;
- rc1->rc_error = ECKSUM;
-
- goto done;
- }
-
- bcopy(orig1, rc1->rc_data, rc1->rc_size);
- zio_buf_free(orig1, rc1->rc_size);
+ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ for (c = 0; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
+ zio->io_spa, vd->vdev_child[rc->rc_devidx],
+ zio, rc->rc_offset, rc->rc_size);
}
-
- bcopy(orig, rc->rc_data, rc->rc_size);
- zio_buf_free(orig, rc->rc_size);
- }
- }
-
- /*
- * All combinations failed to checksum. Generate checksum ereports for
- * all children.
- */
- zio->io_error = ECKSUM;
-
- if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
- zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
- rc->rc_offset, rc->rc_size);
}
}
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
static int
vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
{
- int c;
int lasterror = 0;
int numerrors = 0;
return (EINVAL);
}
- for (c = 0; c < vd->vdev_children; c++) {
+ vdev_open_children(vd);
+
+ for (int c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
- int error;
- if ((error = vdev_open(cvd)) != 0 &&
- !cvd->vdev_islog) {
- lasterror = error;
+ if (cvd->vdev_open_error && !cvd->vdev_islog) {
+ lasterror = cvd->vdev_open_error;
numerrors++;
- continue;
}
}
static void
vdev_root_close(vdev_t *vd)
{
- int c;
-
- for (c = 0; c < vd->vdev_children; c++)
+ for (int c = 0; c < vd->vdev_children; c++)
vdev_close(vd->vdev_child[c]);
}
int
zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
- uint64_t *towrite, uint64_t *tooverwrite, uint64_t dn_datablkshift)
+ uint64_t *towrite, uint64_t *tooverwrite)
{
zap_t *zap;
int err = 0;
*towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
}
} else {
- if (!add) {
- if (dmu_buf_freeable(zap->zap_dbuf))
- *tooverwrite += SPA_MAXBLOCKSIZE;
- else
- *towrite += SPA_MAXBLOCKSIZE;
- } else {
- /*
- * We are here if we are adding and (name != NULL).
- * It is hard to find out if this add will promote this
- * microzap to fatzap. Hence, we assume the worst case
- * and account for the blocks assuming this microzap
- * would be promoted to a fatzap.
- *
- * 1 block overwritten : header block
- * 4 new blocks written : 2 new split leaf, 2 grown
- * ptrtbl blocks
- */
- if (dmu_buf_freeable(zap->zap_dbuf))
- *tooverwrite += 1 << dn_datablkshift;
- else
- *towrite += 1 << dn_datablkshift;
- *towrite += 4 << dn_datablkshift;
+ /*
+ * We are here if (name != NULL) and this is a micro-zap.
+ * We account for the header block depending on whether it
+ * is freeable.
+ *
+ * Incase of an add-operation it is hard to find out
+ * if this add will promote this microzap to fatzap.
+ * Hence, we consider the worst case and account for the
+ * blocks assuming this microzap would be promoted to a
+ * fatzap.
+ *
+ * 1 block overwritten : header block
+ * 4 new blocks written : 2 new split leaf, 2 grown
+ * ptrtbl blocks
+ */
+ if (dmu_buf_freeable(zap->zap_dbuf))
+ *tooverwrite += SPA_MAXBLOCKSIZE;
+ else
+ *towrite += SPA_MAXBLOCKSIZE;
+
+ if (add) {
+ *towrite += 4 * SPA_MAXBLOCKSIZE;
}
}
#define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\
ZFS_ACL_OBJ_ACE)
+#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH)
+
static uint16_t
zfs_ace_v0_get_type(void *acep)
{
uint64_t who;
uint16_t iflags, type;
uint32_t access_mask;
+ boolean_t an_exec_denied = B_FALSE;
mode = (zp->z_phys->zp_mode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
}
}
}
+ } else {
+ /*
+ * Only care if this IDENTIFIER_GROUP or
+ * USER ACE denies execute access to someone,
+ * mode is not affected
+ */
+ if ((access_mask & ACE_EXECUTE) && type == DENY)
+ an_exec_denied = B_TRUE;
}
}
+
+ /*
+ * Failure to allow is effectively a deny, so execute permission
+ * is denied if it was never mentioned or if we explicitly
+ * weren't allowed it.
+ */
+ if (!an_exec_denied &&
+ ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS ||
+ (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS))
+ an_exec_denied = B_TRUE;
+
+ if (an_exec_denied)
+ zp->z_phys->zp_flags &= ~ZFS_NO_EXECS_DENIED;
+ else
+ zp->z_phys->zp_flags |= ZFS_NO_EXECS_DENIED;
+
return (mode);
}
}
/*
- * Read an external acl object.
+ * Read an external acl object. If the intent is to modify, always
+ * create a new acl and leave any cached acl in place.
*/
static int
zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+ if (zp->z_acl_cached && !will_modify) {
+ *aclpp = zp->z_acl_cached;
+ return (0);
+ }
+
if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) {
*aclpp = zfs_acl_node_read_internal(zp, will_modify);
+ if (!will_modify)
+ zp->z_acl_cached = *aclpp;
return (0);
}
}
*aclpp = aclp;
+ if (!will_modify)
+ zp->z_acl_cached = aclp;
return (0);
}
dmu_buf_will_dirty(zp->z_dbuf, tx);
+ if (zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+
zphys->zp_mode = zfs_mode_compute(zp, aclp);
/*
- * Decide which opbject type to use. If we are forced to
- * use old ACL format than transform ACL into zfs_oldace_t
+ * Decide which object type to use. If we are forced to
+ * use old ACL format then transform ACL into zfs_oldace_t
* layout.
*/
if (!zfsvfs->z_use_fuids) {
mutex_exit(&dzp->z_acl_lock);
acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
vap->va_type, paclp, acl_ids->z_mode, &need_chmod);
- zfs_acl_free(paclp);
} else {
acl_ids->z_aclp =
zfs_acl_alloc(zfs_acl_version_zp(dzp));
mutex_exit(&zp->z_acl_lock);
- zfs_acl_free(aclp);
-
return (0);
}
aclp->z_hints |= (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS);
}
top:
- if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) {
- zfs_acl_free(aclp);
- return (error);
- }
-
mutex_enter(&zp->z_lock);
mutex_enter(&zp->z_acl_lock);
error = zfs_aclset_common(zp, aclp, cr, tx);
ASSERT(error == 0);
+ zp->z_acl_cached = aclp;
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);
if (fuidp)
zfs_fuid_info_free(fuidp);
- zfs_acl_free(aclp);
dmu_tx_commit(tx);
done:
mutex_exit(&zp->z_acl_lock);
checkit = B_TRUE;
break;
} else {
- zfs_acl_free(aclp);
mutex_exit(&zp->z_acl_lock);
return (EIO);
}
uint32_t, mask_matched);
if (anyaccess) {
mutex_exit(&zp->z_acl_lock);
- zfs_acl_free(aclp);
return (0);
}
}
}
mutex_exit(&zp->z_acl_lock);
- zfs_acl_free(aclp);
/* Put the found 'denies' back on the working mode */
if (deny_mask) {
secpolicy_vnode_access(cr, ZTOV(zp), owner, VREAD) == 0 ||
secpolicy_vnode_access(cr, ZTOV(zp), owner, VWRITE) == 0 ||
secpolicy_vnode_access(cr, ZTOV(zp), owner, VEXEC) == 0 ||
- secpolicy_vnode_chown(cr, B_TRUE) == 0 ||
- secpolicy_vnode_chown(cr, B_FALSE) == 0 ||
+ secpolicy_vnode_chown(cr, owner) == 0 ||
secpolicy_vnode_setdac(cr, owner) == 0 ||
secpolicy_vnode_remove(cr) == 0);
}
check_privs, B_FALSE, cr));
}
+int
+zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
+{
+ boolean_t owner = B_FALSE;
+ boolean_t groupmbr = B_FALSE;
+ boolean_t is_attr;
+ uid_t fowner;
+ uid_t gowner;
+ uid_t uid = crgetuid(cr);
+ int error;
+
+ if (zdp->z_phys->zp_flags & ZFS_AV_QUARANTINED)
+ return (EACCES);
+
+ is_attr = ((zdp->z_phys->zp_flags & ZFS_XATTR) &&
+ (ZTOV(zdp)->v_type == VDIR));
+ if (is_attr)
+ goto slow;
+
+ mutex_enter(&zdp->z_acl_lock);
+
+ if (zdp->z_phys->zp_flags & ZFS_NO_EXECS_DENIED) {
+ mutex_exit(&zdp->z_acl_lock);
+ return (0);
+ }
+
+ if (FUID_INDEX(zdp->z_phys->zp_uid) != 0 ||
+ FUID_INDEX(zdp->z_phys->zp_gid) != 0) {
+ mutex_exit(&zdp->z_acl_lock);
+ goto slow;
+ }
+
+ fowner = (uid_t)zdp->z_phys->zp_uid;
+ gowner = (uid_t)zdp->z_phys->zp_gid;
+
+ if (uid == fowner) {
+ owner = B_TRUE;
+ if (zdp->z_phys->zp_mode & S_IXUSR) {
+ mutex_exit(&zdp->z_acl_lock);
+ return (0);
+ } else {
+ mutex_exit(&zdp->z_acl_lock);
+ goto slow;
+ }
+ }
+ if (groupmember(gowner, cr)) {
+ groupmbr = B_TRUE;
+ if (zdp->z_phys->zp_mode & S_IXGRP) {
+ mutex_exit(&zdp->z_acl_lock);
+ return (0);
+ } else {
+ mutex_exit(&zdp->z_acl_lock);
+ goto slow;
+ }
+ }
+ if (!owner && !groupmbr) {
+ if (zdp->z_phys->zp_mode & S_IXOTH) {
+ mutex_exit(&zdp->z_acl_lock);
+ return (0);
+ }
+ }
+
+ mutex_exit(&zdp->z_acl_lock);
+
+slow:
+ DTRACE_PROBE(zfs__fastpath__execute__access__miss);
+ ZFS_ENTER(zdp->z_zfsvfs);
+ error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr);
+ ZFS_EXIT(zdp->z_zfsvfs);
+ return (error);
+}
+
/*
* Determine whether Access should be granted/denied, invoking least
* priv subsytem when a deny is determined.
owner, checkmode);
if (error == 0 && (working_mode & ACE_WRITE_OWNER))
- error = secpolicy_vnode_chown(cr, B_TRUE);
+ error = secpolicy_vnode_chown(cr, owner);
if (error == 0 && (working_mode & ACE_WRITE_ACL))
error = secpolicy_vnode_setdac(cr, owner);
error = secpolicy_vnode_remove(cr);
if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) {
- error = secpolicy_vnode_chown(cr, B_FALSE);
+ error = secpolicy_vnode_chown(cr, owner);
}
if (error == 0) {
/*
if (err)
avl_add(&sdp->sd_snaps, sep);
else
- err = dmu_objset_destroy(snapname);
+ err = dmu_objset_destroy(snapname, B_FALSE);
} else {
err = ENOENT;
}
rw_exit(&zfsvfs->z_fuid_lock);
return (retidx);
} else {
+ rw_exit(&zfsvfs->z_fuid_lock);
return (-1);
}
}
return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, cr));
}
+static int
+zfs_secpolicy_hold(zfs_cmd_t *zc, cred_t *cr)
+{
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_HOLD, cr));
+}
+
+static int
+zfs_secpolicy_release(zfs_cmd_t *zc, cred_t *cr)
+{
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_RELEASE, cr));
+}
+
/*
* Returns the nvlist as specified by the user in the zfs_cmd_t.
*/
*/
if (error == 0) {
if ((error = zfs_set_prop_nvlist(zc->zc_name, nvprops)) != 0)
- (void) dmu_objset_destroy(zc->zc_name);
+ (void) dmu_objset_destroy(zc->zc_name, B_FALSE);
}
nvlist_free(nvprops);
return (error);
/*
* inputs:
- * zc_name name of filesystem
- * zc_value short name of snapshot
+ * zc_name name of filesystem
+ * zc_value short name of snapshot
+ * zc_defer_destroy mark for deferred destroy
*
* outputs: none
*/
zfs_unmount_snap, zc->zc_value, DS_FIND_CHILDREN);
if (err)
return (err);
- return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value));
+ return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value,
+ zc->zc_defer_destroy));
}
/*
* inputs:
* zc_name name of dataset to destroy
* zc_objset_type type of objset
+ * zc_defer_destroy mark for deferred destroy
*
* outputs: none
*/
return (err);
}
- return (dmu_objset_destroy(zc->zc_name));
+ return (dmu_objset_destroy(zc->zc_name, zc->zc_defer_destroy));
}
/*
file_t *fp;
objset_t *os;
dmu_recv_cookie_t drc;
- zfsvfs_t *zfsvfs = NULL;
boolean_t force = (boolean_t)zc->zc_guid;
int error, fd;
offset_t off;
return (EBADF);
}
- if (getzfsvfs(tofs, &zfsvfs) == 0) {
- if (!mutex_tryenter(&zfsvfs->z_online_recv_lock)) {
- VFS_RELE(zfsvfs->z_vfs);
- zfsvfs = NULL;
- error = EBUSY;
- goto out;
- }
+ if (props && dmu_objset_open(tofs, DMU_OST_ANY,
+ DS_MODE_USER | DS_MODE_READONLY, &os) == 0) {
/*
* If new properties are supplied, they are to completely
* replace the existing ones, so stash away the existing ones.
*/
- if (props)
- (void) dsl_prop_get_all(zfsvfs->z_os, &origprops, TRUE);
- } else if (props && dmu_objset_open(tofs, DMU_OST_ANY,
- DS_MODE_USER | DS_MODE_READONLY, &os) == 0) {
- /*
- * Get the props even if there was no zfsvfs (zvol or
- * unmounted zpl).
- */
(void) dsl_prop_get_all(os, &origprops, TRUE);
dmu_objset_close(os);
}
error = dmu_recv_begin(tofs, tosnap, &zc->zc_begin_record,
- force, origin, zfsvfs != NULL, &drc);
+ force, origin, &drc);
if (origin)
dmu_objset_close(origin);
if (error)
off = fp->f_offset;
error = dmu_recv_stream(&drc, fp->f_vnode, &off);
- if (error == 0 && zfsvfs) {
- char *osname;
- int mode;
+ if (error == 0) {
+ zfsvfs_t *zfsvfs = NULL;
- /* online recv */
- osname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
- error = zfs_suspend_fs(zfsvfs, osname, &mode);
- if (error == 0) {
- int resume_err;
+ if (getzfsvfs(tofs, &zfsvfs) == 0) {
+ /* online recv */
+ int end_err;
+ char *osname;
+ int mode;
- error = dmu_recv_end(&drc);
- resume_err = zfs_resume_fs(zfsvfs, osname, mode);
- error = error ? error : resume_err;
+ osname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+ error = zfs_suspend_fs(zfsvfs, osname, &mode);
+ /*
+ * If the suspend fails, then the recv_end will
+ * likely also fail, and clean up after itself.
+ */
+ end_err = dmu_recv_end(&drc);
+ if (error == 0) {
+ int resume_err =
+ zfs_resume_fs(zfsvfs, osname, mode);
+ error = error ? error : resume_err;
+ }
+ error = error ? error : end_err;
+ VFS_RELE(zfsvfs->z_vfs);
+ kmem_free(osname, MAXNAMELEN);
} else {
- dmu_recv_abort_cleanup(&drc);
+ error = dmu_recv_end(&drc);
}
- kmem_free(osname, MAXNAMELEN);
- } else if (error == 0) {
- error = dmu_recv_end(&drc);
}
zc->zc_cookie = off - fp->f_offset;
(void) zfs_set_prop_nvlist(tofs, origprops);
}
out:
- if (zfsvfs) {
- mutex_exit(&zfsvfs->z_online_recv_lock);
- VFS_RELE(zfsvfs->z_vfs);
- }
nvlist_free(props);
nvlist_free(origprops);
releasef(fd);
return (error);
}
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_value short name of snap
+ * zc_string user-supplied tag for this reference
+ * zc_cookie recursive flag
+ *
+ * outputs: none
+ */
+static int
+zfs_ioc_hold(zfs_cmd_t *zc)
+{
+ boolean_t recursive = zc->zc_cookie;
+
+ if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
+ return (EINVAL);
+
+ return (dsl_dataset_user_hold(zc->zc_name, zc->zc_value,
+ zc->zc_string, recursive));
+}
+
+/*
+ * inputs:
+ * zc_name name of dataset from which we're releasing a user reference
+ * zc_value short name of snap
+ * zc_string user-supplied tag for this reference
+ * zc_cookie recursive flag
+ *
+ * outputs: none
+ */
+static int
+zfs_ioc_release(zfs_cmd_t *zc)
+{
+ boolean_t recursive = zc->zc_cookie;
+
+ if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
+ return (EINVAL);
+
+ return (dsl_dataset_user_release(zc->zc_name, zc->zc_value,
+ zc->zc_string, recursive));
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ *
+ * outputs:
+ * zc_nvlist_src{_size} nvlist of snapshot holds
+ */
+static int
+zfs_ioc_get_holds(zfs_cmd_t *zc)
+{
+ nvlist_t *nvp;
+ int error;
+
+ if ((error = dsl_dataset_get_holds(zc->zc_name, &nvp)) == 0) {
+ error = put_nvlist(zc, nvp);
+ nvlist_free(nvp);
+ }
+
+ return (error);
+}
+
/*
* pool create, destroy, and export don't log the history as part of
* zfsdev_ioctl, but rather zfs_ioc_pool_create, and zfs_ioc_pool_export
B_TRUE },
{ zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE,
B_FALSE },
- { zfs_ioc_obj_to_path, zfs_secpolicy_config, NO_NAME, B_FALSE,
- B_FALSE },
+ { zfs_ioc_obj_to_path, zfs_secpolicy_config, DATASET_NAME, B_FALSE,
+ B_TRUE },
{ zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE,
B_TRUE },
{ zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE,
DATASET_NAME, B_FALSE, B_FALSE },
{ zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade,
DATASET_NAME, B_FALSE, B_TRUE },
+ { zfs_ioc_hold, zfs_secpolicy_hold, DATASET_NAME, B_TRUE, B_TRUE },
+ { zfs_ioc_release, zfs_secpolicy_release, DATASET_NAME, B_TRUE,
+ B_TRUE },
+ { zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
+ B_TRUE }
};
int
goto out;
mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
offsetof(znode_t, z_link_node));
zfs_fuid_destroy(zfsvfs);
mutex_destroy(&zfsvfs->z_znodes_lock);
- mutex_destroy(&zfsvfs->z_online_recv_lock);
mutex_destroy(&zfsvfs->z_lock);
list_destroy(&zfsvfs->z_all_znodes);
rrw_destroy(&zfsvfs->z_teardown_lock);
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ /*
+ * Clean up any locks held by this process on the vp.
+ */
+ cleanlocks(vp, ddi_get_pid(), 0);
+ cleanshares(vp, ddi_get_pid());
+
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
if ((flag & (FSYNC | FDSYNC)) && (count == 1))
atomic_dec_32(&zp->z_sync_cnt);
- /*
- * Clean up any locks held by this process on the vp.
- */
- cleanlocks(vp, ddi_get_pid(), 0);
- cleanshares(vp, ddi_get_pid());
-
if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
ZTOV(zp)->v_type == VREG &&
!(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
kmem_free(zgd, sizeof (zgd_t));
}
+#ifdef DEBUG
+static int zil_fault_io = 0;
+#endif
+
/*
* Get data to generate a TX_WRITE intent log record.
*/
zgd->zgd_rl = rl;
zgd->zgd_zilog = zfsvfs->z_log;
zgd->zgd_bp = &lr->lr_blkptr;
- VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
+#ifdef DEBUG
+ if (zil_fault_io) {
+ error = EIO;
+ zil_fault_io = 0;
+ } else {
+ error = dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db);
+ }
+#else
+ error = dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db);
+#endif
+ if (error != 0) {
+ kmem_free(zgd, sizeof (zgd_t));
+ goto out;
+ }
+
ASSERT(boff == db->db_offset);
lr->lr_blkoff = off - boff;
error = dmu_sync(zio, db, &lr->lr_blkptr,
return (error);
}
+/*
+ * If vnode is for a device return a specfs vnode instead.
+ */
+static int
+specvp_check(vnode_t **vpp, cred_t *cr)
+{
+ int error = 0;
+
+ if (IS_DEVVP(*vpp)) {
+ struct vnode *svp;
+
+ svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
+ VN_RELE(*vpp);
+ if (svp == NULL)
+ error = ENOSYS;
+ *vpp = svp;
+ }
+ return (error);
+}
+
+
/*
* Lookup an entry in a directory, or an extended attribute directory.
* If it exists, return a held vnode reference for it.
{
znode_t *zdp = VTOZ(dvp);
zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
- int error;
+ int error = 0;
+
+ /* fast path */
+ if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
+
+ if (dvp->v_type != VDIR) {
+ return (ENOTDIR);
+ } else if (zdp->z_dbuf == NULL) {
+ return (EIO);
+ }
+
+ if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
+ error = zfs_fastaccesschk_execute(zdp, cr);
+ if (!error) {
+ *vpp = dvp;
+ VN_HOLD(*vpp);
+ return (0);
+ }
+ return (error);
+ } else {
+ vnode_t *tvp = dnlc_lookup(dvp, nm);
+
+ if (tvp) {
+ error = zfs_fastaccesschk_execute(zdp, cr);
+ if (error) {
+ VN_RELE(tvp);
+ return (error);
+ }
+ if (tvp == DNLC_NO_VNODE) {
+ VN_RELE(tvp);
+ return (ENOENT);
+ } else {
+ *vpp = tvp;
+ return (specvp_check(vpp, cr));
+ }
+ }
+ }
+ }
+
+ DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zdp);
}
error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
- if (error == 0) {
- /*
- * Convert device special files
- */
- if (IS_DEVVP(*vpp)) {
- vnode_t *svp;
-
- svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
- VN_RELE(*vpp);
- if (svp == NULL)
- error = ENOSYS;
- else
- *vpp = svp;
- }
- }
+ if (error == 0)
+ error = specvp_check(vpp, cr);
ZFS_EXIT(zfsvfs);
return (error);
&acl_ids)) != 0)
goto out;
if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+ zfs_acl_ids_free(&acl_ids);
error = EDQUOT;
goto out;
}
VN_RELE(ZTOV(zp));
} else {
*vpp = ZTOV(zp);
- /*
- * If vnode is for a device return a specfs vnode instead.
- */
- if (IS_DEVVP(*vpp)) {
- struct vnode *svp;
-
- svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
- VN_RELE(*vpp);
- if (svp == NULL) {
- error = ENOSYS;
- }
- *vpp = svp;
- }
+ error = specvp_check(vpp, cr);
}
ZFS_EXIT(zfsvfs);
return (error);
}
if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+ zfs_acl_ids_free(&acl_ids);
zfs_dirent_unlock(dl);
ZFS_EXIT(zfsvfs);
return (EDQUOT);
top:
attrzp = NULL;
+ /* Can this be moved to before the top label? */
if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
ZFS_EXIT(zfsvfs);
return (EROFS);
zp->z_phys->zp_mode = new_mode;
err = zfs_aclset_common(zp, aclp, cr, tx);
ASSERT3U(err, ==, 0);
+ zp->z_acl_cached = aclp;
+ aclp = NULL;
mutex_exit(&zp->z_acl_lock);
}
if (attrzp)
VN_RELE(ZTOV(attrzp));
- if (aclp) {
+ if (aclp)
zfs_acl_free(aclp);
- aclp = NULL;
- }
if (fuidp) {
zfs_fuid_info_free(fuidp);
if (err == 0) {
zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
- dmu_tx_commit(tx);
}
+ dmu_tx_commit(tx);
out:
pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
zp->z_dbuf = NULL;
zp->z_dirlocks = NULL;
+ zp->z_acl_cached = NULL;
return (0);
}
ASSERT(zp->z_dbuf == NULL);
ASSERT(zp->z_dirlocks == NULL);
+ ASSERT(zp->z_acl_cached == NULL);
}
#ifdef ZNODE_STATS
nzp->z_phys = ozp->z_phys;
nzp->z_dbuf = ozp->z_dbuf;
+ /*
+ * Release any cached ACL, since it *may* have
+ * zfs_acl_node_t's that directly references an
+ * embedded ACL in the zp_acl of the old znode_phys_t
+ *
+ * It will be recached the next time the ACL is needed.
+ */
+ if (ozp->z_acl_cached) {
+ zfs_acl_free(ozp->z_acl_cached);
+ ozp->z_acl_cached = NULL;
+ }
+
/* Update back pointers. */
(void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys,
znode_evict_error);
list_remove(&zfsvfs->z_all_znodes, zp);
mutex_exit(&zfsvfs->z_znodes_lock);
+ if (zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+
kmem_cache_free(znode_cache, zp);
VFS_RELE(zfsvfs->z_vfs);
lwb->lwb_buf = NULL;
if (zio->io_error)
zilog->zl_log_error = B_TRUE;
- mutex_exit(&zilog->zl_lock);
/*
* Now that we've written this log block, we have a stable pointer
* to the next block in the chain, so it's OK to let the txg in
- * which we allocated the next block sync.
+ * which we allocated the next block sync. We still have the
+ * zl_lock to ensure zil_sync doesn't kmem free the lwb.
*/
txg_rele_to_sync(&lwb->lwb_txgh);
+ mutex_exit(&zilog->zl_lock);
}
/*
}
error = zilog->zl_get_data(
itx->itx_private, lr, dbuf, lwb->lwb_zio);
+ if (error == EIO) {
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+ return (lwb);
+ }
if (error) {
ASSERT(error == ENOENT || error == EEXIST ||
error == EALREADY);