Add support for nvme based devids
diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c
index fb3525848c9449326ebae2dad169084d18c4ac44..9dbf20795ebf47d03ac18fde1691ca2be3168164 100644
--- a/lib/libzfs/libzfs_import.c
+++ b/lib/libzfs/libzfs_import.c
@@ -21,7 +21,7 @@
 /*
  * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright 2015 RackTop Systems.
  * Copyright (c) 2016, Intel Corporation.
  */
@@ -64,6 +64,7 @@
 #include <blkid/blkid.h>
 #include "libzfs.h"
 #include "libzfs_impl.h"
+#include <libzfs.h>
 
 /*
  * Intermediate structures used to gather configuration information.
@@ -116,10 +117,10 @@ typedef struct vdev_dev_strs {
 /*
  * Obtain the persistent device id string (describes what)
  *
- * used by ZED auto-{online,expand,replace}
+ * used by ZED vdev matching for auto-{online,expand,replace}
  */
-static int
-udev_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
+int
+zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
 {
        struct udev_list_entry *entry;
        const char *bus;
@@ -141,7 +142,18 @@ udev_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
                        (void) snprintf(bufptr, buflen, "dm-uuid-%s", dm_uuid);
                        return (0);
                }
-               return (ENODATA);
+
+               /*
+                * NVME 'by-id' symlinks are similar to the bus case
+                */
+               struct udev_device *parent;
+
+               parent = udev_device_get_parent_with_subsystem_devtype(dev,
+                   "nvme", NULL);
+               if (parent != NULL)
+                       bus = "nvme";   /* continue with bus symlink search */
+               else
+                       return (ENODATA);
        }
 
        /*
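
The new branch covers NVMe namespaces, whose udev properties carry no ID_BUS: it probes for a parent device in the "nvme" subsystem and, if one exists, reuses the ordinary bus symlink search. A minimal standalone sketch of that probe, assuming libudev and a hypothetical nvme0n1 block device (link with -ludev):

#include <stdio.h>
#include <libudev.h>

int
main(void)
{
	struct udev *udev = udev_new();
	struct udev_device *dev, *parent;

	if (udev == NULL)
		return (1);

	dev = udev_device_new_from_subsystem_sysname(udev, "block",
	    "nvme0n1");	/* hypothetical device name */
	if (dev != NULL) {
		/* same probe the patch performs when ID_BUS is unset */
		parent = udev_device_get_parent_with_subsystem_devtype(dev,
		    "nvme", NULL);
		(void) printf("nvme parent: %s\n",
		    parent != NULL ? "found" : "none");
		udev_device_unref(dev);	/* also releases the parent ref */
	}
	udev_unref(udev);
	return (0);
}
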
@@ -167,27 +179,39 @@ udev_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
 /*
  * Obtain the persistent physical location string (describes where)
  *
- * used by ZED auto-{online,expand,replace}
+ * used by ZED vdev matching for auto-{online,expand,replace}
  */
-static int
-udev_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
+int
+zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
 {
-       const char *physpath, *value;
+       const char *physpath = NULL;
 
        /*
-        * Skip indirect multipath device nodes
+        * Normal disks use ID_PATH for their physical path.  Device mapper
+        * devices are virtual and don't have a physical path.  For them we
+        * use ID_VDEV instead, which is set up via the /etc/vdev_id.conf file.
+        * ID_VDEV provides a persistent path to a virtual device.  If you
+        * don't have vdev_id.conf set up, you cannot use multipath autoreplace.
         */
-       value = udev_device_get_property_value(dev, "DM_MULTIPATH_DEVICE_PATH");
-       if (value != NULL && strcmp(value, "1") == 0)
-               return (ENODATA);  /* skip physical for multipath nodes */
-
-       physpath = udev_device_get_property_value(dev, "ID_PATH");
-       if (physpath != NULL && physpath[0] != '\0') {
-               (void) strlcpy(bufptr, physpath, buflen);
-               return (0);
+       if (!((physpath = udev_device_get_property_value(dev, "ID_PATH")) &&
+           physpath[0])) {
+               if (!((physpath =
+                   udev_device_get_property_value(dev, "ID_VDEV")) &&
+                   physpath[0])) {
+                       return (ENODATA);
+               }
        }
 
-       return (ENODATA);
+       (void) strlcpy(bufptr, physpath, buflen);
+
+       return (0);
+}
+
+boolean_t
+udev_is_mpath(struct udev_device *dev)
+{
+       return udev_device_get_property_value(dev, "DM_UUID") &&
+           udev_device_get_property_value(dev, "MPATH_SBIN_PATH");
 }
 
 /*
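
The rewritten lookup prefers udev's ID_PATH property and falls back to ID_VDEV, the persistent alias that the vdev_id udev helper derives from /etc/vdev_id.conf. The same fallback, flattened into a sketch for readability (a fragment; assumes <libudev.h>, <errno.h>, and <string.h> with strlcpy available, as in the surrounding code):

static int
get_physical_path(struct udev_device *dev, char *buf, size_t buflen)
{
	const char *p;

	/* physical slot path for real disks */
	p = udev_device_get_property_value(dev, "ID_PATH");
	if (p == NULL || p[0] == '\0') {
		/* virtual (e.g. multipath) devices: vdev_id.conf alias */
		p = udev_device_get_property_value(dev, "ID_VDEV");
	}
	if (p == NULL || p[0] == '\0')
		return (ENODATA);

	(void) strlcpy(buf, p, buflen);
	return (0);
}
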
@@ -200,15 +224,13 @@ udev_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
 static boolean_t
 udev_mpath_whole_disk(struct udev_device *dev)
 {
-       const char *devname, *mapname, *type, *uuid;
+       const char *devname, *type, *uuid;
 
        devname = udev_device_get_property_value(dev, "DEVNAME");
-       mapname = udev_device_get_property_value(dev, "DM_NAME");
        type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE");
        uuid = udev_device_get_property_value(dev, "DM_UUID");
 
        if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) &&
-           (mapname != NULL && strncmp(mapname, "mpath", 5) == 0) &&
            ((type == NULL) || (strcmp(type, "gpt") != 0)) &&
            (uuid != NULL)) {
                return (B_TRUE);
@@ -298,7 +320,7 @@ zpool_label_disk_wait(char *path, int timeout_ms)
                dev = udev_device_new_from_subsystem_sysname(udev,
                    "block", sysname);
                if ((dev != NULL) && udev_device_is_ready(dev)) {
-                       struct udev_list_entry *links, *link;
+                       struct udev_list_entry *links, *link = NULL;
 
                        ret = 0;
                        links = udev_device_get_devlinks_list_entry(dev);
@@ -394,12 +416,12 @@ encode_device_strings(const char *path, vdev_dev_strs_t *ds,
        if (!wholedisk && !udev_mpath_whole_disk(dev))
                goto no_dev;
 
-       ret = udev_device_get_devid(dev, ds->vds_devid, sizeof (ds->vds_devid));
+       ret = zfs_device_get_devid(dev, ds->vds_devid, sizeof (ds->vds_devid));
        if (ret != 0)
                goto no_dev_ref;
 
        /* physical location string (optional) */
-       if (udev_device_get_physical(dev, ds->vds_devphys,
+       if (zfs_device_get_physical(dev, ds->vds_devphys,
            sizeof (ds->vds_devphys)) != 0) {
                ds->vds_devphys[0] = '\0'; /* empty string --> not available */
        }
@@ -427,6 +449,10 @@ no_dev:
  *
  * multipath device node example:
  *     devid:          'dm-uuid-mpath-35000c5006304de3f'
+ *
+ * We also store the enclosure sysfs path for turning on enclosure LEDs
+ * (if applicable):
+ *     vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4'
  */
 void
 update_vdev_config_dev_strs(nvlist_t *nv)
@@ -434,6 +460,7 @@ update_vdev_config_dev_strs(nvlist_t *nv)
        vdev_dev_strs_t vds;
        char *env, *type, *path;
        uint64_t wholedisk = 0;
+       char *upath, *spath;
 
        /*
         * For the benefit of legacy ZFS implementations, allow
@@ -480,10 +507,23 @@ update_vdev_config_dev_strs(nvlist_t *nv)
                        (void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
                            vds.vds_devphys);
                }
+
+               /* Add enclosure sysfs path (if disk is in an enclosure) */
+               upath = zfs_get_underlying_path(path);
+               spath = zfs_get_enclosure_sysfs_path(upath);
+               if (spath)
+                       nvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
+                           spath);
+               else
+                       nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
+
+               free(upath);
+               free(spath);
        } else {
                /* clear out any stale entries */
                (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
                (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH);
+               (void) nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
        }
 }
 #else
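
ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH stores the slot directory under /sys/class/enclosure. A hedged sketch, not part of this change, of how a consumer such as ZED might drive the slot's fault LED through that path; "fault" and "locate" are standard attributes of the Linux enclosure class:

#include <stdio.h>

/* turn a slot's fault LED on or off via its enclosure sysfs path */
static int
set_fault_led(const char *enc_sysfs_path, int on)
{
	char path[4096];
	FILE *fp;

	(void) snprintf(path, sizeof (path), "%s/fault", enc_sysfs_path);
	if ((fp = fopen(path, "w")) == NULL)
		return (-1);
	(void) fprintf(fp, "%d", on ? 1 : 0);
	(void) fclose(fp);
	return (0);
}
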
@@ -641,11 +681,14 @@ add_config(libzfs_handle_t *hdl, pool_list_t *pl, const char *path,
            &state) == 0 &&
            (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE) &&
            nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) == 0) {
-               if ((ne = zfs_alloc(hdl, sizeof (name_entry_t))) == NULL)
+               if ((ne = zfs_alloc(hdl, sizeof (name_entry_t))) == NULL) {
+                       nvlist_free(config);
                        return (-1);
+               }
 
                if ((ne->ne_name = zfs_strdup(hdl, path)) == NULL) {
                        free(ne);
+                       nvlist_free(config);
                        return (-1);
                }
                ne->ne_guid = vdev_guid;
@@ -653,6 +696,7 @@ add_config(libzfs_handle_t *hdl, pool_list_t *pl, const char *path,
                ne->ne_num_labels = num_labels;
                ne->ne_next = pl->names;
                pl->names = ne;
+               nvlist_free(config);
                return (0);
        }
 
@@ -792,13 +836,14 @@ refresh_config(libzfs_handle_t *hdl, nvlist_t *config)
 {
        nvlist_t *nvl;
        zfs_cmd_t zc = {"\0"};
-       int err;
+       int err, dstbuf_size;
 
        if (zcmd_write_conf_nvlist(hdl, &zc, config) != 0)
                return (NULL);
 
-       if (zcmd_alloc_dst_nvlist(hdl, &zc,
-           zc.zc_nvlist_conf_size * 2) != 0) {
+       dstbuf_size = MAX(CONFIG_BUF_MINSIZE, zc.zc_nvlist_conf_size * 4);
+
+       if (zcmd_alloc_dst_nvlist(hdl, &zc, dstbuf_size) != 0) {
                zcmd_free_nvlists(&zc);
                return (NULL);
        }
@@ -1282,6 +1327,7 @@ zpool_read_label(int fd, nvlist_t **config, int *num_labels)
        vdev_label_t *label;
        nvlist_t *expected_config = NULL;
        uint64_t expected_guid = 0, size;
+       int error;
 
        *config = NULL;
 
@@ -1289,7 +1335,8 @@ zpool_read_label(int fd, nvlist_t **config, int *num_labels)
                return (0);
        size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);
 
-       if ((label = malloc(sizeof (vdev_label_t))) == NULL)
+       error = posix_memalign((void **)&label, PAGESIZE, sizeof (*label));
+       if (error)
                return (-1);
 
        for (l = 0; l < VDEV_LABELS; l++) {
@@ -1347,6 +1394,7 @@ typedef struct rdsk_node {
        char *rn_name;                  /* Full path to device */
        int rn_order;                   /* Preferred order (low to high) */
        int rn_num_labels;              /* Number of valid labels */
+       uint64_t rn_vdev_guid;          /* Expected vdev guid when set */
        libzfs_handle_t *rn_hdl;
        nvlist_t *rn_config;            /* Label config */
        avl_tree_t *rn_avl;
@@ -1355,39 +1403,29 @@ typedef struct rdsk_node {
        boolean_t rn_labelpaths;
 } rdsk_node_t;
 
+/*
+ * Sorted by vdev guid and full path to allow for multiple entries with
+ * the same full path name.  This is required because it's possible to
+ * have multiple block devices with labels that refer to the same
+ * ZPOOL_CONFIG_PATH yet have different vdev guids.  In this case both
+ * entries need to be added to the cache.  Scenarios where this can occur
+ * include overwritten pool labels, devices which are visible from multiple
+ * hosts and multipath devices.
+ */
 static int
 slice_cache_compare(const void *arg1, const void *arg2)
 {
        const char  *nm1 = ((rdsk_node_t *)arg1)->rn_name;
        const char  *nm2 = ((rdsk_node_t *)arg2)->rn_name;
-       char *nm1slice, *nm2slice;
+       uint64_t guid1 = ((rdsk_node_t *)arg1)->rn_vdev_guid;
+       uint64_t guid2 = ((rdsk_node_t *)arg2)->rn_vdev_guid;
        int rv;
 
-       /*
-        * partitions one and three (slices zero and two) are the most
-        * likely to provide results, so put those first
-        */
-       nm1slice = strstr(nm1, "part1");
-       nm2slice = strstr(nm2, "part1");
-       if (nm1slice && !nm2slice) {
-               return (-1);
-       }
-       if (!nm1slice && nm2slice) {
-               return (1);
-       }
-       nm1slice = strstr(nm1, "part3");
-       nm2slice = strstr(nm2, "part3");
-       if (nm1slice && !nm2slice) {
-               return (-1);
-       }
-       if (!nm1slice && nm2slice) {
-               return (1);
-       }
+       rv = AVL_CMP(guid1, guid2);
+       if (rv)
+               return (rv);
 
-       rv = strcmp(nm1, nm2);
-       if (rv == 0)
-               return (0);
-       return (rv > 0 ? 1 : -1);
+       return (AVL_ISIGN(strcmp(nm1, nm2)));
 }
 
 static boolean_t
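
AVL_CMP and AVL_ISIGN come from the ZFS sys/avl.h header and normalize comparisons into the -1/0/+1 values the AVL code expects. A self-contained sketch of the same composite (guid, name) ordering, with the macro definitions inlined:

#include <stdint.h>
#include <string.h>

/* as defined in the ZFS sys/avl.h header */
#define	CMP(a, b)	(((a) > (b)) - ((a) < (b)))	/* AVL_CMP */
#define	ISIGN(a)	(((a) > 0) - ((a) < 0))		/* AVL_ISIGN */

/* guid is the primary key; the full path breaks ties */
static int
composite_compare(uint64_t guid1, const char *nm1,
    uint64_t guid2, const char *nm2)
{
	int rv = CMP(guid1, guid2);

	if (rv != 0)
		return (rv);
	return (ISIGN(strcmp(nm1, nm2)));
}
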
@@ -1475,6 +1513,7 @@ zpool_open_func(void *arg)
        struct stat64 statbuf;
        nvlist_t *config;
        char *bname, *dupname;
+       uint64_t vdev_guid = 0;
        int error;
        int num_labels;
        int fd;
@@ -1500,19 +1539,28 @@ zpool_open_func(void *arg)
            (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode)))
                return;
 
-       if ((fd = open(rn->rn_name, O_RDONLY)) < 0)
+       /*
+        * Preferentially open using O_DIRECT to bypass the block device
+        * cache which may be stale for multipath devices.  An EINVAL errno
+        * indicates O_DIRECT is unsupported, so fall back to just O_RDONLY.
+        */
+       fd = open(rn->rn_name, O_RDONLY | O_DIRECT);
+       if ((fd < 0) && (errno == EINVAL))
+               fd = open(rn->rn_name, O_RDONLY);
+
+       if (fd < 0)
                return;
 
        /*
         * This file is too small to hold a zpool
         */
-       if (S_ISREG(statbuf.st_mode) &&
-           statbuf.st_size < SPA_MINDEVSIZE) {
+       if (S_ISREG(statbuf.st_mode) && statbuf.st_size < SPA_MINDEVSIZE) {
                (void) close(fd);
                return;
        }
 
-       if ((zpool_read_label(fd, &config, &num_labels)) != 0) {
+       error = zpool_read_label(fd, &config, &num_labels);
+       if (error != 0) {
                (void) close(fd);
                return;
        }
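
O_DIRECT reads bypass the page cache but require block-aligned buffers, offsets, and lengths, which is why the label buffer is now allocated with posix_memalign() earlier in this diff. A sketch combining both halves of the idiom; Linux semantics are assumed and size must be a multiple of the device block size:

#define	_GNU_SOURCE	/* O_DIRECT */
#include <errno.h>
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

static int
read_uncached(const char *path, size_t size)
{
	void *buf;
	int fd;

	/* EINVAL means the filesystem rejects O_DIRECT; retry cached */
	fd = open(path, O_RDONLY | O_DIRECT);
	if (fd < 0 && errno == EINVAL)
		fd = open(path, O_RDONLY);
	if (fd < 0)
		return (-1);

	/* page alignment satisfies O_DIRECT's buffer requirements */
	if (posix_memalign(&buf, 4096, size) != 0) {
		(void) close(fd);
		return (-1);
	}
	if (pread(fd, buf, size, 0) < 0) {
		free(buf);
		(void) close(fd);
		return (-1);
	}
	free(buf);
	(void) close(fd);
	return (0);
}
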
@@ -1523,6 +1571,18 @@ zpool_open_func(void *arg)
                return;
        }
 
+       /*
+        * Check that the vdev is for the expected guid.  Additional entries
+        * are speculatively added based on the paths stored in the labels.
+        * Entries with valid paths but incorrect guids must be removed.
+        */
+       error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
+       if (error || (rn->rn_vdev_guid && rn->rn_vdev_guid != vdev_guid)) {
+               (void) close(fd);
+               nvlist_free(config);
+               return;
+       }
+
        (void) close(fd);
 
        rn->rn_config = config;
@@ -1549,9 +1609,10 @@ zpool_open_func(void *arg)
                if (path != NULL) {
                        slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
                        slice->rn_name = zfs_strdup(hdl, path);
+                       slice->rn_vdev_guid = vdev_guid;
                        slice->rn_avl = rn->rn_avl;
                        slice->rn_hdl = hdl;
-                       slice->rn_order = 1;
+                       slice->rn_order = IMPORT_ORDER_PREFERRED_1;
                        slice->rn_labelpaths = B_FALSE;
                        mutex_enter(rn->rn_lock);
                        if (avl_find(rn->rn_avl, slice, &where)) {
@@ -1574,9 +1635,10 @@ zpool_open_func(void *arg)
                                return;
                        }
 
+                       slice->rn_vdev_guid = vdev_guid;
                        slice->rn_avl = rn->rn_avl;
                        slice->rn_hdl = hdl;
-                       slice->rn_order = 2;
+                       slice->rn_order = IMPORT_ORDER_PREFERRED_2;
                        slice->rn_labelpaths = B_FALSE;
                        mutex_enter(rn->rn_lock);
                        if (avl_find(rn->rn_avl, slice, &where)) {
@@ -1678,10 +1740,11 @@ zpool_find_import_scan(libzfs_handle_t *hdl, kmutex_t *lock,
                                free(slice);
                                continue;
                        }
+                       slice->rn_vdev_guid = 0;
                        slice->rn_lock = lock;
                        slice->rn_avl = cache;
                        slice->rn_hdl = hdl;
-                       slice->rn_order = i+1;
+                       slice->rn_order = i + IMPORT_ORDER_SCAN_OFFSET;
                        slice->rn_labelpaths = B_FALSE;
                        mutex_enter(lock);
                        avl_add(cache, slice);
@@ -1716,6 +1779,7 @@ zpool_find_import_blkid(libzfs_handle_t *hdl, kmutex_t *lock,
        blkid_cache cache;
        blkid_dev_iterate iter;
        blkid_dev dev;
+       avl_index_t where;
        int error;
 
        *slice_cache = NULL;
@@ -1750,13 +1814,25 @@ zpool_find_import_blkid(libzfs_handle_t *hdl, kmutex_t *lock,
        while (blkid_dev_next(iter, &dev) == 0) {
                slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
                slice->rn_name = zfs_strdup(hdl, blkid_dev_devname(dev));
+               slice->rn_vdev_guid = 0;
                slice->rn_lock = lock;
                slice->rn_avl = *slice_cache;
                slice->rn_hdl = hdl;
-               slice->rn_order = 100;
                slice->rn_labelpaths = B_TRUE;
+
+               error = zfs_path_order(slice->rn_name, &slice->rn_order);
+               if (error == 0)
+                       slice->rn_order += IMPORT_ORDER_SCAN_OFFSET;
+               else
+                       slice->rn_order = IMPORT_ORDER_DEFAULT;
+
                mutex_enter(lock);
-               avl_add(*slice_cache, slice);
+               if (avl_find(*slice_cache, slice, &where)) {
+                       free(slice->rn_name);
+                       free(slice);
+               } else {
+                       avl_insert(*slice_cache, slice, where);
+               }
                mutex_exit(lock);
        }
 
@@ -1850,14 +1926,29 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
                if (slice->rn_config != NULL) {
                        nvlist_t *config = slice->rn_config;
                        boolean_t matched = B_TRUE;
+                       boolean_t aux = B_FALSE;
+                       int fd;
+
+                       /*
+                        * Check if it's a spare or l2cache device. If it is,
+                        * we need to skip the name and guid check since they
+                        * don't exist on an aux device label.
+                        */
+                       if (iarg->poolname != NULL || iarg->guid != 0) {
+                               uint64_t state;
+                               aux = nvlist_lookup_uint64(config,
+                                   ZPOOL_CONFIG_POOL_STATE, &state) == 0 &&
+                                   (state == POOL_STATE_SPARE ||
+                                   state == POOL_STATE_L2CACHE);
+                       }
 
-                       if (iarg->poolname != NULL) {
+                       if (iarg->poolname != NULL && !aux) {
                                char *pname;
 
                                matched = nvlist_lookup_string(config,
                                    ZPOOL_CONFIG_POOL_NAME, &pname) == 0 &&
                                    strcmp(iarg->poolname, pname) == 0;
-                       } else if (iarg->guid != 0) {
+                       } else if (iarg->guid != 0 && !aux) {
                                uint64_t this_guid;
 
                                matched = nvlist_lookup_uint64(config,
@@ -1867,9 +1958,26 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
                        if (!matched) {
                                nvlist_free(config);
                        } else {
-                               add_config(hdl, &pools,
-                                   slice->rn_name, slice->rn_order,
-                                   slice->rn_num_labels, config);
+                               /*
+                                * Verify all remaining entries can be opened
+                                * exclusively. This will prune all underlying
+                                * multipath devices which otherwise could
+                                * result in the vdev appearing as UNAVAIL.
+                                *
+                                * Under zdb, this step isn't required and
+                                * would prevent a zdb -e of active pools with
+                                * no cachefile.
+                                */
+                               fd = open(slice->rn_name, O_RDONLY | O_EXCL);
+                               if (fd >= 0 || iarg->can_be_active) {
+                                       if (fd >= 0)
+                                               close(fd);
+                                       add_config(hdl, &pools,
+                                           slice->rn_name, slice->rn_order,
+                                           slice->rn_num_labels, config);
+                               } else {
+                                       nvlist_free(config);
+                               }
                        }
                }
                free(slice->rn_name);
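
On Linux, opening a block device with O_EXCL (without O_CREAT) fails with EBUSY while the kernel already claims it, as it does for path members of an active device-mapper multipath map or for mounted filesystems; that is what makes this a cheap in-use probe. The probe in isolation:

#include <fcntl.h>
#include <unistd.h>

/* returns 1 if the device can be claimed exclusively, 0 otherwise */
static int
can_open_excl(const char *path)
{
	int fd = open(path, O_RDONLY | O_EXCL);

	if (fd < 0)
		return (0);	/* EBUSY: claimed, e.g. by multipath */
	(void) close(fd);
	return (1);
}
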
@@ -2069,6 +2177,80 @@ zpool_search_import(libzfs_handle_t *hdl, importargs_t *import)
        return (zpool_find_import_impl(hdl, import));
 }
 
+static boolean_t
+pool_match(nvlist_t *cfg, char *tgt)
+{
+       uint64_t v, guid = strtoull(tgt, NULL, 0);
+       char *s;
+
+       if (guid != 0) {
+               if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0)
+                       return (v == guid);
+       } else {
+               if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0)
+                       return (strcmp(s, tgt) == 0);
+       }
+       return (B_FALSE);
+}
+
+int
+zpool_tryimport(libzfs_handle_t *hdl, char *target, nvlist_t **configp,
+    importargs_t *args)
+{
+       nvlist_t *pools;
+       nvlist_t *match = NULL;
+       nvlist_t *config = NULL;
+       char *name = NULL, *sepp = NULL;
+       char sep = '\0';
+       int count = 0;
+       char *targetdup = strdup(target);
+
+       *configp = NULL;
+
+       if ((sepp = strpbrk(targetdup, "/@")) != NULL) {
+               sep = *sepp;
+               *sepp = '\0';
+       }
+
+       pools = zpool_search_import(hdl, args);
+
+       if (pools != NULL) {
+               nvpair_t *elem = NULL;
+               while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
+                       VERIFY0(nvpair_value_nvlist(elem, &config));
+                       if (pool_match(config, targetdup)) {
+                               count++;
+                               if (match != NULL) {
+                                       /* multiple matches found */
+                                       continue;
+                               } else {
+                                       match = config;
+                                       name = nvpair_name(elem);
+                               }
+                       }
+               }
+       }
+
+       if (count == 0) {
+               (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                   "no pools found"));
+               free(targetdup);
+               return (ENOENT);
+       }
+
+       if (count > 1) {
+               (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                   "%d pools found, use pool GUID\n"), count);
+               free(targetdup);
+               return (EINVAL);
+       }
+
+       *configp = match;
+       free(targetdup);
+
+       return (0);
+}
+
 boolean_t
 find_guid(nvlist_t *nv, uint64_t guid)
 {
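
A hypothetical caller sketch for the new zpool_tryimport() interface; can_be_active is the same importargs_t flag consulted during the O_EXCL pruning above, and per pool_match() the target may be a pool name or a stringified GUID:

/* sketch: locate a pool's config by name or GUID without importing it */
static int
find_pool_config(libzfs_handle_t *hdl, char *target, nvlist_t **cfgp)
{
	importargs_t args = { 0 };

	args.can_be_active = B_TRUE;	/* zdb -e style: pool may be live */
	return (zpool_tryimport(hdl, target, cfgp, &args));
}
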
@@ -2126,7 +2308,7 @@ find_aux(zpool_handle_t *zhp, void *data)
 
 /*
  * Determines if the pool is in use.  If so, it returns true and the state of
- * the pool as well as the name of the pool.  Both strings are allocated and
+ * the pool as well as the name of the pool.  The name string is allocated and
  * must be freed by the caller.
  */
 int