granicus.if.org Git - zfs/commitdiff
Add support for autoexpand property
author: Brian Behlendorf <behlendorf1@llnl.gov>
Mon, 23 Jul 2018 22:40:15 +0000 (15:40 -0700)
committer: GitHub <noreply@github.com>
Mon, 23 Jul 2018 22:40:15 +0000 (15:40 -0700)
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure.  Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.

Auto-expand works as follows: when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev.  The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.

From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid.  If
a match is found the ZED reopens the pool vdev.  This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read.  Otherwise, it wouldn't
be possible to report the maximum possible expansion size.

Finally, if the autoexpand property is set to on, a vdev expansion
will be attempted.  After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated.  The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.

In order to make all of the above possible the following changes
were required:

* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
  These tests now create a pool which is layered on a loopback,
  scsi_debug, and file vdev.  This allows for testing of a non-
  partitioned block device (loopback), a partitioned block device
  (scsi_debug), and a file which does not receive udev change
  events.  This provides better test coverage, and by removing
  the layering on ZFS volumes the issues surrounding layering
  one pool on another are avoided.

* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
  This allows for matching by guid rather than path which is a
  more reliable way for the ZED to reference a vdev.

* Fixed zfs_zevent_wait() signal handling which could result
  in the ZED spinning when a signal was not handled.

* Removed vdev_disk_rrpart() functionality which can be abandoned
  in favor of the kernel-provided blkdev_reread_part() function.

* Added a rwlock which is held as a writer while a disk is being
  reopened.  This is important to prevent errors from occurring
  for any configuration related IOs which bypass the SCL_ZIO lock.
  The zpool_reopen_007_pos.ksh test case was added to verify IO
  errors are never observed when reopening.  This is not expected
  to impact IO performance.

The following additional fixes aren't critical, but were discovered
and resolved in the course of developing this functionality:

* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
  ZFS volumes.  This is as good as a unique physical path.  Although
  the volumes are no longer used in the test cases for other reasons,
  this improvement was still included.

Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629

26 files changed:
cmd/zed/agents/zfs_mod.c
config/kernel-blkdev-get.m4 [deleted file]
config/kernel-blkdev-reread-part.m4 [new file with mode: 0644]
config/kernel-get-gendisk.m4 [deleted file]
config/kernel.m4
include/linux/blkdev_compat.h
include/sys/vdev_disk.h
lib/libzfs/libzfs_import.c
lib/libzfs/libzfs_pool.c
module/zfs/fm.c
module/zfs/vdev.c
module/zfs/vdev_disk.c
tests/runfiles/linux.run
tests/test-runner/bin/zts-report.py
tests/zfs-tests/include/blkdev.shlib
tests/zfs-tests/tests/functional/cli_root/zpool_expand/Makefile.am
tests/zfs-tests/tests/functional/cli_root/zpool_expand/setup.ksh
tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand.cfg
tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh
tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh
tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh
tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_004_pos.ksh
tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/cli_root/zpool_reopen/Makefile.am
tests/zfs-tests/tests/functional/cli_root/zpool_reopen/cleanup.ksh
tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_007_pos.ksh [new file with mode: 0755]

index 600d6527c0db265afe390fab8757dd1eabad4fa5..f70e886a6406bca89057bb9f1f07509a57b8582a 100644 (file)
@@ -697,8 +697,8 @@ zfsdle_vdev_online(zpool_handle_t *zhp, void *data)
 {
        char *devname = data;
        boolean_t avail_spare, l2cache;
-       vdev_state_t newstate;
        nvlist_t *tgt;
+       int error;
 
        zed_log_msg(LOG_INFO, "zfsdle_vdev_online: searching for '%s' in '%s'",
            devname, zpool_get_name(zhp));
@@ -706,42 +706,58 @@ zfsdle_vdev_online(zpool_handle_t *zhp, void *data)
        if ((tgt = zpool_find_vdev_by_physpath(zhp, devname,
            &avail_spare, &l2cache, NULL)) != NULL) {
                char *path, fullpath[MAXPATHLEN];
-               uint64_t wholedisk = 0ULL;
+               uint64_t wholedisk;
 
-               verify(nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH,
-                   &path) == 0);
-               verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
-                   &wholedisk) == 0);
+               error = nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &path);
+               if (error) {
+                       zpool_close(zhp);
+                       return (0);
+               }
 
-               (void) strlcpy(fullpath, path, sizeof (fullpath));
-               if (wholedisk) {
-                       char *spath = zfs_strip_partition(fullpath);
-                       boolean_t scrub_restart = B_TRUE;
+               error = nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
+                   &wholedisk);
+               if (error)
+                       wholedisk = 0;
 
-                       if (!spath) {
-                               zed_log_msg(LOG_INFO, "%s: Can't alloc",
-                                   __func__);
+               if (wholedisk) {
+                       path = strrchr(path, '/');
+                       if (path != NULL) {
+                               path = zfs_strip_partition(path + 1);
+                               if (path == NULL) {
+                                       zpool_close(zhp);
+                                       return (0);
+                               }
+                       } else {
+                               zpool_close(zhp);
                                return (0);
                        }
 
-                       (void) strlcpy(fullpath, spath, sizeof (fullpath));
-                       free(spath);
+                       (void) strlcpy(fullpath, path, sizeof (fullpath));
+                       free(path);
 
                        /*
                         * We need to reopen the pool associated with this
-                        * device so that the kernel can update the size
-                        * of the expanded device.
+                        * device so that the kernel can update the size of
+                        * the expanded device.  When expanding there is no
+                        * need to restart the scrub from the beginning.
                         */
+                       boolean_t scrub_restart = B_FALSE;
                        (void) zpool_reopen_one(zhp, &scrub_restart);
+               } else {
+                       (void) strlcpy(fullpath, path, sizeof (fullpath));
                }
 
                if (zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) {
-                       zed_log_msg(LOG_INFO, "zfsdle_vdev_online: setting "
-                           "device '%s' to ONLINE state in pool '%s'",
-                           fullpath, zpool_get_name(zhp));
-                       if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL)
-                               (void) zpool_vdev_online(zhp, fullpath, 0,
+                       vdev_state_t newstate;
+
+                       if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) {
+                               error = zpool_vdev_online(zhp, fullpath, 0,
                                    &newstate);
+                               zed_log_msg(LOG_INFO, "zfsdle_vdev_online: "
+                                   "setting device '%s' to ONLINE state "
+                                   "in pool '%s': %d", fullpath,
+                                   zpool_get_name(zhp), error);
+                       }
                }
                zpool_close(zhp);
                return (1);
@@ -751,23 +767,37 @@ zfsdle_vdev_online(zpool_handle_t *zhp, void *data)
 }
 
 /*
- * This function handles the ESC_DEV_DLE event.
+ * This function handles the ESC_DEV_DLE device change event.  Use the
+ * provided vdev guid when looking up a disk or partition.  When the guid
+ * is not present assume the entire disk is owned by ZFS and append the
+ * expected -part1 partition information then lookup by physical path.
+ *
+ * Returns 0 when zpool_iter() reports a matching vdev, 1 when the device
+ * was not found, and -1 when the event carries neither a vdev guid nor a
+ * physical path (there is nothing to search for in that case).
  */
 static int
 zfs_deliver_dle(nvlist_t *nvl)
 {
-       char *devname;
-
-       if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) != 0) {
-               zed_log_msg(LOG_INFO, "zfs_deliver_dle: no physpath");
-               return (-1);
+       char *devname, name[MAXPATHLEN];
+       uint64_t guid;
+
+       if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &guid) == 0) {
+               sprintf(name, "%llu", (u_longlong_t)guid);
+       } else if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) == 0) {
+               strlcpy(name, devname, MAXPATHLEN);
+               zfs_append_partition(name, MAXPATHLEN);
+       } else {
+               zed_log_msg(LOG_INFO, "zfs_deliver_dle: no guid or physpath");
+               return (-1);
        }
 
-       if (zpool_iter(g_zfshdl, zfsdle_vdev_online, devname) != 1) {
+       if (zpool_iter(g_zfshdl, zfsdle_vdev_online, name) != 1) {
                zed_log_msg(LOG_INFO, "zfs_deliver_dle: device '%s' not "
-                   "found", devname);
+                   "found", name);
                return (1);
        }
+
        return (0);
 }
 
diff --git a/config/kernel-blkdev-get.m4 b/config/kernel-blkdev-get.m4
deleted file mode 100644 (file)
index e31d717..0000000
+++ /dev/null
@@ -1,19 +0,0 @@
-dnl #
-dnl # 2.6.37 API change
-dnl # Added 3rd argument for the active holder, previously this was
-dnl # hardcoded to NULL.
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_3ARG_BLKDEV_GET], [
-       AC_MSG_CHECKING([whether blkdev_get() wants 3 args])
-       ZFS_LINUX_TRY_COMPILE([
-               #include <linux/fs.h>
-       ],[
-               struct block_device *bdev = NULL;
-               (void) blkdev_get(bdev, 0, NULL);
-       ],[
-               AC_MSG_RESULT(yes)
-               AC_DEFINE(HAVE_3ARG_BLKDEV_GET, 1, [blkdev_get() wants 3 args])
-       ],[
-               AC_MSG_RESULT(no)
-       ])
-])
diff --git a/config/kernel-blkdev-reread-part.m4 b/config/kernel-blkdev-reread-part.m4
new file mode 100644 (file)
index 0000000..5664769
--- /dev/null
@@ -0,0 +1,21 @@
+dnl #
+dnl # 4.1 API, exported blkdev_reread_part() symbol, backported to the
+dnl # 3.10.0 CentOS 7.x enterprise kernels.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_REREAD_PART], [
+       AC_MSG_CHECKING([whether blkdev_reread_part() is available])
+       ZFS_LINUX_TRY_COMPILE([
+               #include <linux/fs.h>
+       ], [
+               struct block_device *bdev = NULL;
+               int error;
+
+               error = blkdev_reread_part(bdev);
+       ], [
+               AC_MSG_RESULT(yes)
+               AC_DEFINE(HAVE_BLKDEV_REREAD_PART, 1,
+                   [blkdev_reread_part() is available])
+       ], [
+               AC_MSG_RESULT(no)
+       ])
+])
diff --git a/config/kernel-get-gendisk.m4 b/config/kernel-get-gendisk.m4
deleted file mode 100644 (file)
index b091377..0000000
+++ /dev/null
@@ -1,17 +0,0 @@
-dnl #
-dnl # 2.6.34 API change
-dnl # Verify the get_gendisk() symbol is available.
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_GET_GENDISK],
-       [AC_MSG_CHECKING([whether get_gendisk() is available])
-       ZFS_LINUX_TRY_COMPILE_SYMBOL([
-               #include <linux/genhd.h>
-       ], [
-               get_gendisk(0, NULL);
-       ], [get_gendisk], [block/genhd.c], [
-               AC_MSG_RESULT(yes)
-               AC_DEFINE(HAVE_GET_GENDISK, 1, [get_gendisk() is available])
-       ], [
-               AC_MSG_RESULT(no)
-       ])
-])
index 8c2998204cde867fb6256c4c847c63ad38791b4e..7ae10c1274608682bd466b9d00ffd672e6777ce0 100644 (file)
@@ -44,8 +44,8 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
        ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS
        ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
        ZFS_AC_KERNEL_TYPE_FMODE_T
-       ZFS_AC_KERNEL_3ARG_BLKDEV_GET
        ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH
+       ZFS_AC_KERNEL_BLKDEV_REREAD_PART
        ZFS_AC_KERNEL_OPEN_BDEV_EXCLUSIVE
        ZFS_AC_KERNEL_LOOKUP_BDEV
        ZFS_AC_KERNEL_INVALIDATE_BDEV_ARGS
@@ -73,7 +73,6 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
        ZFS_AC_KERNEL_BLK_QUEUE_HAVE_BLK_PLUG
        ZFS_AC_KERNEL_GET_DISK_AND_MODULE
        ZFS_AC_KERNEL_GET_DISK_RO
-       ZFS_AC_KERNEL_GET_GENDISK
        ZFS_AC_KERNEL_HAVE_BIO_SET_OP_ATTRS
        ZFS_AC_KERNEL_GENERIC_READLINK_GLOBAL
        ZFS_AC_KERNEL_DISCARD_GRANULARITY
index 88b0e48cda0955e3d599fc514112e1e983df91a2..274552d5dc485c00aca50776c0c48f12d6ff9c25 100644 (file)
@@ -364,6 +364,20 @@ bio_set_bi_error(struct bio *bio, int error)
 #define        vdev_bdev_close(bdev, md)       close_bdev_excl(bdev)
 #endif /* HAVE_BLKDEV_GET_BY_PATH | HAVE_OPEN_BDEV_EXCLUSIVE */
 
+/*
+ * 4.1 - x.y.z API,
+ * 3.10.0 CentOS 7.x API,
+ *   blkdev_reread_part()
+ *
+ * For older kernels trigger a re-reading of the partition table by calling
+ * check_disk_change() which calls flush_disk() to invalidate the device.
+ */
+#ifdef HAVE_BLKDEV_REREAD_PART
+#define        vdev_bdev_reread_part(bdev)     blkdev_reread_part(bdev)
+#else
+#define        vdev_bdev_reread_part(bdev)     check_disk_change(bdev)
+#endif /* HAVE_BLKDEV_REREAD_PART */
+
 /*
  * 2.6.22 API change
  * The function invalidate_bdev() lost it's second argument because
index b8a32b3168824bdef5dc97f90eddec6a2129c2a6..908f5f32634fbc782f0e7036780ea244067618a0 100644 (file)
@@ -47,6 +47,7 @@ typedef struct vdev_disk {
        ddi_devid_t             vd_devid;
        char                    *vd_minor;
        struct block_device     *vd_bdev;
+       krwlock_t               vd_lock;
 } vdev_disk_t;
 
 #endif /* _KERNEL */
index 7d2f0e903cce7c74d9d7868dcff591106d7821c0..d2c7d98f966d29b4df17480796c0980f5442872e 100644 (file)
@@ -145,6 +145,21 @@ zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
                        return (0);
                }
 
+               /*
+                * For volumes use the persistent /dev/zvol/dataset identifier
+                */
+               entry = udev_device_get_devlinks_list_entry(dev);
+               while (entry != NULL) {
+                       const char *name;
+
+                       name = udev_list_entry_get_name(entry);
+                       if (strncmp(name, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
+                               (void) strlcpy(bufptr, name, buflen);
+                               return (0);
+                       }
+                       entry = udev_list_entry_get_next(entry);
+               }
+
                /*
                 * NVME 'by-id' symlinks are similar to bus case
                 */
@@ -187,26 +202,57 @@ int
 zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
 {
        const char *physpath = NULL;
+       struct udev_list_entry *entry;
 
        /*
-        * Normal disks use ID_PATH for their physical path.  Device mapper
-        * devices are virtual and don't have a physical path.  For them we
-        * use ID_VDEV instead, which is setup via the /etc/vdev_id.conf file.
-        * ID_VDEV provides a persistent path to a virtual device.  If you
-        * don't have vdev_id.conf setup, you cannot use multipath autoreplace.
+        * Normal disks use ID_PATH for their physical path.
         */
-       if (!((physpath = udev_device_get_property_value(dev, "ID_PATH")) &&
-           physpath[0])) {
-               if (!((physpath =
-                   udev_device_get_property_value(dev, "ID_VDEV")) &&
-                   physpath[0])) {
-                       return (ENODATA);
+       physpath = udev_device_get_property_value(dev, "ID_PATH");
+       if (physpath != NULL && strlen(physpath) > 0) {
+               (void) strlcpy(bufptr, physpath, buflen);
+               return (0);
+       }
+
+       /*
+        * Device mapper devices are virtual and don't have a physical
+        * path. For them we use ID_VDEV instead, which is setup via the
+        * /etc/vdev_id.conf file.  ID_VDEV provides a persistent path
+        * to a virtual device.  If you don't have vdev_id.conf setup,
+        * you cannot use multipath autoreplace with device mapper.
+        */
+       physpath = udev_device_get_property_value(dev, "ID_VDEV");
+       if (physpath != NULL && strlen(physpath) > 0) {
+               (void) strlcpy(bufptr, physpath, buflen);
+               return (0);
+       }
+
+       /*
+        * For ZFS volumes use the persistent /dev/zvol/dataset identifier
+        */
+       entry = udev_device_get_devlinks_list_entry(dev);
+       while (entry != NULL) {
+               physpath = udev_list_entry_get_name(entry);
+               if (strncmp(physpath, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) {
+                       (void) strlcpy(bufptr, physpath, buflen);
+                       return (0);
                }
+               entry = udev_list_entry_get_next(entry);
        }
 
-       (void) strlcpy(bufptr, physpath, buflen);
+       /*
+        * For all other devices fallback to using the by-uuid name.
+        */
+       entry = udev_device_get_devlinks_list_entry(dev);
+       while (entry != NULL) {
+               physpath = udev_list_entry_get_name(entry);
+               if (strncmp(physpath, "/dev/disk/by-uuid", 17) == 0) {
+                       (void) strlcpy(bufptr, physpath, buflen);
+                       return (0);
+               }
+               entry = udev_list_entry_get_next(entry);
+       }
 
-       return (0);
+       return (ENODATA);
 }
 
 boolean_t
index 8f2eedec8003dd790a8900cc7b264bf7a828c002..d19ca77140ecf55ad02a0acd59e377cc2b9205aa 100644 (file)
@@ -2283,17 +2283,25 @@ vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare,
 }
 
 /*
- * Given a physical path (minus the "/devices" prefix), find the
- * associated vdev.
+ * Given a physical path or guid, find the associated vdev.
  */
 nvlist_t *
 zpool_find_vdev_by_physpath(zpool_handle_t *zhp, const char *ppath,
     boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log)
 {
        nvlist_t *search, *nvroot, *ret;
+       uint64_t guid;
+       char *end;
 
        verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-       verify(nvlist_add_string(search, ZPOOL_CONFIG_PHYS_PATH, ppath) == 0);
+
+       guid = strtoull(ppath, &end, 0);
+       if (guid != 0 && *end == '\0') {
+               verify(nvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid) == 0);
+       } else {
+               verify(nvlist_add_string(search, ZPOOL_CONFIG_PHYS_PATH,
+                   ppath) == 0);
+       }
 
        verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
            &nvroot) == 0);
index 4986a3fa23501442f71677367f44252e629ff9e1..6d2166a09560f46e09a8fb8f244116ac330f285d 100644 (file)
@@ -665,25 +665,37 @@ out:
        return (error);
 }
 
+/*
+ * Wait in an interruptible state for any new events.
+ */
 int
 zfs_zevent_wait(zfs_zevent_t *ze)
 {
-       int error = 0;
+       int error = EAGAIN;
 
        mutex_enter(&zevent_lock);
+       zevent_waiters++;
 
-       if (zevent_flags & ZEVENT_SHUTDOWN) {
-               error = ESHUTDOWN;
-               goto out;
-       }
+       while (error == EAGAIN) {
+               if (zevent_flags & ZEVENT_SHUTDOWN) {
+                       error = SET_ERROR(ESHUTDOWN);
+                       break;
+               }
 
-       zevent_waiters++;
-       cv_wait_sig(&zevent_cv, &zevent_lock);
-       if (issig(JUSTLOOKING))
-               error = EINTR;
+               error = cv_timedwait_sig(&zevent_cv, &zevent_lock,
+                   ddi_get_lbolt() + MSEC_TO_TICK(10));
+               if (signal_pending(current)) {
+                       error = SET_ERROR(EINTR);
+                       break;
+               } else if (!list_is_empty(&zevent_list)) {
+                       error = 0;
+                       continue;
+               } else {
+                       error = EAGAIN;
+               }
+       }
 
        zevent_waiters--;
-out:
        mutex_exit(&zevent_lock);
 
        return (error);
index ef6e2d8be30108a5fb5c4d25624fd5a5caaf6196..c35f739236adf91065c41e9ea0ab198d13965dc7 100644 (file)
@@ -3241,7 +3241,8 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
        /* XXX - L2ARC 1.0 does not support expansion */
        if (!vd->vdev_aux) {
                for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
-                       pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
+                       pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) ||
+                           spa->spa_autoexpand);
        }
 
        vdev_reopen(tvd);
index 996bab43c6ce7117fe20a6fc419078a2070c586b..78741af7f89897d319bbec585ecbce8c113a2b37 100644 (file)
@@ -85,50 +85,64 @@ vdev_bdev_mode(int smode)
 }
 #endif /* HAVE_OPEN_BDEV_EXCLUSIVE */
 
-/* The capacity (in bytes) of a bdev that is available to be used by a vdev */
+/*
+ * Returns the usable capacity (in bytes) for the partition or disk.
+ */
 static uint64_t
-bdev_capacity(struct block_device *bdev, boolean_t wholedisk)
+bdev_capacity(struct block_device *bdev)
 {
-       struct hd_struct *part = bdev->bd_part;
-       uint64_t sectors = get_capacity(bdev->bd_disk);
-       /* If there are no paritions, return the entire device capacity */
-       if (part == NULL)
-               return (sectors << SECTOR_BITS);
+       return (i_size_read(bdev->bd_inode));
+}
 
-       /*
-        * If there are partitions, decide if we are using a `wholedisk`
-        * layout (composed of part1 and part9) or just a single partition.
-        */
-       if (wholedisk) {
-               /* Verify the expected device layout */
-               ASSERT3P(bdev, !=, bdev->bd_contains);
+/*
+ * Returns the maximum expansion capacity of the block device (in bytes).
+ *
+ * It is possible to expand a vdev when it has been created as a wholedisk
+ * and the containing block device has increased in capacity.  Or when the
+ * partition containing the pool has been manually increased in size.
+ *
+ * This function is only responsible for calculating the potential expansion
+ * size so it can be reported by 'zpool list'.  The efi_use_whole_disk() is
+ * responsible for verifying the expected partition layout in the wholedisk
+ * case, and updating the partition table if appropriate.  Once the partition
+ * size has been increased the additional capacity will be visible using
+ * bdev_capacity().
+ */
+static uint64_t
+bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
+{
+       uint64_t psize;
+       int64_t available;
+
+       if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) {
                /*
-                * Sectors used by the EFI partition (part9) as well as
-                * partion alignment.
+                * When reporting maximum expansion capacity for a wholedisk
+                * deduct any capacity which is expected to be lost due to
+                * alignment restrictions.  Over reporting this value isn't
+                * harmful and would only result in slightly less capacity
+                * than expected post expansion.
                 */
-               uint64_t used = EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
-                   PARTITION_END_ALIGNMENT;
-
-               /* Space available to the vdev, i.e. the size of part1 */
-               if (sectors <= used)
-                       return (0);
-               uint64_t available = sectors - used;
-               return (available << SECTOR_BITS);
+               available = i_size_read(bdev->bd_contains->bd_inode) -
+                   ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
+                   PARTITION_END_ALIGNMENT) << SECTOR_BITS);
+               if (available > 0)
+                       psize = available;
+               else
+                       psize = bdev_capacity(bdev);
        } else {
-               /* The partition capacity referenced by the block device */
-               return (part->nr_sects << SECTOR_BITS);
+               psize = bdev_capacity(bdev);
        }
+
+       return (psize);
 }
 
 static void
 vdev_disk_error(zio_t *zio)
 {
-#ifdef ZFS_DEBUG
-       printk(KERN_WARNING "ZFS: zio error=%d type=%d offset=%llu size=%llu "
+       zfs_dbgmsg(KERN_WARNING "zio error=%d type=%d offset=%llu size=%llu "
            "flags=%x\n", zio->io_error, zio->io_type,
            (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
            zio->io_flags);
-#endif
 }
 
 /*
@@ -200,109 +214,73 @@ vdev_elevator_switch(vdev_t *v, char *elevator)
        }
 }
 
-/*
- * Expanding a whole disk vdev involves invoking BLKRRPART on the
- * whole disk device. This poses a problem, because BLKRRPART will
- * return EBUSY if one of the disk's partitions is open. That's why
- * we have to do it here, just before opening the data partition.
- * Unfortunately, BLKRRPART works by dropping all partitions and
- * recreating them, which means that for a short time window, all
- * /dev/sdxN device files disappear (until udev recreates them).
- * This means two things:
- *  - When we open the data partition just after a BLKRRPART, we
- *    can't do it using the normal device file path because of the
- *    obvious race condition with udev. Instead, we use reliable
- *    kernel APIs to get a handle to the new partition device from
- *    the whole disk device.
- *  - Because vdev_disk_open() initially needs to find the device
- *    using its path, multiple vdev_disk_open() invocations in
- *    short succession on the same disk with BLKRRPARTs in the
- *    middle have a high probability of failure (because of the
- *    race condition with udev). A typical situation where this
- *    might happen is when the zpool userspace tool does a
- *    TRYIMPORT immediately followed by an IMPORT. For this
- *    reason, we only invoke BLKRRPART in the module when strictly
- *    necessary (zpool online -e case), and rely on userspace to
- *    do it when possible.
- */
-static struct block_device *
-vdev_disk_rrpart(const char *path, int mode, vdev_disk_t *vd)
-{
-#if defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK)
-       struct block_device *bdev, *result = ERR_PTR(-ENXIO);
-       struct gendisk *disk;
-       int error, partno;
-
-       bdev = vdev_bdev_open(path, vdev_bdev_mode(mode), zfs_vdev_holder);
-       if (IS_ERR(bdev))
-               return (bdev);
-
-       disk = get_gendisk(bdev->bd_dev, &partno);
-       vdev_bdev_close(bdev, vdev_bdev_mode(mode));
-
-       if (disk) {
-               bdev = bdget(disk_devt(disk));
-               if (bdev) {
-                       error = blkdev_get(bdev, vdev_bdev_mode(mode), vd);
-                       if (error == 0)
-                               error = ioctl_by_bdev(bdev, BLKRRPART, 0);
-                       vdev_bdev_close(bdev, vdev_bdev_mode(mode));
-               }
-
-               bdev = bdget_disk(disk, partno);
-               if (bdev) {
-                       error = blkdev_get(bdev,
-                           vdev_bdev_mode(mode) | FMODE_EXCL, vd);
-                       if (error == 0)
-                               result = bdev;
-               }
-               put_disk(disk);
-       }
-
-       return (result);
-#else
-       return (ERR_PTR(-EOPNOTSUPP));
-#endif /* defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK) */
-}
-
 static int
 vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
     uint64_t *ashift)
 {
-       struct block_device *bdev = ERR_PTR(-ENXIO);
+       struct block_device *bdev;
+       fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
+       int count = 0, block_size;
+       int bdev_retry_count = 50;
        vdev_disk_t *vd;
-       int count = 0, mode, block_size;
 
        /* Must have a pathname and it must be absolute. */
        if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
                v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
-               vdev_dbgmsg(v, "vdev_disk_open: invalid "
-                   "vdev_path '%s'", v->vdev_path);
+               vdev_dbgmsg(v, "invalid vdev_path");
                return (SET_ERROR(EINVAL));
        }
 
        /*
-        * Reopen the device if it's not currently open. Otherwise,
-        * just update the physical size of the device.
+        * Reopen the device if it is currently open.  When expanding a
+        * partition force re-scanning the partition table while closed
+        * in order to get an accurate updated block device size.  Then
+        * since udev may need to recreate the device links increase the
+        * open retry count before reporting the device as unavailable.
         */
-       if (v->vdev_tsd != NULL) {
-               ASSERT(v->vdev_reopening);
-               vd = v->vdev_tsd;
-               goto skip_open;
-       }
+       vd = v->vdev_tsd;
+       if (vd) {
+               char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
+               boolean_t reread_part = B_FALSE;
 
-       vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
-       if (vd == NULL)
-               return (SET_ERROR(ENOMEM));
+               rw_enter(&vd->vd_lock, RW_WRITER);
+               bdev = vd->vd_bdev;
+               vd->vd_bdev = NULL;
+
+               if (bdev) {
+                       if (v->vdev_expanding && bdev != bdev->bd_contains) {
+                               bdevname(bdev->bd_contains, disk_name + 5);
+                               reread_part = B_TRUE;
+                       }
+
+                       vdev_bdev_close(bdev, mode);
+               }
+
+               if (reread_part) {
+                       bdev = vdev_bdev_open(disk_name, mode, zfs_vdev_holder);
+                       if (!IS_ERR(bdev)) {
+                               int error = vdev_bdev_reread_part(bdev);
+                               vdev_bdev_close(bdev, mode);
+                               if (error == 0)
+                                       bdev_retry_count = 100;
+                       }
+               }
+       } else {
+               vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
+
+               rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
+               rw_enter(&vd->vd_lock, RW_WRITER);
+       }
 
        /*
         * Devices are always opened by the path provided at configuration
         * time.  This means that if the provided path is a udev by-id path
-        * then drives may be recabled without an issue.  If the provided
+        * then drives may be re-cabled without an issue.  If the provided
         * path is a udev by-path path, then the physical location information
         * will be preserved.  This can be critical for more complicated
         * configurations where drives are located in specific physical
-        * locations to maximize the systems tolerence to component failure.
+        * locations to maximize the systems tolerance to component failure.
+        *
         * Alternatively, you can provide your own udev rule to flexibly map
         * the drives as you see fit.  It is not advised that you use the
         * /dev/[hd]d devices which may be reordered due to probing order.
@@ -317,15 +295,11 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
         * and it is reasonable to sleep and retry before giving up.  In
         * practice delays have been observed to be on the order of 100ms.
         */
-       mode = spa_mode(v->vdev_spa);
-       if (v->vdev_wholedisk && v->vdev_expanding)
-               bdev = vdev_disk_rrpart(v->vdev_path, mode, vd);
-
-       while (IS_ERR(bdev) && count < 50) {
-               bdev = vdev_bdev_open(v->vdev_path,
-                   vdev_bdev_mode(mode), zfs_vdev_holder);
+       bdev = ERR_PTR(-ENXIO);
+       while (IS_ERR(bdev) && count < bdev_retry_count) {
+               bdev = vdev_bdev_open(v->vdev_path, mode, zfs_vdev_holder);
                if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
-                       msleep(10);
+                       schedule_timeout(MSEC_TO_TICK(10));
                        count++;
                } else if (IS_ERR(bdev)) {
                        break;
@@ -333,16 +307,18 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
        }
 
        if (IS_ERR(bdev)) {
-               dprintf("failed open v->vdev_path=%s, error=%d count=%d\n",
-                   v->vdev_path, -PTR_ERR(bdev), count);
-               kmem_free(vd, sizeof (vdev_disk_t));
-               return (SET_ERROR(-PTR_ERR(bdev)));
+               int error = -PTR_ERR(bdev);
+               vdev_dbgmsg(v, "open error=%d count=%d\n", error, count);
+               vd->vd_bdev = NULL;
+               v->vdev_tsd = vd;
+               rw_exit(&vd->vd_lock);
+               return (SET_ERROR(error));
+       } else {
+               vd->vd_bdev = bdev;
+               v->vdev_tsd = vd;
+               rw_exit(&vd->vd_lock);
        }
 
-       v->vdev_tsd = vd;
-       vd->vd_bdev = bdev;
-
-skip_open:
        /*  Determine the physical block size */
        block_size = vdev_bdev_block_size(vd->vd_bdev);
 
@@ -352,9 +328,11 @@ skip_open:
        /* Inform the ZIO pipeline that we are non-rotational */
        v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));
 
-       /* Physical volume size in bytes */
-       *psize = bdev_capacity(vd->vd_bdev, v->vdev_wholedisk);
-       *max_psize = *psize;
+       /* Physical volume size in bytes for the partition */
+       *psize = bdev_capacity(vd->vd_bdev);
+
+       /* Physical volume size in bytes including possible expansion space */
+       *max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);
 
        /* Based on the minimum sector size set the block size */
        *ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
@@ -373,10 +351,12 @@ vdev_disk_close(vdev_t *v)
        if (v->vdev_reopening || vd == NULL)
                return;
 
-       if (vd->vd_bdev != NULL)
+       if (vd->vd_bdev != NULL) {
                vdev_bdev_close(vd->vd_bdev,
                    vdev_bdev_mode(spa_mode(v->vdev_spa)));
+       }
 
+       rw_destroy(&vd->vd_lock);
        kmem_free(vd, sizeof (vdev_disk_t));
        v->vdev_tsd = NULL;
 }
@@ -562,9 +542,15 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
 #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
        struct blk_plug plug;
 #endif
-
-       ASSERT(zio != NULL);
-       ASSERT3U(io_offset + io_size, <=, bdev->bd_inode->i_size);
+       /*
+        * Accessing outside the block device is never allowed.
+        */
+       if (io_offset + io_size > bdev->bd_inode->i_size) {
+               vdev_dbgmsg(zio->io_vd,
+                   "Illegal access %llu size %llu, device size %llu",
+                   io_offset, io_size, i_size_read(bdev->bd_inode));
+               return (SET_ERROR(EIO));
+       }
 
 retry:
        dr = vdev_disk_dio_alloc(bio_count);
@@ -705,10 +691,34 @@ vdev_disk_io_start(zio_t *zio)
        vdev_disk_t *vd = v->vdev_tsd;
        int rw, flags, error;
 
+       /*
+        * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
+        * Nothing to be done here but return failure.
+        */
+       if (vd == NULL) {
+               zio->io_error = ENXIO;
+               zio_interrupt(zio);
+               return;
+       }
+
+       rw_enter(&vd->vd_lock, RW_READER);
+
+       /*
+        * If the vdev is closed, it's likely due to a failed reopen and is
+        * in the UNAVAIL state.  Nothing to be done here but return failure.
+        */
+       if (vd->vd_bdev == NULL) {
+               rw_exit(&vd->vd_lock);
+               zio->io_error = ENXIO;
+               zio_interrupt(zio);
+               return;
+       }
+
        switch (zio->io_type) {
        case ZIO_TYPE_IOCTL:
 
                if (!vdev_readable(v)) {
+                       rw_exit(&vd->vd_lock);
                        zio->io_error = SET_ERROR(ENXIO);
                        zio_interrupt(zio);
                        return;
@@ -726,8 +736,10 @@ vdev_disk_io_start(zio_t *zio)
                        }
 
                        error = vdev_disk_io_flush(vd->vd_bdev, zio);
-                       if (error == 0)
+                       if (error == 0) {
+                               rw_exit(&vd->vd_lock);
                                return;
+                       }
 
                        zio->io_error = error;
 
@@ -737,6 +749,7 @@ vdev_disk_io_start(zio_t *zio)
                        zio->io_error = SET_ERROR(ENOTSUP);
                }
 
+               rw_exit(&vd->vd_lock);
                zio_execute(zio);
                return;
        case ZIO_TYPE_WRITE:
@@ -762,6 +775,7 @@ vdev_disk_io_start(zio_t *zio)
                break;
 
        default:
+               rw_exit(&vd->vd_lock);
                zio->io_error = SET_ERROR(ENOTSUP);
                zio_interrupt(zio);
                return;
@@ -770,6 +784,8 @@ vdev_disk_io_start(zio_t *zio)
        zio->io_target_timestamp = zio_handle_io_delay(zio);
        error = __vdev_disk_physio(vd->vd_bdev, zio,
            zio->io_size, zio->io_offset, rw, flags);
+       rw_exit(&vd->vd_lock);
+
        if (error) {
                zio->io_error = error;
                zio_interrupt(zio);
index 056b1dddb99735a78bd86788157066f512f00767..89563189fd01695cfb59d3761abcb0625bc1fd1d 100644 (file)
@@ -333,7 +333,7 @@ tags = ['functional', 'cli_root', 'zpool_events']
 
 [tests/functional/cli_root/zpool_expand]
 tests = ['zpool_expand_001_pos', 'zpool_expand_002_pos',
-    'zpool_expand_003_neg', 'zpool_expand_004_pos']
+    'zpool_expand_003_neg', 'zpool_expand_004_pos', 'zpool_expand_005_pos']
 tags = ['functional', 'cli_root', 'zpool_expand']
 
 [tests/functional/cli_root/zpool_export]
@@ -398,7 +398,7 @@ tags = ['functional', 'cli_root', 'zpool_remove']
 [tests/functional/cli_root/zpool_reopen]
 tests = ['zpool_reopen_001_pos', 'zpool_reopen_002_pos',
     'zpool_reopen_003_pos', 'zpool_reopen_004_pos', 'zpool_reopen_005_pos',
-    'zpool_reopen_006_neg']
+    'zpool_reopen_006_neg', 'zpool_reopen_007_pos']
 tags = ['functional', 'cli_root', 'zpool_reopen']
 
 [tests/functional/cli_root/zpool_replace]
index 20afad5d77d54c0429eae5e8b924c69974762993..804d7d607aff49a243cd5be66b251348d5362c1d 100755 (executable)
@@ -81,6 +81,13 @@ python_deps_reason = 'Python modules missing: python-cffi'
 #
 tmpfile_reason = 'Kernel O_TMPFILE support required'
 
+#
+# Some tests may depend on udev change events being generated when block
+# devices change capacity.  This functionality wasn't available until the
+# 2.6.38 kernel.
+#
+udev_reason = 'Kernel block device udev change events required'
+
 #
 # Some tests require that the NFS client and server utilities be installed.
 #
@@ -159,8 +166,6 @@ known = {
     'cli_root/zfs_unshare/zfs_unshare_002_pos': ['SKIP', na_reason],
     'cli_root/zfs_unshare/zfs_unshare_006_pos': ['SKIP', na_reason],
     'cli_root/zpool_create/zpool_create_016_pos': ['SKIP', na_reason],
-    'cli_root/zpool_expand/zpool_expand_001_pos': ['SKIP', '5771'],
-    'cli_root/zpool_expand/zpool_expand_003_neg': ['SKIP', '5771'],
     'cli_user/misc/zfs_share_001_neg': ['SKIP', na_reason],
     'cli_user/misc/zfs_unshare_001_neg': ['SKIP', na_reason],
     'inuse/inuse_001_pos': ['SKIP', na_reason],
@@ -219,6 +224,7 @@ maybe = {
     'cli_root/zpool_create/setup': ['SKIP', disk_reason],
     'cli_root/zpool_create/zpool_create_008_pos': ['FAIL', known_reason],
     'cli_root/zpool_destroy/zpool_destroy_001_pos': ['SKIP', '6145'],
+    'cli_root/zpool_expand/setup': ['SKIP', udev_reason],
     'cli_root/zpool_export/setup': ['SKIP', disk_reason],
     'cli_root/zpool_import/setup': ['SKIP', disk_reason],
     'cli_root/zpool_import/import_rewind_device_replaced':
index 5163ea2ae294f18f19b7f2ad5c548c7500bd60c1..9cac7184f9fc44b2f2dcf26e8f3780ea93f1048c 100644 (file)
@@ -312,6 +312,7 @@ function on_off_disk # disk state{online,offline} host
                                        log_fail "Onlining $disk failed"
                                fi
                        elif is_real_device $disk; then
+                               block_device_wait
                                typeset -i retries=0
                                while ! lsscsi | egrep -q $disk; do
                                        if (( $retries > 2 )); then
@@ -410,9 +411,7 @@ function load_scsi_debug # dev_size_mb add_host num_tgts max_luns blksz
 #
 function unload_scsi_debug
 {
-       if lsmod | grep scsi_debug >/dev/null; then
-               log_must modprobe -r scsi_debug
-       fi
+       log_must_retry "in use" 5 modprobe -r scsi_debug
 }
 
 #
index 2fae015b53ba7d369f396b8b831c53795b7ecae3..beaa411e37cb282145d3e57a1530488e0fd140b9 100644 (file)
@@ -5,7 +5,8 @@ dist_pkgdata_SCRIPTS = \
        zpool_expand_001_pos.ksh \
        zpool_expand_002_pos.ksh \
        zpool_expand_003_neg.ksh \
-       zpool_expand_004_pos.ksh
+       zpool_expand_004_pos.ksh \
+       zpool_expand_005_pos.ksh
 
 dist_pkgdata_DATA = \
        zpool_expand.cfg
index 7d6a43ef5280dfdd69322b03d01eeb481f5525c0..9832a441c20bfeb68d326386e8b0d2b77a19c303 100755 (executable)
 
 verify_runnable "global"
 
+#
+# The pool expansion tests depend on udev change events being generated
+# when block devices change capacity.  Since this functionality wasn't
+# available until the 2.6.38 kernel, skip this test group on older kernels.
+#
+if [[ $(linux_version) -lt $(linux_version "2.6.38") ]]; then
+       log_unsupported "Requires block device udev change events"
+fi
+
 zed_setup
 zed_start
 
index e15471e227434b759ab0843b3042bfa2e5057361..bec5fb1638aaf6da055637a430f915e1d6bbbb96 100644 (file)
@@ -29,7 +29,9 @@
 #
 
 
-export org_size=$MINVDEVSIZE
-export exp_size=$((2*$org_size))
+export org_size=$((1024*1024*1024))
+export exp_size=$((2*1024*1024*1024))
+export org_size_mb=$((org_size/(1024*1024)))
 
-export VFS=$TESTPOOL/$TESTFS
+export FILE_LO=$TEST_BASE_DIR/vdev_lo
+export FILE_RAW=$TEST_BASE_DIR/vdev_raw
index 06ab1b84fd1cc48c6161b89b431ee48ff6faebc2..289e3e33fa4bf3c717d9eaf47d2b68c714f682b9 100755 (executable)
@@ -27,6 +27,7 @@
 
 #
 # Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
 #
 
 . $STF_SUITE/include/libtest.shlib
 #
 # DESCRIPTION:
 # Once zpool set autoexpand=on poolname, zpool can autoexpand by
-# Dynamic LUN Expansion
+# Dynamic VDEV Expansion
 #
 #
 # STRATEGY:
-# 1) Create a pool
-# 2) Create volume on top of the pool
-# 3) Create pool by using the zvols and set autoexpand=on
-# 4) Expand the vol size by 'zfs set volsize'
-# 5) Check that the pool size was expanded
+# 1) Create three vdevs (loopback, scsi_debug, and file)
+# 2) Create pool by using the different devices and set autoexpand=on
+# 3) Expand each device as appropriate
+# 4) Check that the pool size was expanded
+#
+# NOTE: Three different device types are used in this test to verify
+# expansion of non-partitioned block devices (loopback), partitioned
+# block devices (scsi_debug), and non-disk file vdevs.  ZFS volumes
+# are not used in order to avoid a possible lock inversion when
+# layering pools on zvols.
 #
 
 verify_runnable "global"
 
-# See issue: https://github.com/zfsonlinux/zfs/issues/5771
-if is_linux; then
-       log_unsupported "Requires autoexpand property support"
-fi
-
 function cleanup
 {
-       if poolexists $TESTPOOL1; then
-               log_must zpool destroy $TESTPOOL1
+       poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1
+
+       if losetup -a | grep -q $DEV1; then
+               losetup -d $DEV1
        fi
 
-       for i in 1 2 3; do
-               if datasetexists $VFS/vol$i; then
-                       log_must zfs destroy $VFS/vol$i
-               fi
-       done
+       rm -f $FILE_LO $FILE_RAW
+
+       block_device_wait
+       unload_scsi_debug
 }
 
 log_onexit cleanup
 
-log_assert "zpool can be autoexpanded after set autoexpand=on on LUN expansion"
-
-for i in 1 2 3; do
-       log_must zfs create -V $org_size $VFS/vol$i
-done
-block_device_wait
+log_assert "zpool can be autoexpanded after set autoexpand=on on vdev expansion"
 
 for type in " " mirror raidz raidz2; do
+       log_note "Setting up loopback, scsi_debug, and file vdevs"
+       log_must truncate -s $org_size $FILE_LO
+       DEV1=$(losetup -f)
+       log_must losetup $DEV1 $FILE_LO
+
+       load_scsi_debug $org_size_mb 1 1 1 '512b'
+       block_device_wait
+       DEV2=$(get_debug_device)
+
+       log_must truncate -s $org_size $FILE_RAW
+       DEV3=$FILE_RAW
 
-       log_must zpool create -o autoexpand=on $TESTPOOL1 $type \
-           ${ZVOL_DEVDIR}/$VFS/vol1  ${ZVOL_DEVDIR}/$VFS/vol2 \
-           ${ZVOL_DEVDIR}/$VFS/vol3
+       # The -f is required since we're mixing disk and file vdevs.
+       log_must zpool create -f -o autoexpand=on $TESTPOOL1 $type \
+           $DEV1 $DEV2 $DEV3
 
        typeset autoexp=$(get_pool_prop autoexpand $TESTPOOL1)
        if [[ $autoexp != "on" ]]; then
-               log_fail "zpool $TESTPOOL1 autoexpand should on but is $autoexp"
+               log_fail "zpool $TESTPOOL1 autoexpand should be on but is " \
+                   "$autoexp"
        fi
 
        typeset prev_size=$(get_pool_prop size $TESTPOOL1)
        typeset zfs_prev_size=$(zfs get -p avail $TESTPOOL1 | tail -1 | \
            awk '{print $3}')
 
-       for i in 1 2 3; do
-               log_must zfs set volsize=$exp_size $VFS/vol$i
-       done
+       # Expand each device as appropriate, being careful to add an artificial
+       # delay to ensure we get a single history entry for each.  This makes
+       # it easier to verify each expansion for the striped pool case, since
+       # they will not be merged into a single larger expansion.
+       log_note "Expanding loopback, scsi_debug, and file vdevs"
+       log_must truncate -s $exp_size $FILE_LO
+       log_must losetup -c $DEV1
+       sleep 3
 
-       sync
-       sleep 10
-       sync
+       echo "2" > /sys/bus/pseudo/drivers/scsi_debug/virtual_gb
+       echo "1" > /sys/class/block/$DEV2/device/rescan
+       block_device_wait
+       sleep 3
+
+       log_must truncate -s $exp_size $FILE_RAW
+       log_must zpool online -e $TESTPOOL1 $FILE_RAW
 
        typeset expand_size=$(get_pool_prop size $TESTPOOL1)
        typeset zfs_expand_size=$(zfs get -p avail $TESTPOOL1 | tail -1 | \
@@ -105,8 +123,8 @@ for type in " " mirror raidz raidz2; do
        log_note "$TESTPOOL1 $type has previous size: $prev_size and " \
            "expanded size: $expand_size"
        # compare available pool size from zfs
-       if [[ $zfs_expand_size > $zfs_prev_size ]]; then
-       # check for zpool history for the pool size expansion
+       if [[ $zfs_expand_size -gt $zfs_prev_size ]]; then
+               # check for zpool history for the pool size expansion
                if [[ $type == " " ]]; then
                        typeset expansion_size=$(($exp_size-$org_size))
                        typeset size_addition=$(zpool history -il $TESTPOOL1 |\
@@ -114,9 +132,9 @@ for type in " " mirror raidz raidz2; do
                            grep "vdev online" | \
                            grep "(+${expansion_size}" | wc -l)
 
-                       if [[ $size_addition -ne $i ]]; then
-                               log_fail "pool $TESTPOOL1 is not autoexpand " \
-                                   "after LUN expansion"
+                       if [[ $size_addition -ne 3 ]]; then
+                               log_fail "pool $TESTPOOL1 has not expanded, " \
+                                   "$size_addition/3 vdevs expanded"
                        fi
                elif [[ $type == "mirror" ]]; then
                        typeset expansion_size=$(($exp_size-$org_size))
@@ -126,8 +144,7 @@ for type in " " mirror raidz raidz2; do
                            grep "(+${expansion_size})" >/dev/null 2>&1
 
                        if [[ $? -ne 0 ]] ; then
-                               log_fail "pool $TESTPOOL1 is not autoexpand " \
-                                   "after LUN expansion"
+                               log_fail "pool $TESTPOOL1 has not expanded"
                        fi
                else
                        typeset expansion_size=$((3*($exp_size-$org_size)))
@@ -137,19 +154,16 @@ for type in " " mirror raidz raidz2; do
                            grep "(+${expansion_size})" >/dev/null 2>&1
 
                        if [[ $? -ne 0 ]]; then
-                               log_fail "pool $TESTPOOL is not autoexpand " \
-                                   "after LUN expansion"
+                               log_fail "pool $TESTPOOL has not expanded"
                        fi
                fi
        else
-               log_fail "pool $TESTPOOL1 is not autoexpanded after LUN " \
-                   "expansion"
+               log_fail "pool $TESTPOOL1 is not autoexpanded after vdev " \
+                   "expansion.  Previous size: $zfs_prev_size and expanded " \
+                   "size: $zfs_expand_size"
        fi
 
-       log_must zpool destroy $TESTPOOL1
-       for i in 1 2 3; do
-               log_must zfs set volsize=$org_size $VFS/vol$i
-       done
-
+       cleanup
 done
-log_pass "zpool can be autoexpanded after set autoexpand=on on LUN expansion"
+
+log_pass "zpool can autoexpand if autoexpand=on after vdev expansion"
index 66b6969db3dcaf866f2216f72ad770a5e920450a..a49d4fc1706887a51d7c35e057b05ca59cb3fa8b 100755 (executable)
@@ -36,7 +36,7 @@
 #
 # DESCRIPTION:
 # After zpool online -e poolname zvol vdevs, zpool can autoexpand by
-# Dynamic LUN Expansion
+# Dynamic VDEV Expansion
 #
 #
 # STRATEGY:
@@ -52,9 +52,7 @@ verify_runnable "global"
 
 function cleanup
 {
-        if poolexists $TESTPOOL1; then
-                log_must zpool destroy $TESTPOOL1
-        fi
+       poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1
 
        for i in 1 2 3; do
                [ -e ${TEMPFILE}.$i ] && log_must rm ${TEMPFILE}.$i
@@ -63,7 +61,7 @@ function cleanup
 
 log_onexit cleanup
 
-log_assert "zpool can expand after zpool online -e zvol vdevs on LUN expansion"
+log_assert "zpool can expand after zpool online -e zvol vdevs on vdev expansion"
 
 for type in " " mirror raidz raidz2; do
        # Initialize the file devices and the pool
@@ -77,7 +75,7 @@ for type in " " mirror raidz raidz2; do
        typeset autoexp=$(get_pool_prop autoexpand $TESTPOOL1)
 
        if [[ $autoexp != "off" ]]; then
-               log_fail "zpool $TESTPOOL1 autoexpand should off but is " \
+               log_fail "zpool $TESTPOOL1 autoexpand should be off but is " \
                    "$autoexp"
        fi
        typeset prev_size=$(get_pool_prop size $TESTPOOL1)
@@ -109,15 +107,15 @@ for type in " " mirror raidz raidz2; do
                    "expected $expected_zpool_expandsize"
        fi
 
-       # Online the devices to add the new space to the pool
+       # Online the devices to add the new space to the pool.  Add an
+       # artificial delay between online commands in order to prevent them
+       # from being merged into a single history entry.  This makes
+       # it easier to verify each expansion for the striped pool case.
        for i in 1 2 3; do
                log_must zpool online -e $TESTPOOL1 ${TEMPFILE}.$i
+               sleep 3
        done
 
-       sync
-       sleep 10
-       sync
-
        typeset expand_size=$(get_pool_prop size $TESTPOOL1)
        typeset zfs_expand_size=$(get_prop avail $TESTPOOL1)
        log_note "$TESTPOOL1 $type has previous size: $prev_size and " \
@@ -134,8 +132,9 @@ for type in " " mirror raidz raidz2; do
                            grep "(+${expansion_size}" | wc -l)
 
                        if [[ $size_addition -ne $i ]]; then
-                               log_fail "pool $TESTPOOL1 did not expand " \
-                                   "after LUN expansion and zpool online -e"
+                               log_fail "pool $TESTPOOL1 has not expanded " \
+                                   "after zpool online -e, " \
+                                   "$size_addition/3 vdevs expanded"
                        fi
                elif [[ $type == "mirror" ]]; then
                        typeset expansion_size=$(($exp_size-$org_size))
@@ -145,8 +144,8 @@ for type in " " mirror raidz raidz2; do
                            grep "(+${expansion_size})" >/dev/null 2>&1
 
                        if [[ $? -ne 0 ]]; then
-                               log_fail "pool $TESTPOOL1 did not expand " \
-                                   "after LUN expansion and zpool online -e"
+                               log_fail "pool $TESTPOOL1 has not expanded " \
+                                   "after zpool online -e"
                        fi
                else
                        typeset expansion_size=$((3*($exp_size-$org_size)))
@@ -156,14 +155,14 @@ for type in " " mirror raidz raidz2; do
                            grep "(+${expansion_size})" >/dev/null 2>&1
 
                        if [[ $? -ne 0 ]] ; then
-                               log_fail "pool $TESTPOOL1 did not expand " \
-                                   "after LUN expansion and zpool online -e"
+                               log_fail "pool $TESTPOOL1 has not expanded " \
+                                   "after zpool online -e"
                        fi
                fi
        else
-               log_fail "pool $TESTPOOL1 did not expand after LUN expansion " \
+               log_fail "pool $TESTPOOL1 did not expand after vdev expansion " \
                    "and zpool online -e"
        fi
        log_must zpool destroy $TESTPOOL1
 done
-log_pass "zpool can expand after zpool online -e zvol vdevs on LUN expansion"
+log_pass "zpool can expand after zpool online -e"
index 585dd050fd639abc79f8d796d019bbb0b0ee5b3e..323d0b907bd0ccacf1eee50376f34912f6803531 100755 (executable)
 
 #
 # Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
 #
 
+
 . $STF_SUITE/include/libtest.shlib
 . $STF_SUITE/tests/functional/cli_root/zpool_expand/zpool_expand.cfg
 
 #
 # Description:
 # Once set zpool autoexpand=off, zpool can *NOT* autoexpand by
-# Dynamic LUN Expansion
+# Dynamic VDEV Expansion
 #
 #
 # STRATEGY:
-# 1) Create a pool
-# 2) Create volumes on top of the pool
-# 3) Create pool by using the zvols and set autoexpand=off
-# 4) Expand the vol size by zfs set volsize
-# 5) Check that the pool size is not changed
+# 1) Create three vdevs (loopback, scsi_debug, and file)
+# 2) Create pool by using the different devices and set autoexpand=off
+# 3) Expand each device as appropriate
+# 4) Check that the pool size is not expanded
+#
+# NOTE: Three different device types are used in this test to verify
+# expansion of non-partitioned block devices (loopback), partitioned
+# block devices (scsi_debug), and non-disk file vdevs.  ZFS volumes
+# are not used in order to avoid a possible lock inversion when
+# layering pools on zvols.
 #
 
 verify_runnable "global"
 
-# See issue: https://github.com/zfsonlinux/zfs/issues/5771
-if is_linux; then
-       log_unsupported "Requires autoexpand property support"
-fi
-
 function cleanup
 {
-        if poolexists $TESTPOOL1; then
-                log_must zpool destroy $TESTPOOL1
-        fi
-
-       for i in 1 2 3; do
-               if datasetexists $VFS/vol$i; then
-                       log_must zfs destroy $VFS/vol$i
-               fi
-       done
+       poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1
+
+       if losetup -a | grep -q $DEV1; then
+               losetup -d $DEV1
+       fi
+
+       rm -f $FILE_LO $FILE_RAW
+
+       block_device_wait
+       unload_scsi_debug
 }
 
 log_onexit cleanup
 
-log_assert "zpool can not expand if set autoexpand=off after LUN expansion"
-
-for i  in 1 2 3; do
-       log_must zfs create -V $org_size $VFS/vol$i
-done
-block_device_wait
+log_assert "zpool can not expand if set autoexpand=off after vdev expansion"
 
 for type in " " mirror raidz raidz2; do
-       log_must zpool create $TESTPOOL1 $type ${ZVOL_DEVDIR}/$VFS/vol1 \
-           ${ZVOL_DEVDIR}/$VFS/vol2 ${ZVOL_DEVDIR}/$VFS/vol3
+       log_note "Setting up loopback, scsi_debug, and file vdevs"
+       log_must truncate -s $org_size $FILE_LO
+       DEV1=$(losetup -f)
+       log_must losetup $DEV1 $FILE_LO
+
+       load_scsi_debug $org_size_mb 1 1 1 '512b'
+       block_device_wait
+       DEV2=$(get_debug_device)
+
+       log_must truncate -s $org_size $FILE_RAW
+       DEV3=$FILE_RAW
+
+       # The -f is required since we're mixing disk and file vdevs.
+       log_must zpool create -f $TESTPOOL1 $type $DEV1 $DEV2 $DEV3
 
        typeset autoexp=$(get_pool_prop autoexpand $TESTPOOL1)
        if [[ $autoexp != "off" ]]; then
-               log_fail "zpool $TESTPOOL1 autoexpand should off but is " \
+               log_fail "zpool $TESTPOOL1 autoexpand should be off but is " \
                    "$autoexp"
        fi
 
        typeset prev_size=$(get_pool_prop size $TESTPOOL1)
 
-       for i in 1 2 3; do
-               log_must zfs set volsize=$exp_size $VFS/vol$i
-       done
 
-       sync
-       sleep 10
-       sync
+       # Expand each device as appropriate, being careful to add an artificial
+       # delay to ensure we get a single history entry for each.  This makes
+       # it easier to verify each expansion for the striped pool case, since
+       # they will not be merged into a single larger expansion.
+       log_note "Expanding loopback, scsi_debug, and file vdevs"
+       log_must truncate -s $exp_size $FILE_LO
+       log_must losetup -c $DEV1
+       sleep 3
+
+       echo "2" > /sys/bus/pseudo/drivers/scsi_debug/virtual_gb
+       echo "1" > /sys/class/block/$DEV2/device/rescan
+       block_device_wait
+       sleep 3
+
+       log_must truncate -s $exp_size $FILE_RAW
+
+       # This is far longer than we should need to wait, but let's be sure.
+       sleep 5
 
        # check for zpool history for the pool size expansion
        zpool history -il $TESTPOOL1 | grep "pool '$TESTPOOL1' size:" | \
            grep "vdev online" >/dev/null 2>&1
 
        if [[ $? -eq 0 ]]; then
-               log_fail "pool $TESTPOOL1 is not autoexpand after LUN " \
+               log_fail "pool $TESTPOOL1 is not autoexpand after vdev " \
                    "expansion"
        fi
 
        typeset expand_size=$(get_pool_prop size $TESTPOOL1)
 
        if [[ "$prev_size" != "$expand_size" ]]; then
-               log_fail "pool $TESTPOOL1 size changed after LUN expansion"
+               log_fail "pool $TESTPOOL1 size changed after vdev expansion"
        fi
 
-       log_must zpool destroy $TESTPOOL1
-
-       for i in 1 2 3; do
-               log_must zfs set volsize=$org_size $VFS/vol$i
-       done
-
+       cleanup
 done
 
-log_pass "zpool can not expand if set autoexpand=off after LUN expansion"
+log_pass "zpool can not autoexpand if autoexpand=off after vdev expansion"
index 69481ba1ac8fce9d6674f47a037d258cb96e7b9a..8a4db824bc9c1defa62074c07d31f95bf2329820 100755 (executable)
@@ -50,9 +50,7 @@ verify_runnable "global"
 
 function cleanup
 {
-        if poolexists $TESTPOOL1; then
-                log_must zpool destroy $TESTPOOL1
-        fi
+       poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1
 
        for i in 1 2 3; do
                [ -e ${TEMPFILE}.$i ] && log_must rm ${TEMPFILE}.$i
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh
new file mode 100755 (executable)
index 0000000..54ec73b
--- /dev/null
@@ -0,0 +1,99 @@
+#! /bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/include/blkdev.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_expand/zpool_expand.cfg
+
+#
+# DESCRIPTION:
+#
+# STRATEGY:
+# 1) Create a scsi_debug device and a pool based on it
+# 2) Expand the device and rescan the scsi bus
+# 3) Reopen the pool and check that it detects new available space
+# 4) Online the device and check that the pool has been expanded
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+       poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1
+       unload_scsi_debug
+}
+
+log_onexit cleanup
+
+log_assert "zpool based on scsi device can be expanded with zpool online -e"
+
+# run scsi_debug to create a device
+MINVDEVSIZE_MB=$((MINVDEVSIZE / 1048576))
+load_scsi_debug $MINVDEVSIZE_MB 1 1 1 '512b'
+block_device_wait
+SDISK=$(get_debug_device)
+log_must zpool create $TESTPOOL1 $SDISK
+
+typeset autoexp=$(get_pool_prop autoexpand $TESTPOOL1)
+if [[ $autoexp != "off" ]]; then
+       log_fail "zpool $TESTPOOL1 autoexpand should be off but is $autoexp"
+fi
+
+typeset prev_size=$(get_pool_prop size $TESTPOOL1)
+log_note "original pool size: $prev_size"
+
+# resize the scsi_debug device
+echo "5" > /sys/bus/pseudo/drivers/scsi_debug/virtual_gb
+# rescan the device to detect the new size
+echo "1" > /sys/class/block/$SDISK/device/rescan
+block_device_wait
+
+# reopen the pool so ZFS can see the new space
+log_must zpool reopen $TESTPOOL1
+
+typeset expandsize=$(get_pool_prop expandsize $TESTPOOL1)
+log_note "pool expandsize: $expandsize"
+if [[ "$expandsize" = "-" ]]; then # "-": no expansion space was detected
+       log_fail "pool $TESTPOOL1 did not detect any " \
+           "expandsize after reopen"
+fi
+
+# online the device so the zpool will use the new space
+log_must zpool online -e $TESTPOOL1 $SDISK
+
+typeset new_size=$(get_pool_prop size $TESTPOOL1)
+log_note "new pool size: $new_size"
+if [[ $new_size -le $prev_size ]]; then
+       log_fail "pool $TESTPOOL1 did not expand " \
+           "after vdev expansion and zpool online -e"
+fi
+
+log_pass "zpool based on scsi_debug can be expanded with reopen and online -e"
index f4686c04e2e3fb1593fed8c4823aef403f7359b4..01ad68c817f261fbfd28f9e51dbbe42686226702 100644 (file)
@@ -7,7 +7,8 @@ dist_pkgdata_SCRIPTS = \
        zpool_reopen_003_pos.ksh \
        zpool_reopen_004_pos.ksh \
        zpool_reopen_005_pos.ksh \
-       zpool_reopen_006_neg.ksh
+       zpool_reopen_006_neg.ksh \
+       zpool_reopen_007_pos.ksh
 
 dist_pkgdata_DATA = \
        zpool_reopen.cfg \
index 99c51351c5c8c15a847b7443ac02f76f4e6d9172..a9fcef790586d2771dc5130ac487675f24ac0a31 100755 (executable)
@@ -25,7 +25,7 @@ cleanup_devices $DISKS
 # Unplug the disk and remove scsi_debug module
 if is_linux; then
        for SDDEVICE in $(get_debug_device); do
-               unplug $SDDEVICE
+               remove_disk $SDDEVICE
        done
        unload_scsi_debug
 fi
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_007_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_007_pos.ksh
new file mode 100755 (executable)
index 0000000..4ba56af
--- /dev/null
@@ -0,0 +1,67 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/tests/functional/cli_root/zpool_reopen/zpool_reopen.shlib
+
+#
+# DESCRIPTION:
+# Test zpool reopen while performing IO to the pool.
+# Verify that no IO errors of any kind are reported.
+#
+# STRATEGY:
+# 1. Create a non-redundant pool.
+# 2. Repeat:
+#   a. Write files to the pool.
+#   b. Execute 'zpool reopen'.
+# 3. Verify that no errors are reported by 'zpool status'.
+
+verify_runnable "global"
+
+function cleanup
+{
+       if poolexists $TESTPOOL; then destroy_pool $TESTPOOL; fi  # exit handler
+}
+
+log_assert "Testing zpool reopen with concurrent user IO"
+log_onexit cleanup
+
+set_removed_disk
+scsi_host=$(get_scsi_host $REMOVED_DISK)  # NOTE(review): not used again below
+
+# 1. Create a non-redundant pool.
+log_must zpool create $TESTPOOL $DISK1 $DISK2 $DISK3
+
+for i in $(seq 10); do
+       # 2a. Write files in the background to the pool.
+       mkfile 64m /$TESTPOOL/data.$i &
+
+       # 2b. Execute 'zpool reopen'.
+       log_must zpool reopen $TESTPOOL
+
+       for disk in $DISK1 $DISK2 $DISK3; do
+               zpool status -P -v $TESTPOOL | grep $disk | \
+                   read -r name state rd wr cksum
+               log_must [ $state = "ONLINE" ]
+               log_must [ $rd -eq 0 ]
+               log_must [ $wr -eq 0 ]
+               log_must [ $cksum -eq 0 ]
+       done
+done
+
+wait  # for the background mkfile jobs started in the loop
+
+log_pass "Zpool reopen with concurrent user IO successful"