]> granicus.if.org Git - zfs/commitdiff
Multipath autoreplace, control enclosure LEDs, event rate limiting
authorTony Hutter <hutter2@llnl.gov>
Wed, 19 Oct 2016 19:55:59 +0000 (12:55 -0700)
committerBrian Behlendorf <behlendorf1@llnl.gov>
Wed, 19 Oct 2016 19:55:59 +0000 (12:55 -0700)
1. Enable multipath autoreplace support for FMA.

This extends FMA autoreplace to work with multipath disks.  This
requires libdevmapper to be installed at build time.

2. Turn on/off fault LEDs when VDEVs become degraded/faulted/online

Set ZED_USE_ENCLOSURE_LEDS=1 in zed.rc to have ZED turn on/off the enclosure
LED for a drive when a drive becomes FAULTED/DEGRADED.  Your enclosure must
be supported by the Linux SES driver for this to work.  The enclosure LED
scripts work for multipath devices as well.  The scripts will clear the LED
when the fault is cleared.

3. Rate limit ZIO delay and checksum events so as not to flood ZED

ZIO delay and checksum events are rate limited to 5/sec in the zfs module.

Reviewed-by: Richard Laager <rlaager@wiktel.com>
Reviewed-by: Don Brady <don.brady@intel.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #2449
Closes #3017
Closes #5159

24 files changed:
cmd/zed/Makefile.am
cmd/zed/agents/zfs_mod.c
cmd/zed/zed.d/statechange-led.sh [new file with mode: 0755]
cmd/zed/zed.d/vdev_clear-led.sh [new symlink]
cmd/zed/zed.d/zed.rc
cmd/zed/zed_disk_event.c
cmd/zed/zed_event.c
config/user-libdevmapper.m4 [new file with mode: 0644]
config/user.m4
configure.ac
include/libzfs.h
include/sys/Makefile.am
include/sys/fm/protocol.h
include/sys/vdev_impl.h
include/sys/zfs_ratelimit.h [new file with mode: 0644]
lib/libzfs/Makefile.am
lib/libzfs/libzfs_import.c
lib/libzfs/libzfs_pool.c
man/man8/zpool.8
module/zcommon/zfs_comutil.c
module/zfs/fm.c
module/zfs/vdev.c
module/zfs/zfs_fm.c
rpm/generic/zfs.spec.in

index 086d75d3665d658a426e8bdb26a16e4bcfd97f69..d35dfc4283f9825933c099c1094ec238c7e72811 100644 (file)
@@ -60,7 +60,9 @@ dist_zedexec_SCRIPTS = \
        zed.d/io-notify.sh \
        zed.d/io-spare.sh \
        zed.d/resilver_finish-notify.sh \
-       zed.d/scrub_finish-notify.sh
+       zed.d/scrub_finish-notify.sh \
+       zed.d/statechange-led.sh \
+       zed.d/vdev_clear-led.sh
 
 zedconfdefaults = \
        all-syslog.sh \
@@ -70,7 +72,9 @@ zedconfdefaults = \
        io-notify.sh \
        io-spare.sh \
        resilver_finish-notify.sh \
-       scrub_finish-notify.sh
+       scrub_finish-notify.sh \
+       statechange-led.sh \
+       vdev_clear-led.sh
 
 install-data-hook:
        $(MKDIR_P) "$(DESTDIR)$(zedconfdir)"
index c8326f21f224808be011b19c5e5c527ce43616f4..f7740ad2a01bc694a869b0553613ff3a97c5977c 100644 (file)
@@ -189,10 +189,22 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
        char rawpath[PATH_MAX], fullpath[PATH_MAX];
        char devpath[PATH_MAX];
        int ret;
+       int is_dm = 0;
+       uint_t c;
+       vdev_stat_t *vs;
 
        if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
                return;
 
+       /* Skip healthy disks */
+       verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS,
+           (uint64_t **)&vs, &c) == 0);
+       if (vs->vs_state == VDEV_STATE_HEALTHY) {
+               zed_log_msg(LOG_INFO, "%s: %s is already healthy, skip it.",
+                   __func__, path);
+               return;
+       }
+
        (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath);
        (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
        (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline);
@@ -201,8 +213,13 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
        if (offline)
                return;  /* don't intervene if it was taken offline */
 
-       zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s' (%llu)",
-           zpool_get_name(zhp), path, (long long unsigned int)guid);
+#ifdef HAVE_LIBDEVMAPPER
+       is_dm = dev_is_dm(path);
+#endif
+       zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s', phys '%s'"
+           " wholedisk %d, dm %d (%llu)", zpool_get_name(zhp), path,
+           physpath ? physpath : "NULL", wholedisk, is_dm,
+           (long long unsigned int)guid);
 
        /*
         * The VDEV guid is preferred for identification (gets passed in path)
@@ -216,7 +233,12 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
                 */
                (void) strlcpy(fullpath, path, sizeof (fullpath));
                if (wholedisk) {
-                       char *spath = zfs_strip_partition(g_zfshdl, fullpath);
+                       char *spath = zfs_strip_partition(fullpath);
+                       if (!spath) {
+                               zed_log_msg(LOG_INFO, "%s: Can't alloc",
+                                   __func__);
+                               return;
+                       }
 
                        (void) strlcpy(fullpath, spath, sizeof (fullpath));
                        free(spath);
@@ -241,8 +263,8 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
         * a true online (without the unspare flag), which will trigger a FMA
         * fault.
         */
-       if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
-           !wholedisk || physpath == NULL) {
+       if (!is_dm && (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
+           !wholedisk || physpath == NULL)) {
                (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
                    &newstate);
                zed_log_msg(LOG_INFO, "  zpool_vdev_online: %s FORCEFAULT (%s)",
@@ -255,7 +277,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
         */
        (void) snprintf(rawpath, sizeof (rawpath), "%s%s", DEV_BYPATH_PATH,
            physpath);
-       if (realpath(rawpath, devpath) == NULL) {
+       if (realpath(rawpath, devpath) == NULL && !is_dm) {
                zed_log_msg(LOG_INFO, "  realpath: %s failed (%s)",
                    rawpath, strerror(errno));
 
@@ -267,10 +289,27 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
                return;
        }
 
-       /*
-        * we're auto-replacing a raw disk, so label it first
-        */
-       if (!labeled) {
+       if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL)) {
+               zed_log_msg(LOG_INFO, "%s: Autoreplace is not enabled on this"
+                   " pool, ignore disk.", __func__);
+               return;
+       }
+
+       /* Only autoreplace bad disks */
+       if ((vs->vs_state != VDEV_STATE_DEGRADED) &&
+           (vs->vs_state != VDEV_STATE_FAULTED) &&
+           (vs->vs_state != VDEV_STATE_CANT_OPEN)) {
+               return;
+       }
+
+       nvlist_lookup_string(vdev, "new_devid", &new_devid);
+
+       if (is_dm) {
+               /* Don't label device mapper or multipath disks. */
+       } else if (!labeled) {
+               /*
+                * we're auto-replacing a raw disk, so label it first
+                */
                char *leafname;
 
                /*
@@ -311,7 +350,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
                list_insert_tail(&g_device_list, device);
 
                zed_log_msg(LOG_INFO, "  zpool_label_disk: async '%s' (%llu)",
-                   leafname, (long long unsigned int)guid);
+                   leafname, (u_longlong_t) guid);
 
                return; /* resumes at EC_DEV_ADD.ESC_DISK for partition */
 
@@ -337,16 +376,10 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
                }
 
                zed_log_msg(LOG_INFO, "  zpool_label_disk: resume '%s' (%llu)",
-                   physpath, (long long unsigned int)guid);
-
-               if (nvlist_lookup_string(vdev, "new_devid", &new_devid) != 0) {
-                       zed_log_msg(LOG_INFO, "  auto replace: missing devid!");
-                       return;
-               }
+                   physpath, (u_longlong_t) guid);
 
                (void) snprintf(devpath, sizeof (devpath), "%s%s",
                    DEV_BYID_PATH, new_devid);
-               path = devpath;
        }
 
        /*
@@ -411,7 +444,7 @@ static void
 zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
 {
        dev_data_t *dp = data;
-       char *path;
+       char *path = NULL;
        uint_t c, children;
        nvlist_t **child;
 
@@ -450,15 +483,15 @@ zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
                 * the dp->dd_compare value.
                 */
                if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 ||
-                   strcmp(dp->dd_compare, path) != 0) {
+                   strcmp(dp->dd_compare, path) != 0)
                        return;
-               }
+
                zed_log_msg(LOG_INFO, "  zfs_iter_vdev: matched %s on %s",
                    dp->dd_prop, path);
                dp->dd_found = B_TRUE;
 
                /* pass the new devid for use by replacing code */
-               if (dp->dd_islabeled && dp->dd_new_devid != NULL) {
+               if (dp->dd_new_devid != NULL) {
                        (void) nvlist_add_string(nvl, "new_devid",
                            dp->dd_new_devid);
                }
@@ -608,11 +641,11 @@ zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi)
 
        (void) nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath);
 
-       zed_log_msg(LOG_INFO, "zfs_deliver_add: adding %s (%s)", devid,
-           devpath ? devpath : "NULL");
-
        is_slice = (nvlist_lookup_boolean(nvl, DEV_IS_PART) == 0);
 
+       zed_log_msg(LOG_INFO, "zfs_deliver_add: adding %s (%s) (is_slice %d)",
+           devid, devpath ? devpath : "NULL", is_slice);
+
        /*
         * Iterate over all vdevs looking for a match in the folllowing order:
         * 1. ZPOOL_CONFIG_DEVID (identifies the unique disk)
@@ -681,7 +714,12 @@ zfsdle_vdev_online(zpool_handle_t *zhp, void *data)
 
                (void) strlcpy(fullpath, path, sizeof (fullpath));
                if (wholedisk) {
-                       char *spath = zfs_strip_partition(g_zfshdl, fullpath);
+                       char *spath = zfs_strip_partition(fullpath);
+                       if (!spath) {
+                               zed_log_msg(LOG_INFO, "%s: Can't alloc",
+                                   __func__);
+                               return (0);
+                       }
 
                        (void) strlcpy(fullpath, spath, sizeof (fullpath));
                        free(spath);
diff --git a/cmd/zed/zed.d/statechange-led.sh b/cmd/zed/zed.d/statechange-led.sh
new file mode 100755 (executable)
index 0000000..ca911d2
--- /dev/null
@@ -0,0 +1,88 @@
+#!/bin/bash
+#
+# Turn off/on the VDEV's enclosure fault LEDs when the pool's state changes.
+#
+# Turn LED on if the VDEV becomes faulted/degraded, and turn it back off when
+# it's healthy again.  This requires that your enclosure be supported by the
+# Linux SCSI enclosure services (ses) driver.  The script will do nothing
+# if you have no enclosure, or if your enclosure isn't supported.
+#
+# This script also requires ZFS to be built with libdevmapper support.
+#
+# Exit codes:
+#   0: enclosure led successfully set
+#   1: enclosure leds not available
+#   2: enclosure leds administratively disabled
+#   3: ZED built without libdevmapper
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+# ZEVENT_VDEV_UPATH will not be present if ZFS is not built with libdevmapper
+[ -n "${ZEVENT_VDEV_UPATH}" ] || exit 3
+
+if [ "${ZED_USE_ENCLOSURE_LEDS}" != "1" ] ; then
+       exit 2
+fi
+
+if [ ! -d /sys/class/enclosure ] ; then
+       exit 1
+fi
+
+# Turn on/off enclosure LEDs
+function led
+{
+       name=$1
+       val=$2
+
+       # We want to check the current state first, since writing to the
+       # 'fault' entry always always causes a SES command, even if the
+       # current state is already what you want.
+       if [ -e /sys/block/$name/device/enclosure_device*/fault ] ; then
+               # We have to do some monkey business to deal with spaces in
+               # enclosure_device names.  I've seen horrible things like this: 
+               #
+               # '/sys/block/sdfw/device/enclosure_device:SLOT 43 41  /fault'
+               #
+               # ...so escape all spaces.
+               file=`ls /sys/block/$name/device/enclosure_device*/fault | sed 's/\s/\\ /g'`
+
+               current=`cat "$file"`
+
+               # On some enclosures if you write 1 to fault, and read it back,
+               # it will return 2.  Treat all non-zero values as 1 for
+               # simplicity.
+               if [ "$current" != "0" ] ; then
+                       current=1
+               fi
+
+               if [ "$current" != "$val" ] ; then
+                       # Set the value twice.  I've seen enclosures that were
+                       # flakey about setting it the first time.
+                       echo $val > "$file"
+                       echo $val > "$file"
+               fi
+       fi
+}
+
+# Decide whether to turn on/off an LED based on the state
+# Pass in path name and fault string ("ONLINE"/"FAULTED"/"DEGRADED"...etc)
+function process {
+       # path=/dev/sda, fault=
+
+       path=$1
+       fault=$2
+       name=`basename $path`
+
+       if [ -z "$name" ] ; then
+               return
+       fi
+
+       if [ "$fault" == "FAULTED" ] || [ "$fault" == "DEGRADED" ] ; then
+               led $name 1
+       else
+               led $name 0
+       fi
+}
+
+process "$ZEVENT_VDEV_UPATH" "$ZEVENT_VDEV_STATE_STR"
diff --git a/cmd/zed/zed.d/vdev_clear-led.sh b/cmd/zed/zed.d/vdev_clear-led.sh
new file mode 120000 (symlink)
index 0000000..7d74043
--- /dev/null
@@ -0,0 +1 @@
+statechange-led.sh
\ No newline at end of file
index f80fa33385adbc05db0c028ed58114d7ad5aab79..2dce048286657ed29c492f262a59104ac8750263 100644 (file)
 #
 #ZED_SPARE_ON_IO_ERRORS=1
 
+##
+# Turn on/off enclosure LEDs when drives get DEGRADED/FAULTED.  This works for
+# device mapper and multipath devices as well.  Your enclosure must be
+# supported by the Linux SES driver for this to work.
+#
+ZED_USE_ENCLOSURE_LEDS=1
+
+
 ##
 # The syslog priority (e.g., specified as a "facility.level" pair).
 #
index 0360bb584c8a0612591cd9d49db5cafd35a6f8c7..691024181795766e978e8f337d1c81df03d4bd0a 100644 (file)
@@ -159,6 +159,7 @@ static void *
 zed_udev_monitor(void *arg)
 {
        struct udev_monitor *mon = arg;
+       char *tmp, *tmp2;
 
        zed_log_msg(LOG_INFO, "Waiting for new uduev disk events...");
 
@@ -284,9 +285,26 @@ zed_udev_monitor(void *arg)
                if (strcmp(class, EC_DEV_STATUS) == 0 &&
                    udev_device_get_property_value(dev, "DM_UUID") &&
                    udev_device_get_property_value(dev, "MPATH_SBIN_PATH")) {
-                       /* Fake a MP "change" event to look like a "create" */
-                       class = EC_DEV_ADD;
-                       subclass = ESC_DISK;
+                       tmp = (char *) udev_device_get_devnode(dev);
+                       tmp2 = get_underlying_path(NULL, tmp);
+                       if (tmp && tmp2 && (strcmp(tmp, tmp2) != 0)) {
+                               /*
+                                * We have a real underlying device, which
+                                * means that this multipath "change" event is
+                                * an "add" event.
+                                *
+                                * If the multipath device and the underlying
+                                * dev are the same name (i.e. /dev/dm-5), then
+                                * there is no real underlying disk for this
+                                * multipath device, and so this "change" event
+                                * is really a multipath removal.
+                                */
+                               class = EC_DEV_ADD;
+                               subclass = ESC_DISK;
+                       } else {
+                               /* multipath remove, ignore it. */
+                       }
+                       free(tmp2);
                }
 
                if ((nvl = dev_event_nvlist(dev)) != NULL) {
index 0e5c6793d98ef3307b49f6f66aa4d39faafb7748..51f4f99c510f2a285f422aa1b4e123eca28443d9 100644 (file)
@@ -843,6 +843,23 @@ _zed_internal_event(const char *class, nvlist_t *nvl)
        }
 }
 
+/*
+ * If the event payload carries a VDEV path, resolve it with
+ * get_underlying_path() and export the result to the zedlets as the
+ * VDEV_UPATH variable (with the ZEVENT_VAR_PREFIX prefix).  Nothing is
+ * added when the payload has no VDEV path or the path cannot be resolved.
+ */
+static void
+_zed_event_add_upath(uint64_t eid, zed_strings_t *zsp, nvlist_t *nvl)
+{
+       char *vdev_path = NULL;
+       char *underlying;
+
+       if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
+           &vdev_path) != 0)
+               return;
+
+       /* get_underlying_path() hands back an allocated string (or NULL) */
+       underlying = get_underlying_path(NULL, vdev_path);
+       if (underlying == NULL)
+               return;
+
+       _zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "VDEV_UPATH", "%s",
+           underlying);
+       free(underlying);
+}
+
 /*
  * Service the next zevent, blocking until one is available.
  */
@@ -912,8 +929,19 @@ zed_event_service(struct zed_conf *zcp)
                subclass = _zed_event_get_subclass(class);
                _zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "SUBCLASS",
                    "%s", (subclass ? subclass : class));
+
                _zed_event_add_time_strings(eid, zsp, etime);
 
+               /*
+                * If a VDEV is included, resolve its path to the "underlying
+                * device".  This is useful for resolving device mapper and
+                * multipath devices to their underlying /dev/sd* devices.
+                * For example, if you have a DM or multipath VDEV
+                * (/dev/mapper/mpatha) that points to one or more /dev/sd*
+                * devices, this will return the first of its devices.
+                */
+               _zed_event_add_upath(eid, zsp, nvl);
+
                zed_exec_process(eid, class, subclass,
                    zcp->zedlet_dir, zcp->zedlets, zsp, zcp->zevent_fd);
 
diff --git a/config/user-libdevmapper.m4 b/config/user-libdevmapper.m4
new file mode 100644 (file)
index 0000000..af4dd41
--- /dev/null
@@ -0,0 +1,15 @@
+dnl #
+dnl # Check for libdevmapper.  libdevmapper is optional for building, but
+dnl # required for auto-online/auto-replace functionality for DM/multipath
+dnl # disks.
+dnl #
+dnl # When libdevmapper.h is found this substitutes LIBDEVMAPPER with
+dnl # "-ldevmapper", defines HAVE_LIBDEVMAPPER, and sets the shell variable
+dnl # user_libdevmapper to "yes"; otherwise user_libdevmapper is set to "no".
+dnl #
+AC_DEFUN([ZFS_AC_CONFIG_USER_LIBDEVMAPPER], [
+        AC_CHECK_HEADER([libdevmapper.h], [
+            AC_SUBST([LIBDEVMAPPER], ["-ldevmapper"])
+            AC_DEFINE([HAVE_LIBDEVMAPPER], 1, [Define if you have libdevmapper])
+
+           user_libdevmapper=yes
+        ], [
+           user_libdevmapper=no
+       ])
+])
index f70ab635fa5b464bc749e7b6d2b55f7ebbb26872..1d20642b81485153742c51dbfb31cc9e01fec143 100644 (file)
@@ -12,6 +12,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [
        ZFS_AC_CONFIG_USER_LIBTIRPC
        ZFS_AC_CONFIG_USER_LIBBLKID
        ZFS_AC_CONFIG_USER_LIBATTR
+       ZFS_AC_CONFIG_USER_LIBDEVMAPPER
        ZFS_AC_CONFIG_USER_LIBUDEV
        ZFS_AC_CONFIG_USER_FRAME_LARGER_THAN
        ZFS_AC_CONFIG_USER_RUNSTATEDIR
index f01a4d8728033e963b40eed18a97e73566db7742..e9f50f78369275580ad67fe97296ef3367465147 100644 (file)
@@ -301,4 +301,9 @@ AC_CONFIG_FILES([
        zfs.release
 ])
 
+
 AC_OUTPUT
+
+AS_IF([test "x$user_libdevmapper" != xyes ], [
+    AC_MSG_WARN([Building without libdevmapper.  Auto-replace, auto-online, and statechange-led.sh may not work correctly with device mapper vdevs.])
+])
index fe183a43ca38681c090ce3c99eadab9f0451fb3b..089cb8bc488b0f88b7fc9d6679416281d8e6830a 100644 (file)
@@ -280,6 +280,9 @@ extern nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *,
 extern int zpool_label_disk_wait(char *, int);
 extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, char *);
 
+int dev_is_dm(char *devname);
+char *get_underlying_path(libzfs_handle_t *hdl, char *dev_name);
+
 /*
  * Functions to manage pool properties
  */
@@ -827,10 +830,12 @@ extern int zpool_fru_set(zpool_handle_t *, uint64_t, const char *);
  */
 extern boolean_t is_mpath_whole_disk(const char *);
 extern void update_vdev_config_dev_strs(nvlist_t *);
-extern char *zfs_strip_partition(libzfs_handle_t *, char *);
+extern char *zfs_strip_partition(char *);
 
 #ifdef HAVE_LIBUDEV
 struct udev_device;
+
+extern boolean_t udev_is_mpath(struct udev_device *dev);
 extern int zfs_device_get_devid(struct udev_device *, char *, size_t);
 extern int zfs_device_get_physical(struct udev_device *, char *, size_t);
 #endif
index 96d77c7b30779449fddaef077e3aec27fd708fc5..37df6e1d2ef09a7d4d7b9298738b8d7fa49c6c9f 100644 (file)
@@ -96,6 +96,7 @@ COMMON_H = \
        $(top_srcdir)/include/sys/zfs_delay.h \
        $(top_srcdir)/include/sys/zfs_dir.h \
        $(top_srcdir)/include/sys/zfs_fuid.h \
+       $(top_srcdir)/include/sys/zfs_ratelimit.h \
        $(top_srcdir)/include/sys/zfs_rlock.h \
        $(top_srcdir)/include/sys/zfs_sa.h \
        $(top_srcdir)/include/sys/zfs_stat.h \
index 33fccdf6782efc684d5abe3c4687c394a97ab837..74aef3a92270e06356aaefd730f8eefc7e13efaf 100644 (file)
@@ -361,6 +361,7 @@ extern uint64_t fm_ena_generation_get(uint64_t);
 extern uchar_t fm_ena_format_get(uint64_t);
 extern uint64_t fm_ena_id_get(uint64_t);
 extern uint64_t fm_ena_time_get(uint64_t);
+extern void fm_erpt_dropped_increment(void);
 
 #ifdef __cplusplus
 }
index 47e70090a568fa703d212e1817e95cb582ec1474..bdf8498fa14f2b84dc2491ac593af0e85df76b51 100644 (file)
@@ -34,6 +34,7 @@
 #include <sys/vdev.h>
 #include <sys/dkio.h>
 #include <sys/uberblock_impl.h>
+#include <sys/zfs_ratelimit.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -243,6 +244,15 @@ struct vdev {
        kmutex_t        vdev_dtl_lock;  /* vdev_dtl_{map,resilver}      */
        kmutex_t        vdev_stat_lock; /* vdev_stat                    */
        kmutex_t        vdev_probe_lock; /* protects vdev_probe_zio     */
+
+       /*
+        * We rate limit ZIO delay and ZIO checksum events, since they
+        * can flood ZED with tons of events when a drive is acting up.
+        */
+#define        DELAYS_PER_SECOND 5
+#define        CHECKSUMS_PER_SECOND 5
+       zfs_ratelimit_t vdev_delay_rl;
+       zfs_ratelimit_t vdev_checksum_rl;
 };
 
 #define        VDEV_RAIDZ_MAXPARITY    3
diff --git a/include/sys/zfs_ratelimit.h b/include/sys/zfs_ratelimit.h
new file mode 100644 (file)
index 0000000..b9f9f73
--- /dev/null
@@ -0,0 +1,37 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, Lawrence Livermore National Security, LLC.
+ */
+
+#ifndef _SYS_ZFS_RATELIMIT_H
+#define        _SYS_ZFS_RATELIMIT_H
+
+#include <sys/zfs_context.h>
+
+/*
+ * State for one rate limiter, set up with zfs_ratelimit_init() and
+ * consulted with zfs_ratelimit().
+ */
+typedef struct {
+       /* NOTE(review): presumably when the current interval began - confirm */
+       hrtime_t start;
+       /* NOTE(review): presumably events seen this interval - confirm */
+       unsigned int count;
+       unsigned int burst;             /* Number to allow per interval */
+       unsigned int interval;          /* Interval length in seconds */
+       /* NOTE(review): presumably guards the fields above - confirm */
+       kmutex_t lock;
+} zfs_ratelimit_t;
+
+int zfs_ratelimit(zfs_ratelimit_t *rl);
+void zfs_ratelimit_init(zfs_ratelimit_t *rl, unsigned int burst,
+    unsigned int interval);
+
+#endif /* _SYS_ZFS_RATELIMIT_H */
index f1260ea7195dd60903ababd537a2c2ec0f0a5c55..8e596b0014a1ea556c8f88f22f57b6dd352f9e9e 100644 (file)
@@ -35,7 +35,7 @@ libzfs_la_LIBADD = \
        $(top_builddir)/lib/libnvpair/libnvpair.la \
        $(top_builddir)/lib/libzpool/libzpool.la
 
-libzfs_la_LIBADD += -lm $(LIBBLKID) $(LIBUDEV)
+libzfs_la_LIBADD += -lm $(LIBBLKID) $(LIBUDEV) $(LIBDEVMAPPER)
 libzfs_la_LDFLAGS = -version-info 2:0:0
 
 EXTRA_DIST = $(libzfs_pc_DATA) $(USER_C)
index edd4e5d585986b2ff88a43136924cc6ad5967590..e76f7432ae01b98e969f7e8a17fb8e36d62b51a2 100644 (file)
@@ -172,22 +172,34 @@ zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen)
 int
 zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
 {
-       const char *physpath, *value;
+       const char *physpath = NULL;
 
        /*
-        * Skip indirect multipath device nodes
+        * Normal disks use ID_PATH for their physical path.  Device mapper
+        * devices are virtual and don't have a physical path.  For them we
+        * use ID_VDEV instead, which is setup via the /etc/vdev_id.conf file.
+        * ID_VDEV provides a persistent path to a virtual device.  If you
+        * don't have vdev_id.conf setup, you cannot use multipath autoreplace.
         */
-       value = udev_device_get_property_value(dev, "DM_MULTIPATH_DEVICE_PATH");
-       if (value != NULL && strcmp(value, "1") == 0)
-               return (ENODATA);  /* skip physical for multipath nodes */
-
-       physpath = udev_device_get_property_value(dev, "ID_PATH");
-       if (physpath != NULL && physpath[0] != '\0') {
-               (void) strlcpy(bufptr, physpath, buflen);
-               return (0);
+       if (!((physpath = udev_device_get_property_value(dev, "ID_PATH")) &&
+           physpath[0])) {
+               if (!((physpath =
+                   udev_device_get_property_value(dev, "ID_VDEV")) &&
+                   physpath[0])) {
+                       return (ENODATA);
+               }
        }
 
-       return (ENODATA);
+       (void) strlcpy(bufptr, physpath, buflen);
+
+       return (0);
+}
+
+/*
+ * Return B_TRUE when the udev device carries both the DM_UUID and the
+ * MPATH_SBIN_PATH properties, B_FALSE otherwise.
+ */
+boolean_t
+udev_is_mpath(struct udev_device *dev)
+{
+       if (udev_device_get_property_value(dev, "DM_UUID") != NULL &&
+           udev_device_get_property_value(dev, "MPATH_SBIN_PATH") != NULL)
+               return (B_TRUE);
+
+       return (B_FALSE);
+}
 
 /*
@@ -200,15 +212,13 @@ zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
 static boolean_t
 udev_mpath_whole_disk(struct udev_device *dev)
 {
-       const char *devname, *mapname, *type, *uuid;
+       const char *devname, *type, *uuid;
 
        devname = udev_device_get_property_value(dev, "DEVNAME");
-       mapname = udev_device_get_property_value(dev, "DM_NAME");
        type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE");
        uuid = udev_device_get_property_value(dev, "DM_UUID");
 
        if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) &&
-           (mapname != NULL && strncmp(mapname, "mpath", 5) == 0) &&
            ((type == NULL) || (strcmp(type, "gpt") != 0)) &&
            (uuid != NULL)) {
                return (B_TRUE);
index 68cc5f3ee0023ac422543f2b4d2710eacad1596d..ebca768342684e66baa386e35b453d9fc8833e9b 100644 (file)
@@ -41,6 +41,9 @@
 #include <sys/vtoc.h>
 #include <sys/zfs_ioctl.h>
 #include <dlfcn.h>
+#if HAVE_LIBDEVMAPPER
+#include <libdevmapper.h>
+#endif
 
 #include "zfs_namecheck.h"
 #include "zfs_prop.h"
@@ -3401,10 +3404,12 @@ set_path(zpool_handle_t *zhp, nvlist_t *nv, const char *path)
  * caller must free the returned string
  */
 char *
-zfs_strip_partition(libzfs_handle_t *hdl, char *path)
+zfs_strip_partition(char *path)
 {
-       char *tmp = zfs_strdup(hdl, path);
+       char *tmp = strdup(path);
        char *part = NULL, *d = NULL;
+       if (!tmp)
+               return (NULL);
 
        if ((part = strstr(tmp, "-part")) && part != tmp) {
                d = part + 5;
@@ -3422,6 +3427,7 @@ zfs_strip_partition(libzfs_handle_t *hdl, char *path)
                if (*d == '\0')
                        *part = '\0';
        }
+
        return (tmp);
 }
 
@@ -3544,7 +3550,7 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
                 */
                if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value)
                    == 0 && value && !(name_flags & VDEV_NAME_PATH)) {
-                       return (zfs_strip_partition(hdl, path));
+                       return (zfs_strip_partition(path));
                }
        } else {
                verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &path) == 0);
@@ -4310,3 +4316,191 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name)
 
        return (0);
 }
+
+#if HAVE_LIBDEVMAPPER
+/* Log callback that ignores every message it is handed */
+static void
+libdevmapper_dummy_log(int level, const char *file, int line,
+    int dm_errno_or_class, const char *f, ...)
+{
+}
+
+/* Disable libdevmapper error logging */
+static void
+disable_libdevmapper_errors(void)
+{
+       dm_log_with_errno_init(libdevmapper_dummy_log);
+}
+
+/* Enable libdevmapper error logging */
+static void
+enable_libdevmapper_errors(void)
+{
+       dm_log_with_errno_init(NULL);
+}
+#endif
+
+/*
+ * Allocate and return the underlying device name for a device mapper device.
+ * If a device mapper device maps to multiple devices, return the first device.
+ *
+ * For example, dm_name = "/dev/dm-0" could return "/dev/sda"
+ *
+ * dm_name should include the "/dev[/mapper]" prefix.
+ *
+ * Returns device name, or NULL on error or no match.  If dm_name is not a DM
+ * device then return NULL.  When built without libdevmapper this always
+ * returns NULL.
+ *
+ * NOTE: The returned name string must be *freed*.
+ */
+static char *
+dm_get_underlying_path(char *dm_name)
+{
+       char *name = NULL;
+#if HAVE_LIBDEVMAPPER
+       char *tmp = NULL;
+       struct dm_task *dmt = NULL;
+       struct dm_tree *dt = NULL;
+       struct dm_tree_node *root, *child;
+       void *handle = NULL;
+       struct dm_info info;
+       const struct dm_info *child_info;
+
+       /*
+        * Disable libdevmapper errors.  It's entirely possible user is not
+        * running devmapper, or that dm_name is not a devmapper device.
+        * That's totally ok, we will just harmlessly and silently return NULL.
+        * Logging is switched back on at the "end" label, which every path
+        * below passes through.
+        */
+       disable_libdevmapper_errors();
+
+       /*
+        * libdevmapper tutorial
+        *
+        * libdevmapper is basically a fancy wrapper for its ioctls.  You
+        * create a "task", fill in the needed info to the task (fill in the
+        * ioctl fields), then run the task (call the ioctl).
+        *
+        * First we need the major/minor number for our DM device.
+        */
+       if (!(dmt = dm_task_create(DM_DEVICE_INFO)))
+               goto end;
+
+       /* Lookup the name in libdevmapper */
+       if (!dm_task_set_name(dmt, dm_name))
+               goto end;
+
+       if (!dm_task_run(dmt))
+               goto end;
+
+       /* Get DM device's major/minor */
+       if (!dm_task_get_info(dmt, &info))
+               goto end;
+
+       /* We have major/minor number.  Lookup the dm device's children */
+       if (!(dt = dm_tree_create()))
+               goto end;
+
+       /* We add the device into the tree and its children get populated */
+       if (!dm_tree_add_dev(dt, info.major, info.minor))
+               goto end;
+
+       if (!(root = dm_tree_find_node(dt, 0, 0)))
+               goto end;
+
+       if (!(child = dm_tree_next_child(&handle, root, 1)))
+               goto end;
+
+       /* Get child's major/minor numbers */
+       if (!(child_info = dm_tree_node_get_info(child)))
+               goto end;
+
+       if ((asprintf(&tmp, "/dev/block/%d:%d", child_info->major,
+           child_info->minor) == -1) || !tmp)
+               goto end;
+
+       /* Further translate /dev/block/ name into the normal name */
+       name = realpath(tmp, NULL);
+       free(tmp);
+
+end:
+       if (dmt)
+               dm_task_destroy(dmt);
+       if (dt)
+               dm_tree_free(dt);
+       enable_libdevmapper_errors();
+#endif /* HAVE_LIBDEVMAPPER */
+
+       return (name);
+}
+
+/*
+ * Report whether devname is a device mapper or multipath device, i.e.
+ * whether dm_get_underlying_path() can resolve it.
+ *
+ * Returns 1 if it is, 0 if not.
+ */
+int
+dev_is_dm(char *devname)
+{
+       char *underlying = dm_get_underlying_path(devname);
+
+       if (underlying != NULL) {
+               /* Only the answer matters here, not the name itself */
+               free(underlying);
+               return (1);
+       }
+
+       return (0);
+}
+
+/*
+ * Resolve a device name to its underlying device
+ *
+ * The name handed in is often a symlink to a device, a partition device,
+ * or a multipath device.  This function returns the name of the device
+ * underneath it.  A name that already is the underlying device comes back
+ * unchanged.  For a DM device with several underlying devices the first
+ * one is returned.
+ *
+ * For example:
+ *
+ * 1. /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 -> ../../sda
+ * dev_name:   /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001
+ * returns:    /dev/sda
+ *
+ * 2. /dev/mapper/mpatha (made up of /dev/sda and /dev/sdb)
+ * dev_name:   /dev/mapper/mpatha
+ * returns:    /dev/sda (first device)
+ *
+ * 3. /dev/sda (already the underlying device)
+ * dev_name:   /dev/sda
+ * returns:    /dev/sda
+ *
+ * 4. /dev/dm-3 (mapped to /dev/sda)
+ * dev_name:   /dev/dm-3
+ * returns:    /dev/sda
+ *
+ * 5. /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 -> ../../sdb9
+ * dev_name:   /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9
+ * returns:    /dev/sdb
+ *
+ * 6. /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a -> ../../sda2
+ * dev_name:   /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a
+ * returns:    /dev/sda
+ *
+ * hdl is not used.
+ *
+ * Returns underlying device name, or NULL on error or no match.
+ *
+ * NOTE: The returned name string must be *freed*.
+ */
+char *
+get_underlying_path(libzfs_handle_t *hdl, char *dev_name)
+{
+       char *resolved, *stripped;
+
+       if (dev_name == NULL)
+               return (NULL);
+
+       /* Try device mapper first; otherwise just un-symlinkize the name */
+       resolved = dm_get_underlying_path(dev_name);
+       if (resolved == NULL)
+               resolved = realpath(dev_name, NULL);
+       if (resolved == NULL)
+               return (NULL);
+
+       /* zfs_strip_partition() returns its own copy, so drop ours */
+       stripped = zfs_strip_partition(resolved);
+       free(resolved);
+
+       return (stripped);
+}
index 80402c55ec6368ffb8687d32f848908a52df41ec..3518175699f87bc50a4206ea5a46200a5b3babe3 100644 (file)
@@ -626,7 +626,7 @@ Controls automatic pool expansion when the underlying LUN is grown. If set to \f
 .ad
 .sp .6
 .RS 4n
-Controls automatic device replacement. If set to "\fBoff\fR", device replacement must be initiated by the administrator by using the "\fBzpool replace\fR" command. If set to "\fBon\fR", any new device, found in the same physical location as a device that previously belonged to the pool, is automatically formatted and replaced. The default behavior is "\fBoff\fR". This property can also be referred to by its shortened column name, "replace".
+Controls automatic device replacement. If set to "\fBoff\fR", device replacement must be initiated by the administrator by using the "\fBzpool replace\fR" command. If set to "\fBon\fR", any new device, found in the same physical location as a device that previously belonged to the pool, is automatically formatted and replaced. The default behavior is "\fBoff\fR". This property can also be referred to by its shortened column name, "replace".  Autoreplace can also be used with virtual disks (like device mapper) provided that you use the /dev/disk/by-vdev paths set up by vdev_id.conf.  See the vdev_id.conf man page for more details.  Autoreplace and autoonline require libudev to be present at build time.  If you're using device mapper disks, you must have libdevmapper installed at build time as well.
 .RE
 
 .sp
index 6d0314fa78d08e14663c5e747a1075bcb9fdcd45..704ef84c77c8aa58e73e9acfde18ea7f5131fe4c 100644 (file)
@@ -40,6 +40,7 @@
 #include <sys/int_limits.h>
 #include <sys/nvpair.h>
 #include "zfs_comutil.h"
+#include <sys/zfs_ratelimit.h>
 
 /*
  * Are there allocatable vdevs?
@@ -206,10 +207,73 @@ const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS] = {
        "pool split",
 };
 
+/*
+ * Prepare a rate limit struct for use with zfs_ratelimit()
+ *
+ * rl:         zfs_ratelimit_t struct to set up
+ * burst:      Number to allow in an interval before rate limiting
+ * interval:   Length of the interval in seconds
+ */
+void
+zfs_ratelimit_init(zfs_ratelimit_t *rl, unsigned int burst,
+    unsigned int interval)
+{
+       mutex_init(&rl->lock, NULL, MUTEX_DEFAULT, NULL);
+
+       /* Caller-supplied limits */
+       rl->burst = burst;
+       rl->interval = interval;
+
+       /* Nothing has been counted yet */
+       rl->start = 0;
+       rl->count = 0;
+}
+
+/*
+ * Decide whether one more event fits within the rate limit
+ *
+ * This stands in for the kernel's __ratelimit() function.  We can't use
+ * the kernel's version because it insists on printing how many times it
+ * rate limited to the kernel logs, and offers no way to turn that off:
+ *
+ *     __ratelimit: 59 callbacks suppressed
+ *
+ * Should the kernel ever let us disable these prints, this function ought
+ * to be dropped in favor of __ratelimit().
+ *
+ * The return value follows the __ratelimit() convention:
+ *
+ * 0: The event is over the limit (we're rate limiting)
+ * 1: The event is within the limit (we're not rate limiting)
+ */
+int
+zfs_ratelimit(zfs_ratelimit_t *rl)
+{
+       hrtime_t cur;
+       hrtime_t delta;
+       int allowed = 1;
+
+       mutex_enter(&rl->lock);
+
+       cur = gethrtime();
+       delta = cur - rl->start;
+       rl->count++;
+
+       if (NSEC2SEC(delta) < rl->interval) {
+               /* Still inside the current interval: enforce the burst */
+               if (rl->count >= rl->burst)
+                       allowed = 0;
+       } else {
+               /* Interval is over: begin a new one at this event */
+               rl->start = cur;
+               rl->count = 0;
+       }
+
+       mutex_exit(&rl->lock);
+
+       return (allowed);
+}
+
 #if defined(_KERNEL) && defined(HAVE_SPL)
 EXPORT_SYMBOL(zfs_allocatable_devs);
 EXPORT_SYMBOL(zpool_get_rewind_policy);
 EXPORT_SYMBOL(zfs_zpl_version_map);
 EXPORT_SYMBOL(zfs_spa_version_map);
 EXPORT_SYMBOL(zfs_history_event_names);
+EXPORT_SYMBOL(zfs_ratelimit_init);
+EXPORT_SYMBOL(zfs_ratelimit);
 #endif
index a1069d140c498db53ed9bf27b4fc4b66cf35ceb9..6c569ffc444aa1a3a8e54dd85713bbe9dec0eb2f 100644 (file)
@@ -84,6 +84,9 @@ static int zevent_len_cur = 0;
 static int zevent_waiters = 0;
 static int zevent_flags = 0;
 
+/* Num events rate limited since the last time zfs_zevent_next() was called */
+static uint64_t ratelimit_dropped = 0;
+
 /*
  * The EID (Event IDentifier) is used to uniquely tag a zevent when it is
  * posted.  The posted EIDs are monotonically increasing but not persistent.
@@ -654,6 +657,12 @@ zfs_zevent_next(zfs_zevent_t *ze, nvlist_t **event, uint64_t *event_size,
        list_insert_head(&ev->ev_ze_list, ze);
        (void) nvlist_dup(ev->ev_nvl, event, KM_SLEEP);
        *dropped = ze->ze_dropped;
+
+#ifdef _KERNEL
+       /* Include events dropped due to rate limiting */
+       *dropped += ratelimit_dropped;
+       ratelimit_dropped = 0;
+#endif
        ze->ze_dropped = 0;
 out:
        mutex_exit(&zevent_lock);
@@ -1586,6 +1595,19 @@ fm_ena_time_get(uint64_t ena)
        return (time);
 }
 
+#ifdef _KERNEL
+/*
+ * Helper function to increment ereport dropped count.  Used by the event
+ * rate limiting code to give feedback to the user about how many events were
+ * rate limited by including them in the 'dropped' count.
+ *
+ * The increment is done with an atomic op and takes no lock.
+ * zfs_zevent_next() later adds the accumulated value to the 'dropped'
+ * count it reports and resets the counter to zero.
+ *
+ * NOTE(review): that read-and-reset uses plain accesses rather than an
+ * atomic swap, so an increment landing between the two statements looks
+ * like it would be lost -- confirm whether that is acceptable.
+ */
+void
+fm_erpt_dropped_increment(void)
+{
+       atomic_inc_64(&ratelimit_dropped);
+}
+#endif
+
 #ifdef _KERNEL
 void
 fm_init(void)
index 5ff5cf3b1271b0d25e7697891bcec9ad913670fb..f7e91430f29c12c9093c01c98aaed80cff740e65 100644 (file)
@@ -44,6 +44,7 @@
 #include <sys/zil.h>
 #include <sys/dsl_scan.h>
 #include <sys/zvol.h>
+#include <sys/zfs_ratelimit.h>
 
 /*
  * When a vdev is added, it will be divided into approximately (but no
@@ -346,12 +347,21 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
        vd->vdev_state = VDEV_STATE_CLOSED;
        vd->vdev_ishole = (ops == &vdev_hole_ops);
 
+       /*
+        * Initialize rate limit structs for events.  We rate limit ZIO delay
+        * and checksum events so that we don't overwhelm ZED with thousands
+        * of events when a disk is acting up.
+        */
+       zfs_ratelimit_init(&vd->vdev_delay_rl, DELAYS_PER_SECOND, 1);
+       zfs_ratelimit_init(&vd->vdev_checksum_rl, CHECKSUMS_PER_SECOND, 1);
+
        list_link_init(&vd->vdev_config_dirty_node);
        list_link_init(&vd->vdev_state_dirty_node);
        mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
        mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
+
        for (t = 0; t < DTL_TYPES; t++) {
                vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
                    &vd->vdev_dtl_lock);
@@ -2221,7 +2231,6 @@ vdev_load(vdev_t *vd)
            vdev_metaslab_init(vd, 0) != 0))
                vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_CORRUPT_DATA);
-
        /*
         * If this is a leaf vdev, load its DTL.
         */
@@ -3458,15 +3467,17 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
        /*
         * Notify ZED of any significant state-change on a leaf vdev.
         *
-        * We ignore transitions from a closed state to healthy unless
-        * the parent was degraded.
         */
-       if (vd->vdev_ops->vdev_op_leaf &&
-           ((save_state > VDEV_STATE_CLOSED) ||
-           (vd->vdev_state < VDEV_STATE_HEALTHY) ||
-           (vd->vdev_parent != NULL &&
-           vd->vdev_parent->vdev_prevstate == VDEV_STATE_DEGRADED))) {
-               zfs_post_state_change(spa, vd, save_state);
+       if (vd->vdev_ops->vdev_op_leaf) {
+               /* preserve original state from a vdev_reopen() */
+               if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) &&
+                   (vd->vdev_prevstate != vd->vdev_state) &&
+                   (save_state <= VDEV_STATE_CLOSED))
+                       save_state = vd->vdev_prevstate;
+
+               /* filter out state change due to initial vdev_open */
+               if (save_state > VDEV_STATE_CLOSED)
+                       zfs_post_state_change(spa, vd, save_state);
        }
 
        if (!isopen && vd->vdev_parent)
index 0d508c0b840b6ba743a94d3648c55516b28089be..5b6bea7aefc2e1b504ee15813b897c6fb617ce65 100644 (file)
@@ -112,6 +112,33 @@ zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector)
                fm_nvlist_destroy(detector, FM_NVA_FREE);
 }
 
+/*
+ * We want to rate limit ZIO delay and checksum events so as to not
+ * flood ZED when a disk is acting up.
+ *
+ * subclass:   ereport subclass about to be posted
+ * vd:         vdev the event is for; may be NULL
+ *
+ * Only FM_EREPORT_ZFS_DELAY and FM_EREPORT_ZFS_CHECKSUM are ever limited,
+ * each against its own limiter in the vdev.  Every event that gets
+ * suppressed is reported through fm_erpt_dropped_increment().
+ *
+ * Returns 1 if we're ratelimiting, 0 if not.
+ */
+static int
+zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd)
+{
+       int rc = 0;
+
+       /*
+        * The limiters live in the vdev, so an event that isn't tied to
+        * one has nothing to be limited against.
+        */
+       if (vd == NULL)
+               return (0);
+
+       /*
+        * zfs_ratelimit() returns 1 if we're *not* ratelimiting and 0 if we
+        * are.  Invert it to get our return value.
+        */
+       if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) {
+               rc = !zfs_ratelimit(&vd->vdev_delay_rl);
+       } else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) {
+               rc = !zfs_ratelimit(&vd->vdev_checksum_rl);
+       }
+
+       if (rc) {
+               /* We're rate limiting */
+               fm_erpt_dropped_increment();
+       }
+
+       return (rc);
+}
 
 static void
 zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
@@ -191,6 +218,12 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
                return;
        }
 
+       if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) &&
+           !zio->io_timestamp) {
+               /* Ignore bogus delay events */
+               return;
+       }
+
        /*
         * Serialize ereport generation
         */
@@ -738,6 +771,9 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
        if (ereport == NULL)
                return;
 
+       if (zfs_is_ratelimiting_event(subclass, vd))
+               return;
+
        /* Cleanup is handled by the callback function */
        zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
 #endif
@@ -748,7 +784,15 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd,
     struct zio *zio, uint64_t offset, uint64_t length, void *arg,
     zio_bad_cksum_t *info)
 {
-       zio_cksum_report_t *report = kmem_zalloc(sizeof (*report), KM_SLEEP);
+       zio_cksum_report_t *report;
+
+
+#ifdef _KERNEL
+       if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
+               return;
+#endif
+
+       report = kmem_zalloc(sizeof (*report), KM_SLEEP);
 
        if (zio->io_vsd != NULL)
                zio->io_vsd_ops->vsd_cksum_report(zio, report, arg);
index 4e693ea5b70fc356e79f6fc8ab8c542d8c7b6c96..e80f51e312b334edb29d20ede65247494c712c99 100644 (file)
@@ -73,6 +73,7 @@ Requires:       libzpool2 = %{version}
 Requires:       libnvpair1 = %{version}
 Requires:       libuutil1 = %{version}
 Requires:       libzfs2 = %{version}
+Requires:       device-mapper
 Requires:       %{name}-kmod = %{version}
 Provides:       %{name}-kmod-common = %{version}
 
@@ -84,6 +85,7 @@ Conflicts:      zfs-fuse
 BuildRequires:  zlib-devel
 BuildRequires:  libuuid-devel
 BuildRequires:  libblkid-devel
+BuildRequires:  device-mapper-devel
 BuildRequires:  libudev-devel
 BuildRequires:  libattr-devel
 %endif