From aded9a6814c9f6260437bc186ad08debc5d0b6c8 Mon Sep 17 00:00:00 2001
From: Chris Dunlap
Date: Tue, 17 Feb 2015 17:23:54 -0800
Subject: [PATCH] Cleanup ZEDLETs

This commit factors out several common ZEDLET code blocks into
zed-functions.sh. This shortens the length of the scripts, thereby
(hopefully) making them easier to understand and maintain.

In addition, this commit revamps the coding style used by the scripts
to be more consistent and (again, hopefully) maintainable. It now
mostly follows the Google Shell Style Guide. I've tried to assimilate
the following resources:

  Google Shell Style Guide
  https://google-styleguide.googlecode.com/svn/trunk/shell.xml

  Dash as /bin/sh
  https://wiki.ubuntu.com/DashAsBinSh

  Filenames and Pathnames in Shell: How to do it Correctly
  http://www.dwheeler.com/essays/filenames-in-shell.html

  Common shell script mistakes
  http://www.pixelbeat.org/programming/shell_script_mistakes.html

Finally, this commit updates the exit codes used by the ZEDLETs to be
more consistent with one another.

All scripts run cleanly through ShellCheck (http://www.shellcheck.net/).
All scripts have been tested on bash and dash.

Signed-off-by: Chris Dunlap --- cmd/zed/Makefile.am | 3 + cmd/zed/zed.d/README | 30 ++++ cmd/zed/zed.d/all-debug.sh | 19 +- cmd/zed/zed.d/all-syslog.sh | 9 +- cmd/zed/zed.d/data-email.sh | 88 ++++----- cmd/zed/zed.d/generic-email.sh | 90 +++++----- cmd/zed/zed.d/io-email.sh | 95 ++++------ cmd/zed/zed.d/io-spare.sh | 267 +++++++++++++++++----------- cmd/zed/zed.d/scrub.finish-email.sh | 92 +++++----- cmd/zed/zed.d/zed-functions.sh | 230 ++++++++++++++++++++++++ cmd/zed/zed.d/zed.rc | 44 ++++- 11 files changed, 630 insertions(+), 337 deletions(-) create mode 100644 cmd/zed/zed.d/README create mode 100644 cmd/zed/zed.d/zed-functions.sh diff --git a/cmd/zed/Makefile.am b/cmd/zed/Makefile.am index b907f6af9..8bdc097c7 100644 --- a/cmd/zed/Makefile.am +++ b/cmd/zed/Makefile.am @@ -4,6 +4,8 @@ DEFAULT_INCLUDES += \ -I$(top_srcdir)/include \ -I$(top_srcdir)/lib/libspl/include +EXTRA_DIST = $(top_srcdir)/cmd/zed/zed.d/README + sbin_PROGRAMS = zed zed_SOURCES = \ @@ -33,6 +35,7 @@ zed_LDADD = \ zedconfdir = $(sysconfdir)/zfs/zed.d dist_zedconf_DATA = \ + $(top_srcdir)/cmd/zed/zed.d/zed-functions.sh \ $(top_srcdir)/cmd/zed/zed.d/zed.rc zedexecdir = $(libexecdir)/zfs/zed.d diff --git a/cmd/zed/zed.d/README b/cmd/zed/zed.d/README new file mode 100644 index 000000000..b4cb11514 --- /dev/null +++ b/cmd/zed/zed.d/README @@ -0,0 +1,30 @@ +Shell scripts are the recommended choice for ZEDLETs that mostly call +other utilities and do relatively little data manipulation. + +Shell scripts MUST work on both bash and dash. + +Shell scripts MUST run cleanly through ShellCheck: + http://www.shellcheck.net/ + +General functions reside in "zed-functions.sh". Use them where applicable. 
+ +Additional references that may be of use: + + Google Shell Style Guide + https://google-styleguide.googlecode.com/svn/trunk/shell.xml + + Dash as /bin/sh + https://wiki.ubuntu.com/DashAsBinSh + + Common shell script mistakes + http://www.pixelbeat.org/programming/shell_script_mistakes.html + + Filenames and Pathnames in Shell: How to do it Correctly + http://www.dwheeler.com/essays/filenames-in-shell.html + + Autoconf: Portable Shell Programming + https://www.gnu.org/software/autoconf/manual/autoconf.html#Portable-Shell + +Please BE CONSISTENT with the existing style, check for errors, +minimize dependencies where possible, try to be portable, +and comment anything non-obvious. Festina lente. diff --git a/cmd/zed/zed.d/all-debug.sh b/cmd/zed/zed.d/all-debug.sh index aa20ef268..057e39b50 100755 --- a/cmd/zed/zed.d/all-debug.sh +++ b/cmd/zed/zed.d/all-debug.sh @@ -2,16 +2,23 @@ # # Log all environment variables to ZED_DEBUG_LOG. # -test -f "${ZED_ZEDLET_DIR}/zed.rc" && . "${ZED_ZEDLET_DIR}/zed.rc" +# This can be a useful aid when developing/debugging ZEDLETs since it shows the +# environment variables defined for each zevent. -# Override the default umask to restrict access to a newly-created logfile. -umask 077 +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +: "${ZED_DEBUG_LOG:="${TMPDIR:="/tmp"}/zed.debug.log"}" -# Append stdout to the logfile after obtaining an advisory lock. -exec >> "${ZED_DEBUG_LOG:=/tmp/zed.debug.log}" -flock -x 1 +lockfile="$(basename -- "${ZED_DEBUG_LOG}").lock" + +umask 077 +zed_lock "${lockfile}" +exec >> "${ZED_DEBUG_LOG}" printenv | sort echo +exec >&- +zed_unlock "${lockfile}" exit 0 diff --git a/cmd/zed/zed.d/all-syslog.sh b/cmd/zed/zed.d/all-syslog.sh index acf9e83bd..b34d17cef 100755 --- a/cmd/zed/zed.d/all-syslog.sh +++ b/cmd/zed/zed.d/all-syslog.sh @@ -1,11 +1,10 @@ #!/bin/sh # # Log the zevent via syslog. -# -test -f "${ZED_ZEDLET_DIR}/zed.rc" && . 
"${ZED_ZEDLET_DIR}/zed.rc" -logger -t "${ZED_SYSLOG_TAG:=zed}" -p "${ZED_SYSLOG_PRIORITY:=daemon.notice}" \ - eid="${ZEVENT_EID}" class="${ZEVENT_SUBCLASS}" \ - "${ZEVENT_POOL:+pool=$ZEVENT_POOL}" +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" +zed_log_msg "eid=${ZEVENT_EID}" "class=${ZEVENT_SUBCLASS}" \ + "${ZEVENT_POOL:+"pool=${ZEVENT_POOL}"}" exit 0 diff --git a/cmd/zed/zed.d/data-email.sh b/cmd/zed/zed.d/data-email.sh index 543b8fe55..2dae8ff6b 100755 --- a/cmd/zed/zed.d/data-email.sh +++ b/cmd/zed/zed.d/data-email.sh @@ -1,81 +1,53 @@ #!/bin/sh # -# Send email to ZED_EMAIL in response to a DATA zevent. -# Only one message per ZED_EMAIL_INTERVAL_SECS will be sent for a given -# class/pool combination. This protects against spamming the recipient -# should multiple events occur together in time for the same pool. +# Send email to ZED_EMAIL in response to a DATA error. +# +# Only one email per ZED_EMAIL_INTERVAL_SECS will be sent for a given +# class/pool combination. This protects against spamming the recipient +# should multiple events occur together in time for the same pool. +# # Exit codes: # 0: email sent # 1: email failed -# 2: email suppressed -# 3: missing executable -# 4: unsupported event class -# 5: internal error -# State File Format: -# POOL;TIME_OF_LAST_EMAIL -# -test -f "${ZED_ZEDLET_DIR}/zed.rc" && . "${ZED_ZEDLET_DIR}/zed.rc" +# 2: email not configured +# 3: email suppressed +# 9: internal error -test -n "${ZEVENT_POOL}" || exit 5 -test -n "${ZEVENT_SUBCLASS}" || exit 5 +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" -if test "${ZEVENT_SUBCLASS}" != "data"; then \ - logger -t "${ZED_SYSLOG_TAG:=zed}" \ - -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ - `basename "$0"`: unsupported event class \"${ZEVENT_SUBCLASS}\" - exit 4 -fi +[ -n "${ZED_EMAIL}" ] || exit 2 -# Only send email if ZED_EMAIL has been configured. 
-test -n "${ZED_EMAIL}" || exit 2 +[ -n "${ZEVENT_POOL}" ] || exit 9 +[ -n "${ZEVENT_SUBCLASS}" ] || exit 9 -# Ensure requisite executables are installed. -if ! command -v "${MAIL:=mail}" >/dev/null 2>&1; then - logger -t "${ZED_SYSLOG_TAG:=zed}" \ - -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ - `basename "$0"`: "${MAIL}" not installed - exit 3 +if [ "${ZEVENT_SUBCLASS}" != "data" ]; then \ + zed_log_err "unsupported event class \"${ZEVENT_SUBCLASS}\"" + exit 9 fi -NAME="zed.${ZEVENT_SUBCLASS}.email" -LOCKFILE="${ZED_LOCKDIR:=/var/lock}/${NAME}.lock" -STATEFILE="${ZED_RUNDIR:=/var/run}/${NAME}.state" +zed_check_cmd "mail" || exit 9 -# Obtain lock to ensure mutual exclusion for accessing state. -exec 8> "${LOCKFILE}" -flock -x 8 +zed_rate_limit "${ZEVENT_POOL};${ZEVENT_SUBCLASS};email" || exit 3 -# Query state for last time email was sent for this pool. -TIME_NOW=`date +%s` -TIME_LAST=`egrep "^${ZEVENT_POOL};" "${STATEFILE}" 2>/dev/null | cut -d ";" -f2` -if test -n "${TIME_LAST}"; then - TIME_DELTA=`expr "${TIME_NOW}" - "${TIME_LAST}"` - if test "${TIME_DELTA}" -lt "${ZED_EMAIL_INTERVAL_SECS:=3600}"; then - exit 2 - fi -fi - -"${MAIL}" -s "ZFS ${ZEVENT_SUBCLASS} error for ${ZEVENT_POOL} on `hostname`" \ - "${ZED_EMAIL}" < "${email_pathname}" </dev/null > "${STATEFILE}.$$" -echo "${ZEVENT_POOL};${TIME_NOW}" >> "${STATEFILE}.$$" -mv -f "${STATEFILE}.$$" "${STATEFILE}" +mail -s "${email_subject}" "${ZED_EMAIL}" < "${email_pathname}" +mail_status=$? 
-if test "${MAIL_STATUS}" -ne 0; then - logger -t "${ZED_SYSLOG_TAG:=zed}" \ - -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ - `basename "$0"`: "${MAIL}" exit="${MAIL_STATUS}" - exit 1 +if [ "${mail_status}" -ne 0 ]; then + zed_log_msg "mail exit=${mail_status}" + exit 1 fi - +rm -f "${email_pathname}" exit 0 diff --git a/cmd/zed/zed.d/generic-email.sh b/cmd/zed/zed.d/generic-email.sh index 357aedee5..ad022e034 100755 --- a/cmd/zed/zed.d/generic-email.sh +++ b/cmd/zed/zed.d/generic-email.sh @@ -1,59 +1,59 @@ #!/bin/sh # # Send email to ZED_EMAIL in response to a given zevent. -# This is a generic script than can be symlinked to a file in the zed -# enabled-scripts directory in order to have email sent when a particular -# class of zevents occurs. The symlink filename must begin with the zevent -# (sub)class string (eg, "probe_failure-email.sh" for the "probe_failure" -# subclass). Refer to the zed(8) manpage for details. +# +# This is a generic script that can be symlinked to a file in the +# enabled-zedlets directory to have an email sent when a particular class of +# zevents occurs. The symlink filename must begin with the zevent (sub)class +# string (e.g., "probe_failure-email.sh" for the "probe_failure" subclass). +# Refer to the zed(8) manpage for details. +# # Exit codes: # 0: email sent # 1: email failed -# 2: email suppressed -# 3: missing executable -# -test -f "${ZED_ZEDLET_DIR}/zed.rc" && . "${ZED_ZEDLET_DIR}/zed.rc" +# 2: email not configured +# 3: email suppressed -# Only send email if ZED_EMAIL has been configured. -test -n "${ZED_EMAIL}" || exit 2 +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" -# Ensure requisite executables are installed. -if ! 
command -v "${MAIL:=mail}" >/dev/null 2>&1; then - logger -t "${ZED_SYSLOG_TAG:=zed}" \ - -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ - `basename "$0"`: "${MAIL}" not installed - exit 3 -fi +[ -n "${ZED_EMAIL}" ] || exit 2 + +# Rate-limit the message based in part on the filename. +# +rate_limit_tag="${ZEVENT_POOL};${ZEVENT_SUBCLASS};$(basename -- "$0")" +rate_limit_interval="${ZED_EMAIL_INTERVAL_SECS}" +zed_rate_limit "${rate_limit_tag}" "${rate_limit_interval}" || exit 3 -# Override the default umask to restrict access to the msgbody tmpfile. umask 077 +pool_str="${ZEVENT_POOL:+" for ${ZEVENT_POOL}"}" +host_str=" on $(hostname)" +email_subject="ZFS ${ZEVENT_SUBCLASS} event${pool_str}${host_str}" +email_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" +{ + echo "ZFS has posted the following event:" + echo + echo " eid: ${ZEVENT_EID}" + echo " class: ${ZEVENT_SUBCLASS}" + echo " host: $(hostname)" + echo " time: ${ZEVENT_TIME_STRING}" -SUBJECT="ZFS ${ZEVENT_SUBCLASS} event" -test -n "${ZEVENT_POOL}" && SUBJECT="${SUBJECT} for ${ZEVENT_POOL}" -SUBJECT="${SUBJECT} on `hostname`" + if [ -n "${ZEVENT_VDEV_PATH}" ]; then + echo " vpath: ${ZEVENT_VDEV_PATH}" + [ -n "${ZEVENT_VDEV_TYPE}" ] && echo " vtype: ${ZEVENT_VDEV_TYPE}" + fi -MSGBODY="${TMPDIR:=/tmp}/`basename \"$0\"`.$$" -{ - echo "A ZFS ${ZEVENT_SUBCLASS} event has been posted:" - echo - echo " eid: ${ZEVENT_EID}" - echo " host: `hostname`" - echo " time: ${ZEVENT_TIME_STRING}" - test -n "${ZEVENT_VDEV_TYPE}" -a -n "${ZEVENT_VDEV_PATH}" && \ - echo " vdev: ${ZEVENT_VDEV_TYPE}:${ZEVENT_VDEV_PATH}" - test -n "${ZEVENT_POOL}" -a -x "${ZPOOL}" && \ - "${ZPOOL}" status "${ZEVENT_POOL}" -} > "${MSGBODY}" - -test -f "${MSGBODY}" && "${MAIL}" -s "${SUBJECT}" "${ZED_EMAIL}" < "${MSGBODY}" -MAIL_STATUS=$? 
-rm -f "${MSGBODY}" - -if test "${MAIL_STATUS}" -ne 0; then - logger -t "${ZED_SYSLOG_TAG:=zed}" \ - -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ - `basename "$0"`: "${MAIL}" exit="${MAIL_STATUS}" - exit 1 -fi + [ -n "${ZEVENT_POOL}" ] && [ -x "${ZPOOL}" ] \ + && "${ZPOOL}" status "${ZEVENT_POOL}" +} > "${email_pathname}" + +mail -s "${email_subject}" "${ZED_EMAIL}" < "${email_pathname}" +mail_status=$? + +if [ "${mail_status}" -ne 0 ]; then + zed_log_msg "mail exit=${mail_status}" + exit 1 +fi +rm -f "${email_pathname}" exit 0 diff --git a/cmd/zed/zed.d/io-email.sh b/cmd/zed/zed.d/io-email.sh index 9edbe6670..1854b1593 100755 --- a/cmd/zed/zed.d/io-email.sh +++ b/cmd/zed/zed.d/io-email.sh @@ -1,86 +1,57 @@ #!/bin/sh # -# Send email to ZED_EMAIL in response to a CHECKSUM or IO zevent. -# Only one message per ZED_EMAIL_INTERVAL_SECS will be sent for a given -# class/pool/vdev combination. This protects against spamming the recipient -# should multiple events occur together in time for the same pool/device. +# Send email to ZED_EMAIL in response to a CHECKSUM or IO error. +# +# Only one email per ZED_EMAIL_INTERVAL_SECS will be sent for a given +# class/pool/vdev combination. This protects against spamming the recipient +# should multiple events occur together in time for the same pool/device. +# # Exit codes: # 0: email sent # 1: email failed -# 2: email suppressed -# 3: missing executable -# 4: unsupported event class -# 5: internal error -# State File Format: -# POOL;VDEV_PATH;TIME_OF_LAST_EMAIL -# -test -f "${ZED_ZEDLET_DIR}/zed.rc" && . "${ZED_ZEDLET_DIR}/zed.rc" +# 2: email not configured +# 3: email suppressed +# 9: internal error -test -n "${ZEVENT_POOL}" || exit 5 -test -n "${ZEVENT_SUBCLASS}" || exit 5 -test -n "${ZEVENT_VDEV_PATH}" || exit 5 +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. 
"${ZED_ZEDLET_DIR}/zed-functions.sh" -if test "${ZEVENT_SUBCLASS}" != "checksum" \ - -a "${ZEVENT_SUBCLASS}" != "io"; then - logger -t "${ZED_SYSLOG_TAG:=zed}" \ - -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ - `basename "$0"`: unsupported event class \"${ZEVENT_SUBCLASS}\" - exit 4 -fi +[ -n "${ZED_EMAIL}" ] || exit 2 -# Only send email if ZED_EMAIL has been configured. -test -n "${ZED_EMAIL}" || exit 2 +[ -n "${ZEVENT_POOL}" ] || exit 9 +[ -n "${ZEVENT_SUBCLASS}" ] || exit 9 +[ -n "${ZEVENT_VDEV_PATH}" ] || exit 9 -# Ensure requisite executables are installed. -if ! command -v "${MAIL:=mail}" >/dev/null 2>&1; then - logger -t "${ZED_SYSLOG_TAG:=zed}" \ - -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ - `basename "$0"`: "${MAIL}" not installed - exit 3 +if [ "${ZEVENT_SUBCLASS}" != "checksum" ] \ + && [ "${ZEVENT_SUBCLASS}" != "io" ]; then + zed_log_err "unsupported event class \"${ZEVENT_SUBCLASS}\"" + exit 9 fi -NAME="zed.${ZEVENT_SUBCLASS}.email" -LOCKFILE="${ZED_LOCKDIR:=/var/lock}/${NAME}.lock" -STATEFILE="${ZED_RUNDIR:=/var/run}/${NAME}.state" +zed_check_cmd "mail" || exit 9 -# Obtain lock to ensure mutual exclusion for accessing state. -exec 8> "${LOCKFILE}" -flock -x 8 +zed_rate_limit "${ZEVENT_POOL};${ZEVENT_VDEV_PATH};${ZEVENT_SUBCLASS};email" \ + || exit 3 -# Query state for last time email was sent for this pool/vdev. 
-TIME_NOW=`date +%s` -TIME_LAST=`egrep "^${ZEVENT_POOL};${ZEVENT_VDEV_PATH};" "${STATEFILE}" \ - 2>/dev/null | cut -d ";" -f3` -if test -n "${TIME_LAST}"; then - TIME_DELTA=`expr "${TIME_NOW}" - "${TIME_LAST}"` - if test "${TIME_DELTA}" -lt "${ZED_EMAIL_INTERVAL_SECS:=3600}"; then - exit 2 - fi -fi - -"${MAIL}" -s "ZFS ${ZEVENT_SUBCLASS} error for ${ZEVENT_POOL} on `hostname`" \ - "${ZED_EMAIL}" < "${email_pathname}" </dev/null > "${STATEFILE}.$$" -echo "${ZEVENT_POOL};${ZEVENT_VDEV_PATH};${TIME_NOW}" >> "${STATEFILE}.$$" -mv -f "${STATEFILE}.$$" "${STATEFILE}" +mail -s "${email_subject}" "${ZED_EMAIL}" < "${email_pathname}" +mail_status=$? -if test "${MAIL_STATUS}" -ne 0; then - logger -t "${ZED_SYSLOG_TAG:=zed}" \ - -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ - `basename "$0"`: "${MAIL}" exit="${MAIL_STATUS}" - exit 1 +if [ "${mail_status}" -ne 0 ]; then + zed_log_msg "mail exit=${mail_status}" + exit 1 fi - +rm -f "${email_pathname}" exit 0 diff --git a/cmd/zed/zed.d/io-spare.sh b/cmd/zed/zed.d/io-spare.sh index b64b2a9f1..9667dedcb 100755 --- a/cmd/zed/zed.d/io-spare.sh +++ b/cmd/zed/zed.d/io-spare.sh @@ -1,6 +1,6 @@ #!/bin/sh # -# Replace a device with a hot spare in response to IO or checksum errors. +# Replace a device with a hot spare in response to IO or CHECKSUM errors. # The following actions will be performed automatically when the number # of errors exceed the limit set by ZED_SPARE_ON_IO_ERRORS or # ZED_SPARE_ON_CHECKSUM_ERRORS. @@ -21,106 +21,171 @@ # the majority of the expected hot spare functionality. # # Exit codes: -# 0: replaced by hot spare -# 1: no hot spare device available -# 2: hot sparing disabled -# 3: already faulted or degraded -# 4: unsupported event class -# 5: internal error +# 0: hot spare replacement successful +# 1: hot spare device not available +# 2: hot sparing disabled or threshold not reached +# 3: device already faulted or degraded +# 9: internal error + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . 
"${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +# Disabled by default. Enable in the zed.rc file. +: "${ZED_SPARE_ON_CHECKSUM_ERRORS:=0}" +: "${ZED_SPARE_ON_IO_ERRORS:=0}" + + +# query_vdev_status (pool, vdev) +# +# Given a [pool] and [vdev], return the matching vdev path & status on stdout. +# +# Warning: This function does not handle the case of [pool] or [vdev] +# containing whitespace. Beware of ShellCheck SC2046. Caveat emptor. +# +# Arguments +# pool: pool name +# vdev: virtual device name +# +# StdOut +# arg1: vdev pathname +# arg2: vdev status # -test -f "${ZED_ZEDLET_DIR}/zed.rc" && . "${ZED_ZEDLET_DIR}/zed.rc" - -test -n "${ZEVENT_POOL}" || exit 5 -test -n "${ZEVENT_SUBCLASS}" || exit 5 -test -n "${ZEVENT_VDEV_PATH}" || exit 5 -test -n "${ZEVENT_VDEV_GUID}" || exit 5 - -# Defaults to disabled, enable in the zed.rc file. -ZED_SPARE_ON_IO_ERRORS=${ZED_SPARE_ON_IO_ERRORS:-0} -ZED_SPARE_ON_CHECKSUM_ERRORS=${ZED_SPARE_ON_CHECKSUM_ERRORS:-0} - -if [ ${ZED_SPARE_ON_IO_ERRORS} -eq 0 -a \ - ${ZED_SPARE_ON_CHECKSUM_ERRORS} -eq 0 ]; then - exit 2 -fi - -# A lock file is used to serialize execution. -ZED_LOCKDIR=${ZED_LOCKDIR:-/var/lock} -LOCKFILE="${ZED_LOCKDIR}/zed.spare.lock" - -exec 8> "${LOCKFILE}" -flock -x 8 - -# Given a and return the status, (ONLINE, FAULTED, etc...). -vdev_status() { - local POOL=$1 - local VDEV=`basename $2` - local T=' ' # tab character since '\t' isn't portable - - ${ZPOOL} status ${POOL} | sed -n -e \ - "s,^[ $T]*\(.*$VDEV\(-part[0-9]\+\)\?\)[ $T]*\([A-Z]\+\).*,\1 \3,p" - return 0 +query_vdev_status() +{ + local pool="$1" + local vdev="$2" + local t + + vdev="$(basename -- "${vdev}")" + ([ -n "${pool}" ] && [ -n "${vdev}" ]) || return + t="$(printf '\t')" + + "${ZPOOL}" status "${pool}" 2>/dev/null | sed -n -e \ + "s,^[ $t]*\(.*${vdev}\(-part[0-9]\+\)\?\)[ $t]*\([A-Z]\+\).*,\1 \3,p" \ + | tail -1 } -# Fault devices after N I/O errors. 
-if [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.io" ]; then - ERRORS=`expr ${ZEVENT_VDEV_READ_ERRORS} + ${ZEVENT_VDEV_WRITE_ERRORS}` - - if [ ${ZED_SPARE_ON_IO_ERRORS} -gt 0 -a \ - ${ERRORS} -ge ${ZED_SPARE_ON_IO_ERRORS} ]; then - ACTION="fault" - fi -# Degrade devices after N checksum errors. -elif [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.checksum" ]; then - ERRORS=${ZEVENT_VDEV_CKSUM_ERRORS} - - if [ ${ZED_SPARE_ON_CHECKSUM_ERRORS} -gt 0 -a \ - ${ERRORS} -ge ${ZED_SPARE_ON_CHECKSUM_ERRORS} ]; then - ACTION="degrade" - fi -else - ACTION= -fi - -if [ -n "${ACTION}" ]; then - - # Device is already FAULTED or DEGRADED - set -- `vdev_status ${ZEVENT_POOL} ${ZEVENT_VDEV_PATH}` - ZEVENT_VDEV_PATH_FOUND=$1 - STATUS=$2 - if [ "${STATUS}" = "FAULTED" -o "${STATUS}" = "DEGRADED" ]; then - exit 3 - fi - - # Step 1) FAULT or DEGRADE the device - # - ${ZINJECT} -d ${ZEVENT_VDEV_GUID} -A ${ACTION} ${ZEVENT_POOL} - - # Step 2) Set the SES fault beacon. - # - # XXX: Set the 'fault' or 'ident' beacon for the device. This can - # be done through the sg_ses utility, the only hard part is to map - # the sd device to its corresponding enclosure and slot. We may - # be able to leverage the existing vdev_id scripts for this. - # - # $ sg_ses --dev-slot-num=0 --set=ident /dev/sg3 - # $ sg_ses --dev-slot-num=0 --clear=ident /dev/sg3 - - # Step 3) Replace the device with a hot spare. - # - # Round robin through the spares selecting those which are available. 
- # - for SPARE in ${ZEVENT_VDEV_SPARE_PATHS}; do - set -- `vdev_status ${ZEVENT_POOL} ${SPARE}` - SPARE_VDEV_FOUND=$1 - STATUS=$2 - if [ "${STATUS}" = "AVAIL" ]; then - ${ZPOOL} replace ${ZEVENT_POOL} \ - ${ZEVENT_VDEV_GUID} ${SPARE_VDEV_FOUND} && exit 0 - fi - done - - exit 1 -fi - -exit 4 + +# main +# +# Arguments +# none +# +# Return +# see above +# +main() +{ + local num_errors + local action + local lockfile + local vdev_path + local vdev_status + local spare + local zpool_err + local zpool_rv + local rv + + # Avoid hot-sparing a hot-spare. + # + # Note: ZEVENT_VDEV_PATH is not defined for ZEVENT_VDEV_TYPE=spare. + # + [ "${ZEVENT_VDEV_TYPE}" = "spare" ] && exit 2 + + [ -n "${ZEVENT_POOL}" ] || exit 9 + [ -n "${ZEVENT_VDEV_GUID}" ] || exit 9 + [ -n "${ZEVENT_VDEV_PATH}" ] || exit 9 + + zed_check_cmd "${ZPOOL}" "${ZINJECT}" || exit 9 + + # Fault the device after a given number of I/O errors. + # + if [ "${ZEVENT_SUBCLASS}" = "io" ]; then + if [ "${ZED_SPARE_ON_IO_ERRORS}" -gt 0 ]; then + num_errors=$((ZEVENT_VDEV_READ_ERRORS + ZEVENT_VDEV_WRITE_ERRORS)) + [ "${num_errors}" -ge "${ZED_SPARE_ON_IO_ERRORS}" ] \ + && action="fault" + fi 2>/dev/null + + # Degrade the device after a given number of checksum errors. + # + elif [ "${ZEVENT_SUBCLASS}" = "checksum" ]; then + if [ "${ZED_SPARE_ON_CHECKSUM_ERRORS}" -gt 0 ]; then + num_errors="${ZEVENT_VDEV_CKSUM_ERRORS}" + [ "${num_errors}" -ge "${ZED_SPARE_ON_CHECKSUM_ERRORS}" ] \ + && action="degrade" + fi 2>/dev/null + + else + zed_log_err "unsupported event class \"${ZEVENT_SUBCLASS}\"" + exit 9 + fi + + # Error threshold not reached. + # + if [ -z "${action}" ]; then + exit 2 + fi + + lockfile="zed.spare.lock" + zed_lock "${lockfile}" + + # shellcheck disable=SC2046 + set -- $(query_vdev_status "${ZEVENT_POOL}" "${ZEVENT_VDEV_PATH}") + vdev_path="$1" + vdev_status="$2" + + # Device is already FAULTED or DEGRADED. 
+ # + if [ "${vdev_status}" = "FAULTED" ] \ + || [ "${vdev_status}" = "DEGRADED" ]; then + rv=3 + + else + rv=1 + + # 1) FAULT or DEGRADE the device. + # + "${ZINJECT}" -d "${ZEVENT_VDEV_GUID}" -A "${action}" "${ZEVENT_POOL}" + + # 2) Set the SES fault beacon. + # + # TODO: Set the 'fault' or 'ident' beacon for the device. This can + # be done through the sg_ses utility. The only hard part is to map + # the sd device to its corresponding enclosure and slot. We may + # be able to leverage the existing vdev_id scripts for this. + # + # $ sg_ses --dev-slot-num=0 --set=ident /dev/sg3 + # $ sg_ses --dev-slot-num=0 --clear=ident /dev/sg3 + + # 3) Replace the device with a hot spare. + # + # Round-robin through the spares trying those that are available. + # + for spare in ${ZEVENT_VDEV_SPARE_PATHS}; do + + # shellcheck disable=SC2046 + set -- $(query_vdev_status "${ZEVENT_POOL}" "${spare}") + vdev_path="$1" + vdev_status="$2" + + [ "${vdev_status}" = "AVAIL" ] || continue + + zpool_err="$("${ZPOOL}" replace "${ZEVENT_POOL}" \ + "${ZEVENT_VDEV_GUID}" "${vdev_path}" 2>&1)"; zpool_rv=$? + + if [ "${zpool_rv}" -ne 0 ]; then + [ -n "${zpool_err}" ] && zed_log_err "zpool ${zpool_err}" + else + rv=0 + break + fi + done + fi + + zed_unlock "${lockfile}" + exit "${rv}" +} + + +main "$@" diff --git a/cmd/zed/zed.d/scrub.finish-email.sh b/cmd/zed/zed.d/scrub.finish-email.sh index d92ccfea1..4a8155caf 100755 --- a/cmd/zed/zed.d/scrub.finish-email.sh +++ b/cmd/zed/zed.d/scrub.finish-email.sh @@ -1,73 +1,63 @@ #!/bin/sh # # Send email to ZED_EMAIL in response to a RESILVER.FINISH or SCRUB.FINISH. -# By default, "zpool status" output will only be included in the email for -# a scrub.finish zevent if the pool is not healthy; to always include its -# output, set ZED_EMAIL_VERBOSE=1. +# +# By default, "zpool status" output will only be included for a scrub.finish +# zevent if the pool is not healthy; to always include its output, set +# ZED_EMAIL_VERBOSE=1. 
+# # Exit codes: # 0: email sent # 1: email failed -# 2: email suppressed -# 3: missing executable -# 4: unsupported event class -# 5: internal error -# -test -f "${ZED_ZEDLET_DIR}/zed.rc" && . "${ZED_ZEDLET_DIR}/zed.rc" +# 2: email not configured +# 3: email suppressed +# 9: internal error -test -n "${ZEVENT_POOL}" || exit 5 -test -n "${ZEVENT_SUBCLASS}" || exit 5 +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" -if test "${ZEVENT_SUBCLASS}" = "resilver.finish"; then - ACTION="resilvering" -elif test "${ZEVENT_SUBCLASS}" = "scrub.finish"; then - ACTION="scrubbing" -else - logger -t "${ZED_SYSLOG_TAG:=zed}" \ - -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ - `basename "$0"`: unsupported event class \"${ZEVENT_SUBCLASS}\" - exit 4 -fi +[ -n "${ZED_EMAIL}" ] || exit 2 -# Only send email if ZED_EMAIL has been configured. -test -n "${ZED_EMAIL}" || exit 2 +[ -n "${ZEVENT_POOL}" ] || exit 9 +[ -n "${ZEVENT_SUBCLASS}" ] || exit 9 -# Ensure requisite executables are installed. -if ! command -v "${MAIL:=mail}" >/dev/null 2>&1; then - logger -t "${ZED_SYSLOG_TAG:=zed}" \ - -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ - `basename "$0"`: "${MAIL}" not installed - exit 3 -fi -if ! test -x "${ZPOOL}"; then - logger -t "${ZED_SYSLOG_TAG:=zed}" \ - -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ - `basename "$0"`: "${ZPOOL}" not installed - exit 3 +if [ "${ZEVENT_SUBCLASS}" = "resilver.finish" ]; then + action="resilver" +elif [ "${ZEVENT_SUBCLASS}" = "scrub.finish" ]; then + action="scrub" +else + zed_log_err "unsupported event class \"${ZEVENT_SUBCLASS}\"" + exit 9 fi +zed_check_cmd "mail" "${ZPOOL}" || exit 9 + # For scrub, suppress email if pool is healthy and verbosity is not enabled. 
-if test "${ZEVENT_SUBCLASS}" = "scrub.finish"; then - HEALTHY=`"${ZPOOL}" status -x "${ZEVENT_POOL}" | \ - grep "'${ZEVENT_POOL}' is healthy"` - test -n "${HEALTHY}" -a "${ZED_EMAIL_VERBOSE:=0}" = 0 && exit 2 +# +if [ "${ZEVENT_SUBCLASS}" = "scrub.finish" ]; then + healthy="$("${ZPOOL}" status -x "${ZEVENT_POOL}" \ + | grep "'${ZEVENT_POOL}' is healthy")" + [ -n "${healthy}" ] && [ "${ZED_EMAIL_VERBOSE}" -eq 0 ] && exit 3 fi -"${MAIL}" -s "ZFS ${ZEVENT_SUBCLASS} event for ${ZEVENT_POOL} on `hostname`" \ - "${ZED_EMAIL}" < "${email_pathname}" </dev/null 2>&1; then + zed_log_err "\"${cmd}\" not installed" + rv=$((rv + 1)) + fi + done + return "${rv}" +} + + +# zed_log_msg (msg, ...) +# +# Write all argument strings to the system log. +# +# Globals +# ZED_SYSLOG_PRIORITY +# ZED_SYSLOG_TAG +# +# Return +# nothing +# +zed_log_msg() +{ + logger -p "${ZED_SYSLOG_PRIORITY}" -t "${ZED_SYSLOG_TAG}" -- "$@" +} + + +# zed_log_err (msg, ...) +# +# Write an error message to the system log. This message will contain the +# script name, EID, and all argument strings. +# +# Globals +# ZED_SYSLOG_PRIORITY +# ZED_SYSLOG_TAG +# ZEVENT_EID +# +# Return +# nothing +# +zed_log_err() +{ + logger -p "${ZED_SYSLOG_PRIORITY}" -t "${ZED_SYSLOG_TAG}" -- "error:" \ + "$(basename -- "$0"):" "${ZEVENT_EID:+"eid=${ZEVENT_EID}:"}" "$@" +} + + +# zed_lock (lockfile, [fd]) +# +# Obtain an exclusive (write) lock on [lockfile]. If the lock cannot be +# immediately acquired, wait until it becomes available. +# +# Every zed_lock() must be paired with a corresponding zed_unlock(). +# +# By default, flock-style locks associate the lockfile with file descriptor 8. +# The bash manpage warns that file descriptors >9 should be used with care as +# they may conflict with file descriptors used internally by the shell. File +# descriptor 9 is reserved for zed_rate_limit(). 
If concurrent locks are held +# within the same process, they must use different file descriptors (preferably +# decrementing from 8); otherwise, obtaining a new lock with a given file +# descriptor will release the previous lock associated with that descriptor. +# +# Arguments +# lockfile: pathname of the lock file; the lock will be stored in +# ZED_LOCKDIR unless the pathname contains a "/". +# fd: integer for the file descriptor used by flock (OPTIONAL unless holding +# concurrent locks) +# +# Globals +# ZED_FLOCK_FD +# ZED_LOCKDIR +# +# Return +# nothing +# +zed_lock() +{ + local lockfile="$1" + local fd="${2:-${ZED_FLOCK_FD}}" + local umask_bak + local err + + [ -n "${lockfile}" ] || return + if ! expr "${lockfile}" : '.*/' >/dev/null 2>&1; then + lockfile="${ZED_LOCKDIR}/${lockfile}" + fi + + umask_bak="$(umask)" + umask 077 + + # Obtain a lock on the file bound to the given file descriptor. + # + eval "exec ${fd}> '${lockfile}'" + err="$(flock --exclusive "${fd}" 2>&1)" + if [ $? -ne 0 ]; then + zed_log_err "failed to lock \"${lockfile}\": ${err}" + fi + + umask "${umask_bak}" +} + + +# zed_unlock (lockfile, [fd]) +# +# Release the lock on [lockfile]. +# +# Arguments +# lockfile: pathname of the lock file +# fd: integer for the file descriptor used by flock (must match the file +# descriptor passed to the zed_lock function call) +# +# Globals +# ZED_FLOCK_FD +# ZED_LOCKDIR +# +# Return +# nothing +# +zed_unlock() +{ + local lockfile="$1" + local fd="${2:-${ZED_FLOCK_FD}}" + local err + + [ -n "${lockfile}" ] || return + if ! expr "${lockfile}" : '.*/' >/dev/null 2>&1; then + lockfile="${ZED_LOCKDIR}/${lockfile}" + fi + + # Release the lock and close the file descriptor. + # + err="$(flock --unlock "${fd}" 2>&1)" + if [ $? 
-ne 0 ]; then + zed_log_err "failed to unlock \"${lockfile}\": ${err}" + fi + eval "exec ${fd}>&-" +} + + +# zed_rate_limit (tag, [interval]) +# +# Check whether an event of a given type [tag] has already occurred within the +# last [interval] seconds. +# +# This function obtains a lock on the statefile using file descriptor 9. +# +# Arguments +# tag: arbitrary string for grouping related events to rate-limit +# interval: time interval in seconds (OPTIONAL) +# +# Globals +# ZED_EMAIL_INTERVAL_SECS +# ZED_RUNDIR +# +# Return +# 0 if the event should be processed +# 1 if the event should be dropped +# +# State File Format +# time;tag +# +zed_rate_limit() +{ + local tag="$1" + local interval="${2:-${ZED_EMAIL_INTERVAL_SECS}}" + local lockfile="zed.zedlet.state.lock" + local lockfile_fd=9 + local statefile="${ZED_RUNDIR}/zed.zedlet.state" + local time_now + local time_prev + local umask_bak + local rv=0 + + [ -n "${tag}" ] || return 0 + + zed_lock "${lockfile}" "${lockfile_fd}" + time_now="$(date +%s)" + time_prev="$(egrep "^[0-9]+;${tag}\$" "${statefile}" 2>/dev/null \ + | tail -1 | cut -d\; -f1)" + + if [ -n "${time_prev}" ] \ + && [ "$((time_now - time_prev))" -lt "${interval}" ]; then + rv=1 + else + umask_bak="$(umask)" + umask 077 + egrep -v "^[0-9]+;${tag}\$" "${statefile}" 2>/dev/null \ + > "${statefile}.$$" + echo "${time_now};${tag}" >> "${statefile}.$$" + mv -f "${statefile}.$$" "${statefile}" + umask "${umask_bak}" + fi + + zed_unlock "${lockfile}" "${lockfile_fd}" + return "${rv}" +} diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc index 69989f953..4c53207d7 100644 --- a/cmd/zed/zed.d/zed.rc +++ b/cmd/zed/zed.d/zed.rc @@ -1,34 +1,60 @@ +## # zed.rc +## +## # Absolute path to the debug output file. +# #ZED_DEBUG_LOG="/tmp/zed.debug.log" +## # Email address of the zpool administrator. # Email will only be sent if ZED_EMAIL is defined. +# Disabled by default; uncomment to enable. 
+# #ZED_EMAIL="root" +## +# Minimum number of seconds between emails for a similar event. +# +#ZED_EMAIL_INTERVAL_SECS=3600 + +## # Email verbosity. # If set to 0, suppress email if the pool is healthy. # If set to 1, send email regardless of pool health. +# #ZED_EMAIL_VERBOSE=0 -# Minimum number of seconds between emails sent for a similar event. -#ZED_EMAIL_INTERVAL_SECS="3600" - +## # Default directory for zed lock files. +# #ZED_LOCKDIR="/var/lock" +## # Default directory for zed state files. +# #ZED_RUNDIR="/var/run" -# The syslog priority (eg, specified as a "facility.level" pair). +## +# Replace a device with a hot spare after N checksum errors are detected. +# Disabled by default; uncomment to enable. +# +#ZED_SPARE_ON_CHECKSUM_ERRORS=10 + +## +# Replace a device with a hot spare after N I/O errors are detected. +# Disabled by default; uncomment to enable. +# +#ZED_SPARE_ON_IO_ERRORS=1 + +## +# The syslog priority (e.g., specified as a "facility.level" pair). +# #ZED_SYSLOG_PRIORITY="daemon.notice" +## # The syslog tag for marking zed events. +# #ZED_SYSLOG_TAG="zed" -# Replace a device with a hot spare after N I/O errors are detected. -#ZED_SPARE_ON_IO_ERRORS=1 - -# Replace a device with a hot spare after N checksum errors are detected. -#ZED_SPARE_ON_CHECKSUM_ERRORS=10 -- 2.40.0