]> granicus.if.org Git - zfs/commitdiff
3246 ZFS I/O deadman thread
authorGeorge.Wilson <george.wilson@delphix.com>
Mon, 29 Apr 2013 22:49:23 +0000 (15:49 -0700)
committerBrian Behlendorf <behlendorf1@llnl.gov>
Thu, 2 May 2013 00:05:52 +0000 (17:05 -0700)
Reviewed by: Matt Ahrens <matthew.ahrens@delphix.com>
Reviewed by: Eric Schrock <eric.schrock@delphix.com>
Reviewed by: Christopher Siden <chris.siden@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>

NOTES: This patch has been reworked from the original in the
following ways to accomidate Linux ZFS implementation

*) Usage of the cyclic interface was replaced by the delayed taskq
   interface.  This avoids the need to implement new compatibility
   code and allows us to rely on the existing taskq implementation.

*) An extern for zfs_txg_synctime_ms was added to sys/dsl_pool.h
   because declaring externs in source files as was done in the
   original patch is just plain wrong.

*) Instead of panicing the system when the deadman triggers a
   zevent describing the blocked vdev and the first pending I/O
   is posted.  If the panic behavior is desired Linux provides
   other generic methods to panic the system when threads are
   observed to hang.

*) For reference, to delay zios by 30 seconds for testing you can
   use zinject as follows: 'zinject -d <vdev> -D30 <pool>'

References:
  illumos/illumos-gate@283b84606b6fc326692c03273de1774e8c122f9a
  https://www.illumos.org/issues/3246

Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1396

20 files changed:
cmd/zinject/translate.c
cmd/zinject/zinject.c
include/sys/dsl_pool.h
include/sys/fm/fs/zfs.h
include/sys/spa.h
include/sys/spa_impl.h
include/sys/vdev.h
include/sys/vdev_impl.h
include/sys/zfs_context.h
include/sys/zfs_ioctl.h
include/sys/zio.h
lib/libzpool/taskq.c
module/zfs/spa.c
module/zfs/spa_misc.c
module/zfs/vdev.c
module/zfs/vdev_disk.c
module/zfs/vdev_queue.c
module/zfs/zfs_fm.c
module/zfs/zio.c
module/zfs/zio_inject.c

index 160786641e4e164a82070e14f189c5cb46fcbc38..fc1612738e111f8e8df2b357101df68963522291 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <libzfs.h>
@@ -476,6 +477,20 @@ translate_device(const char *pool, const char *device, err_type_t label_type,
                    &record->zi_guid) == 0);
        }
 
+       /*
+        * Device faults can take on three different forms:
+        * 1). delayed or hanging I/O
+        * 2). zfs label faults
+        * 3). generic disk faults
+        */
+       if (record->zi_timer != 0) {
+               record->zi_cmd = ZINJECT_DELAY_IO;
+       } else if (label_type != TYPE_INVAL) {
+               record->zi_cmd = ZINJECT_LABEL_FAULT;
+       } else {
+               record->zi_cmd = ZINJECT_DEVICE_FAULT;
+       }
+
        switch (label_type) {
        default:
                break;
index d584eadd82650c63c853bcc07ee0c0a638b0dedd..9ae3f9ddc3adc9f0a4501c69ef3fbee876bf6321 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /*
@@ -235,8 +236,8 @@ usage(void)
            "\t\t'pad1', or 'pad2'.\n"
            "\t\t'errno' can be 'nxio' (the default), 'io', or 'dtl'.\n"
            "\n"
-           "\tzinject -d device -A <degrade|fault> pool\n"
-           "\t\tPerform a specific action on a particular device\n"
+           "\tzinject -d device -A <degrade|fault> -D <delay secs> pool\n"
+           "\t\tPerform a specific action on a particular device.\n"
            "\n"
            "\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
            "\t\tCause the pool to stop writing blocks yet not\n"
@@ -589,7 +590,7 @@ main(int argc, char **argv)
        }
 
        while ((c = getopt(argc, argv,
-           ":aA:b:d:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
+           ":aA:b:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
                switch (c) {
                case 'a':
                        flags |= ZINJECT_FLUSH_ARC;
@@ -615,6 +616,16 @@ main(int argc, char **argv)
                case 'd':
                        device = optarg;
                        break;
+               case 'D':
+                       errno = 0;
+                       record.zi_timer = strtoull(optarg, &end, 10);
+                       if (errno != 0 || *end != '\0') {
+                               (void) fprintf(stderr, "invalid i/o delay "
+                                   "value: '%s'\n", optarg);
+                               usage();
+                               return (1);
+                       }
+                       break;
                case 'e':
                        if (strcasecmp(optarg, "io") == 0) {
                                error = EIO;
@@ -679,6 +690,7 @@ main(int argc, char **argv)
                case 'p':
                        (void) strlcpy(record.zi_func, optarg,
                            sizeof (record.zi_func));
+                       record.zi_cmd = ZINJECT_PANIC;
                        break;
                case 'q':
                        quiet = 1;
@@ -762,13 +774,15 @@ main(int argc, char **argv)
                return (1);
        }
 
+       if (record.zi_duration != 0)
+               record.zi_cmd = ZINJECT_IGNORED_WRITES;
+
        if (cancel != NULL) {
                /*
                 * '-c' is invalid with any other options.
                 */
                if (raw != NULL || range != NULL || type != TYPE_INVAL ||
-                   level != 0 || record.zi_func[0] != '\0' ||
-                   record.zi_duration != 0) {
+                   level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) {
                        (void) fprintf(stderr, "cancel (-c) incompatible with "
                            "any other options\n");
                        usage();
@@ -800,8 +814,7 @@ main(int argc, char **argv)
                 * for doing injection, so handle it separately here.
                 */
                if (raw != NULL || range != NULL || type != TYPE_INVAL ||
-                   level != 0 || record.zi_func[0] != '\0' ||
-                   record.zi_duration != 0) {
+                   level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) {
                        (void) fprintf(stderr, "device (-d) incompatible with "
                            "data error injection\n");
                        usage();
@@ -835,7 +848,7 @@ main(int argc, char **argv)
 
        } else if (raw != NULL) {
                if (range != NULL || type != TYPE_INVAL || level != 0 ||
-                   record.zi_func[0] != '\0' || record.zi_duration != 0) {
+                   record.zi_cmd != ZINJECT_UNINITIALIZED) {
                        (void) fprintf(stderr, "raw (-b) format with "
                            "any other options\n");
                        usage();
@@ -858,13 +871,14 @@ main(int argc, char **argv)
                        return (1);
                }
 
+               record.zi_cmd = ZINJECT_DATA_FAULT;
                if (translate_raw(raw, &record) != 0)
                        return (1);
                if (!error)
                        error = EIO;
-       } else if (record.zi_func[0] != '\0') {
+       } else if (record.zi_cmd == ZINJECT_PANIC) {
                if (raw != NULL || range != NULL || type != TYPE_INVAL ||
-                   level != 0 || device != NULL || record.zi_duration != 0) {
+                   level != 0 || device != NULL) {
                        (void) fprintf(stderr, "panic (-p) incompatible with "
                            "other options\n");
                        usage();
@@ -882,7 +896,7 @@ main(int argc, char **argv)
                if (argv[1] != NULL)
                        record.zi_type = atoi(argv[1]);
                dataset[0] = '\0';
-       } else if (record.zi_duration != 0) {
+       } else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) {
                if (nowrites == 0) {
                        (void) fprintf(stderr, "-s or -g meaningless "
                            "without -I (ignore writes)\n");
@@ -936,6 +950,7 @@ main(int argc, char **argv)
                        return (1);
                }
 
+               record.zi_cmd = ZINJECT_DATA_FAULT;
                if (translate_record(type, argv[0], range, level, &record, pool,
                    dataset) != 0)
                        return (1);
index ff5df1414a5eab11528f37e5bcfbf9f552ad62e2..96a2296bf8e9009542482fd1f5f4e76d316d08ca 100644 (file)
@@ -41,6 +41,8 @@
 extern "C" {
 #endif
 
+extern int zfs_txg_synctime_ms;
+
 struct objset;
 struct dsl_dir;
 struct dsl_dataset;
index d5c6004c261aa32ed5d5a4adfd33de3caba4b7e4..741b99e62fa6db409c3d0c56d5c5056c9049f8ac 100644 (file)
@@ -73,6 +73,8 @@ extern "C" {
 #define        FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU         "vdev_fru"
 #define        FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE       "vdev_state"
 #define        FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT      "vdev_ashift"
+#define        FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS     "vdev_complete_ts"
+#define        FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS    "vdev_delta_ts"
 #define        FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID      "parent_guid"
 #define        FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE      "parent_type"
 #define        FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH      "parent_path"
@@ -88,6 +90,9 @@ extern "C" {
 #define        FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE        "zio_stage"
 #define        FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE     "zio_pipeline"
 #define        FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY        "zio_delay"
+#define        FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP    "zio_timestamp"
+#define        FM_EREPORT_PAYLOAD_ZFS_ZIO_DEADLINE     "zio_deadline"
+#define        FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA        "zio_delta"
 #define        FM_EREPORT_PAYLOAD_ZFS_PREV_STATE       "prev_state"
 #define        FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED   "cksum_expected"
 #define        FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL     "cksum_actual"
index 821172297a0dfb84d05b4157c337dca946ab7c4e..ca9fb24384d2373156f97756a05c844b7310eab6 100644 (file)
@@ -570,6 +570,7 @@ extern int spa_offline_log(spa_t *spa);
 
 /* Log claim callback */
 extern void spa_claim_notify(zio_t *zio);
+extern void spa_deadman(void *);
 
 /* Accessor functions */
 extern boolean_t spa_shutting_down(spa_t *spa);
@@ -604,6 +605,7 @@ extern boolean_t spa_suspended(spa_t *spa);
 extern uint64_t spa_bootfs(spa_t *spa);
 extern uint64_t spa_delegation(spa_t *spa);
 extern objset_t *spa_meta_objset(spa_t *spa);
+extern uint64_t spa_deadman_synctime(spa_t *spa);
 
 /* Miscellaneous support routines */
 extern void spa_activate_mos_feature(spa_t *spa, const char *feature);
index 65edc97f526395cea54c42cb8847fcd1a07acaba..5bd0e0a9c1fb10a4580053f4ac9cee6769a13997 100644 (file)
@@ -227,6 +227,10 @@ struct spa {
        uint64_t        spa_feat_for_write_obj; /* required to write to pool */
        uint64_t        spa_feat_for_read_obj;  /* required to read from pool */
        uint64_t        spa_feat_desc_obj;      /* Feature descriptions */
+       taskqid_t       spa_deadman_tqid;       /* Task id */
+       uint64_t        spa_deadman_calls;      /* number of deadman calls */
+       uint64_t        spa_sync_starttime;     /* starting time fo spa_sync */
+       uint64_t        spa_deadman_synctime;   /* deadman expiration timer */
        /*
         * spa_refcnt & spa_config_lock must be the last elements
         * because refcount_t changes size based on compilation options.
index 8f297a917adb6acdea7472d312618dc2829bbb80..f49086a4776c783691a958e58762404ec503a466 100644 (file)
@@ -78,6 +78,7 @@ extern void vdev_metaslab_fini(vdev_t *vd);
 extern void vdev_metaslab_set_size(vdev_t *);
 extern void vdev_expand(vdev_t *vd, uint64_t txg);
 extern void vdev_split(vdev_t *vd);
+extern void vdev_deadman(vdev_t *vd);
 
 
 extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
index 964ee24074b016ceaf8727975e77928e79380b5d..711408e6a9a399cac3f395647c49f674e1533259 100644 (file)
@@ -105,6 +105,8 @@ struct vdev_queue {
        avl_tree_t      vq_read_tree;
        avl_tree_t      vq_write_tree;
        avl_tree_t      vq_pending_tree;
+       uint64_t        vq_io_complete_ts;
+       uint64_t        vq_io_delta_ts;
        list_t          vq_io_list;
        kmutex_t        vq_lock;
 };
index 53080f38234f4807252d9ca3a59e8eca291d1068..a23bfdcf82a29b385a0973240e930f802f6db3b4 100644 (file)
@@ -25,6 +25,7 @@
 /*
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_ZFS_CONTEXT_H
@@ -400,6 +401,8 @@ extern taskq_t      *taskq_create(const char *, int, pri_t, int, int, uint_t);
 #define        taskq_create_sysdc(a, b, d, e, p, dc, f) \
            (taskq_create(a, b, maxclsyspri, d, e, f))
 extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
+extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, uint_t,
+    clock_t);
 extern void    taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
     taskq_ent_t *);
 extern int     taskq_empty_ent(taskq_ent_t *);
@@ -407,6 +410,7 @@ extern void taskq_init_ent(taskq_ent_t *);
 extern void    taskq_destroy(taskq_t *);
 extern void    taskq_wait(taskq_t *);
 extern int     taskq_member(taskq_t *, kthread_t *);
+extern int     taskq_cancel_id(taskq_t *, taskqid_t);
 extern void    system_taskq_init(void);
 extern void    system_taskq_fini(void);
 
@@ -523,6 +527,11 @@ extern vnode_t *rootdir;
 
 extern void delay(clock_t ticks);
 
+#define        SEC_TO_TICK(sec)        ((sec) * hz)
+#define        MSEC_TO_TICK(msec)      ((msec) / (MILLISEC / hz))
+#define        USEC_TO_TICK(usec)      ((usec) / (MICROSEC / hz))
+#define        NSEC_TO_TICK(usec)      ((usec) / (NANOSEC / hz))
+
 #define        gethrestime_sec() time(NULL)
 #define        gethrestime(t) \
        do {\
index 740d8eda4e670f19b73c53decacf81d6d1b133ff..c0cb4705d48776a7809165b45a006a91a51f845a 100644 (file)
@@ -236,6 +236,8 @@ typedef struct zinject_record {
        uint32_t        zi_iotype;
        int32_t         zi_duration;
        uint64_t        zi_timer;
+       uint32_t        zi_cmd;
+       uint32_t        zi_pad;
 } zinject_record_t;
 
 #define        ZINJECT_NULL            0x1
@@ -245,6 +247,16 @@ typedef struct zinject_record {
 #define        ZEVENT_NONBLOCK         0x1
 #define        ZEVENT_SIZE             1024
 
+typedef enum zinject_type {
+       ZINJECT_UNINITIALIZED,
+       ZINJECT_DATA_FAULT,
+       ZINJECT_DEVICE_FAULT,
+       ZINJECT_LABEL_FAULT,
+       ZINJECT_IGNORED_WRITES,
+       ZINJECT_PANIC,
+       ZINJECT_DELAY_IO,
+} zinject_type_t;
+
 typedef struct zfs_share {
        uint64_t        z_exportdata;
        uint64_t        z_sharedata;
index 64efde01691ae46a1d3435a0e9ff6d72226a683d..03530330c95f0a404e357726b7ca2aba1533b5ce 100644 (file)
@@ -21,8 +21,6 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-/*
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
@@ -411,7 +409,10 @@ struct zio {
        const zio_vsd_ops_t *io_vsd_ops;
 
        uint64_t        io_offset;
-       uint64_t        io_deadline;
+       uint64_t        io_deadline;    /* expires at timestamp + deadline */
+       uint64_t        io_timestamp;   /* submitted at (ticks) */
+       uint64_t        io_delta;       /* vdev queue service delta (ticks) */
+       uint64_t        io_delay;       /* vdev disk service delta (ticks) */
        avl_node_t      io_offset_node;
        avl_node_t      io_deadline_node;
        avl_tree_t      *io_vdev_tree;
@@ -423,7 +424,6 @@ struct zio {
        enum zio_flag   io_orig_flags;
        enum zio_stage  io_orig_stage;
        enum zio_stage  io_orig_pipeline;
-       uint64_t        io_delay;
        int             io_error;
        int             io_child_error[ZIO_CHILD_TYPES];
        uint64_t        io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
@@ -560,6 +560,7 @@ extern int zio_handle_fault_injection(zio_t *zio, int error);
 extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
 extern int zio_handle_label_injection(zio_t *zio, int error);
 extern void zio_handle_ignored_writes(zio_t *zio);
+extern uint64_t zio_handle_io_delay(zio_t *zio);
 
 /*
  * Checksum ereport functions
index 803f7dc37444aefcfd55ceea20095104946bc606..64e214205e62c9f6a7ba9e52e8b40da2f400845e 100644 (file)
@@ -147,6 +147,13 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags)
        return (1);
 }
 
+taskqid_t
+taskq_dispatch_delay(taskq_t *tq,  task_func_t func, void *arg, uint_t tqflags,
+    clock_t expire_time)
+{
+       return (0);
+}
+
 int
 taskq_empty_ent(taskq_ent_t *t)
 {
@@ -339,6 +346,12 @@ taskq_member(taskq_t *tq, kthread_t *t)
        return (0);
 }
 
+int
+taskq_cancel_id(taskq_t *tq, taskqid_t id)
+{
+       return (ENOENT);
+}
+
 void
 system_taskq_init(void)
 {
index 0babf47522b9540aab36d87c5130f5a224c384bb..0d3537eaa177a280d21648cebc782071b0a8da1c 100644 (file)
@@ -1013,6 +1013,8 @@ spa_deactivate(spa_t *spa)
        list_destroy(&spa->spa_config_dirty_list);
        list_destroy(&spa->spa_state_dirty_list);
 
+       taskq_cancel_id(system_taskq, spa->spa_deadman_tqid);
+
        for (t = 0; t < ZIO_TYPES; t++) {
                for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
                        if (spa->spa_zio_taskq[t][q] != NULL)
@@ -6017,6 +6019,12 @@ spa_sync(spa_t *spa, uint64_t txg)
 
        tx = dmu_tx_create_assigned(dp, txg);
 
+       spa->spa_sync_starttime = gethrtime();
+       taskq_cancel_id(system_taskq, spa->spa_deadman_tqid);
+       spa->spa_deadman_tqid = taskq_dispatch_delay(system_taskq,
+           spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
+           NSEC_TO_TICK(spa->spa_deadman_synctime));
+
        /*
         * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
         * set spa_deflate if we have no raid-z vdevs.
@@ -6145,6 +6153,9 @@ spa_sync(spa_t *spa, uint64_t txg)
        }
        dmu_tx_commit(tx);
 
+       taskq_cancel_id(system_taskq, spa->spa_deadman_tqid);
+       spa->spa_deadman_tqid = 0;
+
        /*
         * Clear the dirty config list.
         */
index 5ec8e688e7898bca588bb9524f120cd573267766..476f9c02cdea2bac826ec8eaad55286a0dc6c651 100644 (file)
@@ -236,6 +236,24 @@ static avl_tree_t spa_l2cache_avl;
 kmem_cache_t *spa_buffer_pool;
 int spa_mode_global;
 
+/*
+ * Expiration time in units of zfs_txg_synctime_ms. This value has two
+ * meanings. First it is used to determine when the spa_deadman logic
+ * should fire. By default the spa_deadman will fire if spa_sync has
+ * not completed in 1000 * zfs_txg_synctime_ms (i.e. 1000 seconds).
+ * Secondly, the value determines if an I/O is considered "hung".
+ * Any I/O that has not completed in zfs_deadman_synctime is considered
+ * "hung" resulting in a zevent being posted.
+ * 1000 zfs_txg_synctime_ms (i.e. 1000 seconds).
+ */
+unsigned long zfs_deadman_synctime = 1000ULL;
+
+/*
+ * By default the deadman is enabled.
+ */
+int zfs_deadman_enabled = 1;
+
+
 /*
  * ==========================================================================
  * SPA config locking
@@ -412,6 +430,27 @@ spa_lookup(const char *name)
        return (spa);
 }
 
+/*
+ * Fires when spa_sync has not completed within zfs_deadman_synctime_ms.
+ * If the zfs_deadman_enabled flag is set then it inspects all vdev queues
+ * looking for potentially hung I/Os.
+ */
+void
+spa_deadman(void *arg)
+{
+       spa_t *spa = arg;
+
+       zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
+           (gethrtime() - spa->spa_sync_starttime) / NANOSEC,
+           ++spa->spa_deadman_calls);
+       if (zfs_deadman_enabled)
+               vdev_deadman(spa->spa_root_vdev);
+
+       spa->spa_deadman_tqid = taskq_dispatch_delay(system_taskq,
+           spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
+           NSEC_TO_TICK(spa->spa_deadman_synctime));
+}
+
 /*
  * Create an uninitialized spa_t with the given name.  Requires
  * spa_namespace_lock.  The caller must ensure that the spa_t doesn't already
@@ -454,6 +493,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
        spa->spa_proc = &p0;
        spa->spa_proc_state = SPA_PROC_NONE;
 
+       spa->spa_deadman_synctime = zfs_deadman_synctime *
+           zfs_txg_synctime_ms * MICROSEC;
+
        refcount_create(&spa->spa_refcount);
        spa_config_lock_init(spa);
 
@@ -1492,6 +1534,12 @@ spa_prev_software_version(spa_t *spa)
        return (spa->spa_prev_software_version);
 }
 
+uint64_t
+spa_deadman_synctime(spa_t *spa)
+{
+       return (spa->spa_deadman_synctime);
+}
+
 uint64_t
 dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
 {
@@ -1812,4 +1860,10 @@ EXPORT_SYMBOL(spa_writeable);
 EXPORT_SYMBOL(spa_mode);
 
 EXPORT_SYMBOL(spa_namespace_lock);
+
+module_param(zfs_deadman_synctime, ulong, 0644);
+MODULE_PARM_DESC(zfs_deadman_synctime,"Expire in units of zfs_txg_synctime_ms");
+
+module_param(zfs_deadman_enabled, int, 0644);
+MODULE_PARM_DESC(zfs_deadman_enabled, "Enable deadman timer");
 #endif
index afff4d423deac58ef1770f61c28010774ac7cee0..15ff30fb3b4e585ea51a8ee948953b454651f950 100644 (file)
@@ -3195,6 +3195,46 @@ vdev_split(vdev_t *vd)
        vdev_propagate_state(cvd);
 }
 
+void
+vdev_deadman(vdev_t *vd)
+{
+       int c;
+
+       for (c = 0; c < vd->vdev_children; c++) {
+               vdev_t *cvd = vd->vdev_child[c];
+
+               vdev_deadman(cvd);
+       }
+
+       if (vd->vdev_ops->vdev_op_leaf) {
+               vdev_queue_t *vq = &vd->vdev_queue;
+
+               mutex_enter(&vq->vq_lock);
+               if (avl_numnodes(&vq->vq_pending_tree) > 0) {
+                       spa_t *spa = vd->vdev_spa;
+                       zio_t *fio;
+                       uint64_t delta;
+
+                       /*
+                        * Look at the head of all the pending queues,
+                        * if any I/O has been outstanding for longer than
+                        * the spa_deadman_synctime we log a zevent.
+                        */
+                       fio = avl_first(&vq->vq_pending_tree);
+                       delta = ddi_get_lbolt64() - fio->io_timestamp;
+                       if (delta > NSEC_TO_TICK(spa_deadman_synctime(spa))) {
+                               zfs_dbgmsg("SLOW IO: zio timestamp %llu, "
+                                   "delta %llu, last io %llu",
+                                   fio->io_timestamp, delta,
+                                   vq->vq_io_complete_ts);
+                               zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
+                                   spa, vd, fio, 0, 0);
+                       }
+               }
+               mutex_exit(&vq->vq_lock);
+       }
+}
+
 #if defined(_KERNEL) && defined(HAVE_SPL)
 EXPORT_SYMBOL(vdev_fault);
 EXPORT_SYMBOL(vdev_degrade);
index 08f64a91361ce5678c6fb4f7855d4bf2af4194ad..31d1a28bdc9bfff35392cbd35e9a69915bd2447c 100644 (file)
@@ -407,8 +407,7 @@ vdev_disk_dio_put(dio_request_t *dr)
                vdev_disk_dio_free(dr);
 
                if (zio) {
-                       zio->io_delay = jiffies_to_msecs(
-                           jiffies_64 - zio->io_delay);
+                       zio->io_delay = jiffies_64 - zio->io_delay;
                        zio->io_error = error;
                        ASSERT3S(zio->io_error, >=, 0);
                        if (zio->io_error)
@@ -609,7 +608,7 @@ BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, size, rc)
 {
        zio_t *zio = bio->bi_private;
 
-       zio->io_delay = jiffies_to_msecs(jiffies_64 - zio->io_delay);
+       zio->io_delay = jiffies_64 - zio->io_delay;
        zio->io_error = -rc;
        if (rc && (rc == -EOPNOTSUPP))
                zio->io_vd->vdev_nowritecache = B_TRUE;
index e2096fac9d63d70397468fbd85bfa8f5617ea98e..3f2793ba4f1623f7e1c0374c0a5ffd73295b8360 100644 (file)
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
 #include <sys/zfs_context.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
@@ -319,6 +323,7 @@ again:
                    vi, size, fio->io_type, ZIO_PRIORITY_AGG,
                    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
                    vdev_queue_agg_io_done, NULL);
+               aio->io_timestamp = fio->io_timestamp;
 
                nio = fio;
                do {
@@ -391,7 +396,8 @@ vdev_queue_io(zio_t *zio)
 
        mutex_enter(&vq->vq_lock);
 
-       zio->io_deadline = (ddi_get_lbolt64() >> zfs_vdev_time_shift) +
+       zio->io_timestamp = ddi_get_lbolt64();
+       zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
            zio->io_priority;
 
        vdev_queue_io_add(vq, zio);
@@ -417,10 +423,17 @@ vdev_queue_io_done(zio_t *zio)
        vdev_queue_t *vq = &zio->io_vd->vdev_queue;
        int i;
 
+       if (zio_injection_enabled)
+               delay(SEC_TO_TICK(zio_handle_io_delay(zio)));
+
        mutex_enter(&vq->vq_lock);
 
        avl_remove(&vq->vq_pending_tree, zio);
 
+       zio->io_delta = ddi_get_lbolt64() - zio->io_timestamp;
+       vq->vq_io_complete_ts = ddi_get_lbolt64();
+       vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp;
+
        for (i = 0; i < zfs_vdev_ramp_rate; i++) {
                zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
                if (nio == NULL)
index 820291bf4b04e400e580535c99144bf4196eb6e6..af2030ae7f3670a57c8289c924ed21a974880867 100644 (file)
@@ -250,6 +250,7 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
 
        if (vd != NULL) {
                vdev_t *pvd = vd->vdev_parent;
+               vdev_queue_t *vq = &vd->vdev_queue;
 
                fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
                    DATA_TYPE_UINT64, vd->vdev_guid,
@@ -272,6 +273,15 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
                            FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT,
                            DATA_TYPE_UINT64, vd->vdev_ashift, NULL);
 
+               if (vq != NULL) {
+                       fm_payload_set(ereport,
+                           FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS,
+                           DATA_TYPE_UINT64, vq->vq_io_complete_ts, NULL);
+                       fm_payload_set(ereport,
+                           FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS,
+                           DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL);
+               }
+
                if (pvd != NULL) {
                        fm_payload_set(ereport,
                            FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
@@ -304,6 +314,12 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
                    DATA_TYPE_UINT32, zio->io_pipeline, NULL);
                fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY,
                    DATA_TYPE_UINT64, zio->io_delay, NULL);
+               fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP,
+                   DATA_TYPE_UINT64, zio->io_timestamp, NULL);
+               fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DEADLINE,
+                   DATA_TYPE_UINT64, zio->io_deadline, NULL);
+               fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA,
+                   DATA_TYPE_UINT64, zio->io_delta, NULL);
 
                /*
                 * If the 'size' parameter is non-zero, it indicates this is a
index 0622553f5f23a32ba069420508755d66094274ec..943f2d677a96376fc1bce6d29f80ef109332a9c5 100644 (file)
@@ -609,6 +609,9 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
        zio->io_vsd_ops = NULL;
        zio->io_offset = offset;
        zio->io_deadline = 0;
+       zio->io_timestamp = 0;
+       zio->io_delta = 0;
+       zio->io_delay = 0;
        zio->io_orig_data = zio->io_data = data;
        zio->io_orig_size = zio->io_size = size;
        zio->io_orig_flags = zio->io_flags = flags;
@@ -620,7 +623,6 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
        zio->io_bp_override = NULL;
        zio->io_walk_link = NULL;
        zio->io_transform_stack = NULL;
-       zio->io_delay = 0;
        zio->io_error = 0;
        zio->io_child_count = 0;
        zio->io_parent_count = 0;
@@ -2906,11 +2908,11 @@ zio_done(zio_t *zio)
        vdev_stat_update(zio, zio->io_size);
 
        /*
-        * If this I/O is attached to a particular vdev is slow, exeeding
+        * If this I/O is attached to a particular vdev is slow, exceeding
         * 30 seconds to complete, post an error described the I/O delay.
         * We ignore these errors if the device is currently unavailable.
         */
-       if (zio->io_delay >= zio_delay_max) {
+       if (zio->io_delay >= MSEC_TO_TICK(zio_delay_max)) {
                if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
                        zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
                                          zio->io_vd, zio, 0, 0);
index 293f267fc9ed30716d8891b4638a2778ad715d21..eb589c4299cdad6e114180caabc8184e6b323183 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /*
@@ -147,14 +148,8 @@ zio_handle_fault_injection(zio_t *zio, int error)
        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {
 
-               /* Ignore errors not destined for this pool */
-               if (zio->io_spa != handler->zi_spa)
-                       continue;
-
-               /* Ignore device errors and panic injection */
-               if (handler->zi_record.zi_guid != 0 ||
-                   handler->zi_record.zi_func[0] != '\0' ||
-                   handler->zi_record.zi_duration != 0)
+               if (zio->io_spa != handler->zi_spa ||
+                   handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
                        continue;
 
                /* If this handler matches, return EIO */
@@ -197,10 +192,7 @@ zio_handle_label_injection(zio_t *zio, int error)
                uint64_t start = handler->zi_record.zi_start;
                uint64_t end = handler->zi_record.zi_end;
 
-               /* Ignore device only faults or panic injection */
-               if (handler->zi_record.zi_start == 0 ||
-                   handler->zi_record.zi_func[0] != '\0' ||
-                   handler->zi_record.zi_duration != 0)
+               if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
                        continue;
 
                /*
@@ -246,13 +238,7 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {
 
-               /*
-                * Ignore label specific faults, panic injection
-                * or fake writes
-                */
-               if (handler->zi_record.zi_start != 0 ||
-                   handler->zi_record.zi_func[0] != '\0' ||
-                   handler->zi_record.zi_duration != 0)
+               if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
                        continue;
 
                if (vd->vdev_guid == handler->zi_record.zi_guid) {
@@ -316,10 +302,8 @@ zio_handle_ignored_writes(zio_t *zio)
            handler = list_next(&inject_handlers, handler)) {
 
                /* Ignore errors not destined for this pool */
-               if (zio->io_spa != handler->zi_spa)
-                       continue;
-
-               if (handler->zi_record.zi_duration == 0)
+               if (zio->io_spa != handler->zi_spa ||
+                   handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
                        continue;
 
                /*
@@ -355,11 +339,8 @@ spa_handle_ignored_writes(spa_t *spa)
        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {
 
-               /* Ignore errors not destined for this pool */
-               if (spa != handler->zi_spa)
-                       continue;
-
-               if (handler->zi_record.zi_duration == 0)
+               if (spa != handler->zi_spa ||
+                   handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
                        continue;
 
                if (handler->zi_record.zi_duration > 0) {
@@ -379,6 +360,34 @@ spa_handle_ignored_writes(spa_t *spa)
        rw_exit(&inject_lock);
 }
 
+uint64_t
+zio_handle_io_delay(zio_t *zio)
+{
+       vdev_t *vd = zio->io_vd;
+       inject_handler_t *handler;
+       uint64_t seconds = 0;
+
+       if (zio_injection_enabled == 0)
+               return (0);
+
+       rw_enter(&inject_lock, RW_READER);
+
+       for (handler = list_head(&inject_handlers); handler != NULL;
+           handler = list_next(&inject_handlers, handler)) {
+
+               if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
+                       continue;
+
+               if (vd->vdev_guid == handler->zi_record.zi_guid) {
+                       seconds = handler->zi_record.zi_timer;
+                       break;
+               }
+
+       }
+       rw_exit(&inject_lock);
+       return (seconds);
+}
+
 /*
  * Create a new handler for the given record.  We add it to the list, adding
  * a reference to the spa_t in the process.  We increment zio_injection_enabled,