Add visibility into txg sync behavior
author     Brian Behlendorf <behlendorf1@llnl.gov>
           Tue, 1 Oct 2013 16:50:50 +0000 (09:50 -0700)
committer  Brian Behlendorf <behlendorf1@llnl.gov>
           Fri, 25 Oct 2013 20:57:25 +0000 (13:57 -0700)
This change is an attempt to add visibility into how txgs are being
formed on a system, in real time. To do this, a list was added to the
in-memory SPA data structure for a pool, with each element on the list
corresponding to a txg. These entries are then exported through the
kstat interface, which can then be interpreted in userspace.

For each txg, the following information is exported:

 * Unique txg number (uint64_t)
 * The time the txg was born (hrtime_t)
   (*not* wall clock time; relative to the other entries on the list)
 * The current txg state ((O)pen/(Q)uiescing/(S)yncing/(C)ommitted)
 * The number of reserved bytes for the txg (uint64_t)
 * The number of bytes read during the txg (uint64_t)
 * The number of bytes written during the txg (uint64_t)
 * The number of read operations during the txg (uint64_t)
 * The number of write operations during the txg (uint64_t)
 * The time the txg was closed (hrtime_t)
 * The time the txg was quiesced (hrtime_t)
 * The time the txg was synced (hrtime_t)

Note that while the raw kstat stores hrtimes for the open, quiesce,
and sync events, the output handlers use those values to compute how
long the txg spent in each state and print the resulting deltas
(otime, qtime, stime).
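
For example, once enabled via the zfs_txg_history module parameter,
the per-pool kstat can be read from userspace, and writing to it
discards the accumulated history. The pool name "tank" and the row
values below are hypothetical; the paths assume the usual Linux
locations for SPL kstats and module parameters:

  # echo 50 > /sys/module/zfs/parameters/zfs_txg_history
  # cat /proc/spl/kstat/zfs/tank/txgs
  txg      birth            state nreserved    nread        nwritten     reads    writes   otime        qtime        stime
  985452   258335406191921  C     0            2198528      4590592      34       47       5182016556   21933344     65832823
  # echo 0 > /proc/spl/kstat/zfs/tank/txgs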

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
include/sys/spa.h
module/zfs/spa_stats.c
module/zfs/txg.c

index a4a717ac04509db96891c9bf1c3bd8e19c26e527..85a62588aedd38ef198ebc01e8f67211e2f55ff2 100644 (file)
@@ -548,12 +548,26 @@ typedef struct spa_stats_history {
 
 typedef struct spa_stats {
        spa_stats_history_t     read_history;
+       spa_stats_history_t     txg_history;
 } spa_stats_t;
 
+typedef enum txg_state {
+       TXG_STATE_BIRTH         = 0,
+       TXG_STATE_OPEN          = 1,
+       TXG_STATE_QUIESCED      = 2,
+       TXG_STATE_SYNCED        = 3,
+       TXG_STATE_COMMITTED     = 4,
+} txg_state_t;
+
 extern void spa_stats_init(spa_t *spa);
 extern void spa_stats_destroy(spa_t *spa);
 extern void spa_read_history_add(spa_t *spa, const zbookmark_t *zb,
     uint32_t aflags);
+extern void spa_txg_history_add(spa_t *spa, uint64_t txg);
+extern int spa_txg_history_set(spa_t *spa,  uint64_t txg,
+    txg_state_t completed_state, hrtime_t completed_time);
+extern int spa_txg_history_set_io(spa_t *spa,  uint64_t txg, uint64_t nread,
+    uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t nreserved);
 
 /* Pool configuration locks */
 extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw);
index 271a3c312f7e7a9dec2fe3d94fd5f30091437468..16ccbb0ad53a40059e280d92e3e4168bfc6a275c 100644 (file)
@@ -32,6 +32,11 @@ int zfs_read_history = 0;
  */
 int zfs_read_history_hits = 0;
 
+/*
+ * Keeps stats on the last N txgs, disabled by default.
+ */
+int zfs_txg_history = 0;
+
 /*
  * ==========================================================================
  * SPA Read History Routines
@@ -227,15 +232,289 @@ spa_read_history_add(spa_t *spa, const zbookmark_t *zb, uint32_t aflags)
        mutex_exit(&ssh->lock);
 }
 
+/*
+ * ==========================================================================
+ * SPA TXG History Routines
+ * ==========================================================================
+ */
+
+/*
+ * Txg statistics - Information exported regarding each txg sync
+ */
+
+typedef struct spa_txg_history {
+       uint64_t        txg;            /* txg id */
+       txg_state_t     state;          /* active txg state */
+       uint64_t        nread;          /* number of bytes read */
+       uint64_t        nwritten;       /* number of bytes written */
+       uint64_t        reads;          /* number of read operations */
+       uint64_t        writes;         /* number of write operations */
+       uint64_t        nreserved;      /* number of bytes reserved */
+       hrtime_t        times[TXG_STATE_COMMITTED]; /* completion times */
+       list_node_t     sth_link;
+} spa_txg_history_t;
+
+static int
+spa_txg_history_headers(char *buf, size_t size)
+{
+       size = snprintf(buf, size - 1, "%-8s %-16s %-5s %-12s %-12s %-12s "
+           "%-8s %-8s %-12s %-12s %-12s\n", "txg", "birth", "state",
+           "nreserved", "nread", "nwritten", "reads", "writes",
+           "otime", "qtime", "stime");
+       buf[size] = '\0';
+
+       return (0);
+}
+
+static int
+spa_txg_history_data(char *buf, size_t size, void *data)
+{
+       spa_txg_history_t *sth = (spa_txg_history_t *)data;
+       uint64_t open = 0, quiesce = 0, sync = 0;
+       char state;
+
+       switch (sth->state) {
+               case TXG_STATE_BIRTH:           state = 'B';    break;
+               case TXG_STATE_OPEN:            state = 'O';    break;
+               case TXG_STATE_QUIESCED:        state = 'Q';    break;
+               case TXG_STATE_SYNCED:          state = 'S';    break;
+               case TXG_STATE_COMMITTED:       state = 'C';    break;
+               default:                        state = '?';    break;
+       }
+
+       if (sth->times[TXG_STATE_OPEN])
+               open = sth->times[TXG_STATE_OPEN] -
+                   sth->times[TXG_STATE_BIRTH];
+
+       if (sth->times[TXG_STATE_QUIESCED])
+               quiesce = sth->times[TXG_STATE_QUIESCED] -
+                   sth->times[TXG_STATE_OPEN];
+
+       if (sth->times[TXG_STATE_SYNCED])
+               sync = sth->times[TXG_STATE_SYNCED] -
+                   sth->times[TXG_STATE_QUIESCED];
+
+       size = snprintf(buf, size - 1, "%-8llu %-16llu %-5c %-12llu "
+           "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu\n",
+           (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state,
+           (u_longlong_t)sth->nreserved,
+           (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten,
+           (u_longlong_t)sth->reads, (u_longlong_t)sth->writes,
+           (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)sync);
+       buf[size] = '\0';
+
+       return (0);
+}
+
+/*
+ * Calculate the address for the next spa_txg_history_t entry.  The
+ * ssh->lock will be held until ksp->ks_ndata entries are processed.
+ */
+static void *
+spa_txg_history_addr(kstat_t *ksp, loff_t n)
+{
+       spa_t *spa = ksp->ks_private;
+       spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
+
+       ASSERT(MUTEX_HELD(&ssh->lock));
+
+       if (n == 0)
+               ssh->private = list_tail(&ssh->list);
+       else if (ssh->private)
+               ssh->private = list_prev(&ssh->list, ssh->private);
+
+       return (ssh->private);
+}
+
+/*
+ * When the kstat is written, discard all spa_txg_history_t entries.  The
+ * ssh->lock will be held until ksp->ks_ndata entries are processed.
+ */
+static int
+spa_txg_history_update(kstat_t *ksp, int rw)
+{
+       spa_t *spa = ksp->ks_private;
+       spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
+
+       ASSERT(MUTEX_HELD(&ssh->lock));
+
+       if (rw == KSTAT_WRITE) {
+               spa_txg_history_t *sth;
+
+               while ((sth = list_remove_head(&ssh->list))) {
+                       ssh->size--;
+                       kmem_free(sth, sizeof(spa_txg_history_t));
+               }
+
+               ASSERT3U(ssh->size, ==, 0);
+       }
+
+       ksp->ks_ndata = ssh->size;
+       ksp->ks_data_size = ssh->size * sizeof(spa_txg_history_t);
+
+       return (0);
+}
+
+static void
+spa_txg_history_init(spa_t *spa)
+{
+       spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
+       char name[KSTAT_STRLEN];
+       kstat_t *ksp;
+
+       mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);
+       list_create(&ssh->list, sizeof (spa_txg_history_t),
+           offsetof(spa_txg_history_t, sth_link));
+
+       ssh->count = 0;
+       ssh->size = 0;
+       ssh->private = NULL;
+
+       (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa));
+       name[KSTAT_STRLEN-1] = '\0';
+
+       ksp = kstat_create(name, 0, "txgs", "misc",
+           KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+       ssh->kstat = ksp;
+
+       if (ksp) {
+               ksp->ks_lock = &ssh->lock;
+               ksp->ks_data = NULL;
+               ksp->ks_private = spa;
+               ksp->ks_update = spa_txg_history_update;
+               kstat_set_raw_ops(ksp, spa_txg_history_headers,
+                   spa_txg_history_data, spa_txg_history_addr);
+               kstat_install(ksp);
+       }
+}
+
+static void
+spa_txg_history_destroy(spa_t *spa)
+{
+       spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
+       spa_txg_history_t *sth;
+       kstat_t *ksp;
+
+       ksp = ssh->kstat;
+       if (ksp)
+               kstat_delete(ksp);
+
+       mutex_enter(&ssh->lock);
+       while ((sth = list_remove_head(&ssh->list))) {
+               ssh->size--;
+               kmem_free(sth, sizeof(spa_txg_history_t));
+       }
+
+       ASSERT3U(ssh->size, ==, 0);
+       list_destroy(&ssh->list);
+       mutex_exit(&ssh->lock);
+
+       mutex_destroy(&ssh->lock);
+}
+
+/*
+ * Add a new txg to historical record.
+ */
+void
+spa_txg_history_add(spa_t *spa, uint64_t txg)
+{
+       spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
+       spa_txg_history_t *sth, *rm;
+
+       if (zfs_txg_history == 0 && ssh->size == 0)
+               return;
+
+       sth = kmem_zalloc(sizeof(spa_txg_history_t), KM_PUSHPAGE);
+       sth->txg = txg;
+       sth->state = TXG_STATE_OPEN;
+       sth->times[TXG_STATE_BIRTH] = gethrtime();
+
+       mutex_enter(&ssh->lock);
+
+       list_insert_head(&ssh->list, sth);
+       ssh->size++;
+
+       while (ssh->size > zfs_txg_history) {
+               ssh->size--;
+               rm = list_remove_tail(&ssh->list);
+               kmem_free(rm, sizeof(spa_txg_history_t));
+       }
+
+       mutex_exit(&ssh->lock);
+}
+
+/*
+ * Set txg state completion time and increment current state.
+ */
+int
+spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state,
+    hrtime_t completed_time)
+{
+       spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
+       spa_txg_history_t *sth;
+       int error = ENOENT;
+
+       if (zfs_txg_history == 0)
+               return (0);
+
+       mutex_enter(&ssh->lock);
+       for (sth = list_head(&ssh->list); sth != NULL;
+            sth = list_next(&ssh->list, sth)) {
+               if (sth->txg == txg) {
+                       sth->times[completed_state] = completed_time;
+                       sth->state++;
+                       error = 0;
+                       break;
+               }
+       }
+       mutex_exit(&ssh->lock);
+
+       return (error);
+}
+
+/*
+ * Set txg IO stats.
+ */
+int
+spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread,
+    uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t nreserved)
+{
+       spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
+       spa_txg_history_t *sth;
+       int error = ENOENT;
+
+       if (zfs_txg_history == 0)
+               return (0);
+
+       mutex_enter(&ssh->lock);
+       for (sth = list_head(&ssh->list); sth != NULL;
+            sth = list_next(&ssh->list, sth)) {
+               if (sth->txg == txg) {
+                       sth->nread = nread;
+                       sth->nwritten = nwritten;
+                       sth->reads = reads;
+                       sth->writes = writes;
+                       sth->nreserved = nreserved;
+                       error = 0;
+                       break;
+               }
+       }
+       mutex_exit(&ssh->lock);
+
+       return (error);
+}
+
 void
 spa_stats_init(spa_t *spa)
 {
        spa_read_history_init(spa);
+       spa_txg_history_init(spa);
 }
 
 void
 spa_stats_destroy(spa_t *spa)
 {
+       spa_txg_history_destroy(spa);
        spa_read_history_destroy(spa);
 }
 
@@ -245,4 +524,7 @@ MODULE_PARM_DESC(zfs_read_history, "Historic statistics for the last N reads");
 
 module_param(zfs_read_history_hits, int, 0644);
 MODULE_PARM_DESC(zfs_read_history_hits, "Include cache hits in read history");
+
+module_param(zfs_txg_history, int, 0644);
+MODULE_PARM_DESC(zfs_txg_history, "Historic statistics for the last N txgs");
 #endif
index 440353a50e2eb62d954d5c6d2709dd3f5ba9cae6..7a3da8647d17e352e5cfbbd273f4ae296d8b9b0d 100644 (file)
@@ -27,6 +27,7 @@
 #include <sys/zfs_context.h>
 #include <sys/txg_impl.h>
 #include <sys/dmu_impl.h>
+#include <sys/spa_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_scan.h>
@@ -363,6 +364,9 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
        ASSERT(txg == tx->tx_open_txg);
        tx->tx_open_txg++;
 
+       spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, gethrtime());
+       spa_txg_history_add(dp->dp_spa, tx->tx_open_txg);
+
        /*
         * Now that we've incremented tx_open_txg, we can let threads
         * enter the next transaction group.
@@ -380,6 +384,8 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
                        cv_wait(&tc->tc_cv[g], &tc->tc_lock);
                mutex_exit(&tc->tc_lock);
        }
+
+       spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_QUIESCED, gethrtime());
 }
 
 static void
@@ -451,6 +457,7 @@ txg_sync_thread(dsl_pool_t *dp)
        spa_t *spa = dp->dp_spa;
        tx_state_t *tx = &dp->dp_tx;
        callb_cpr_t cpr;
+       vdev_stat_t *vs1, *vs2;
        uint64_t start, delta;
 
 #ifdef _KERNEL
@@ -464,6 +471,9 @@ txg_sync_thread(dsl_pool_t *dp)
 
        txg_thread_enter(tx, &cpr);
 
+       vs1 = kmem_alloc(sizeof(vdev_stat_t), KM_PUSHPAGE);
+       vs2 = kmem_alloc(sizeof(vdev_stat_t), KM_PUSHPAGE);
+
        start = delta = 0;
        for (;;) {
                uint64_t timer, timeout;
@@ -499,8 +509,13 @@ txg_sync_thread(dsl_pool_t *dp)
                        txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
                }
 
-               if (tx->tx_exiting)
+               if (tx->tx_exiting) {
+                       kmem_free(vs2, sizeof(vdev_stat_t));
+                       kmem_free(vs1, sizeof(vdev_stat_t));
                        txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
+               }
+
+               vdev_get_stats(spa->spa_root_vdev, vs1);
 
                /*
                 * Consume the quiesced txg which has been handed off to
@@ -529,6 +544,16 @@ txg_sync_thread(dsl_pool_t *dp)
                 * Dispatch commit callbacks to worker threads.
                 */
                txg_dispatch_callbacks(dp, txg);
+
+               vdev_get_stats(spa->spa_root_vdev, vs2);
+               spa_txg_history_set_io(spa, txg,
+                   vs2->vs_bytes[ZIO_TYPE_READ]-vs1->vs_bytes[ZIO_TYPE_READ],
+                   vs2->vs_bytes[ZIO_TYPE_WRITE]-vs1->vs_bytes[ZIO_TYPE_WRITE],
+                   vs2->vs_ops[ZIO_TYPE_READ]-vs1->vs_ops[ZIO_TYPE_READ],
+                   vs2->vs_ops[ZIO_TYPE_WRITE]-vs1->vs_ops[ZIO_TYPE_WRITE],
+                   dp->dp_space_towrite[txg & TXG_MASK] +
+                   dp->dp_tempreserved[txg & TXG_MASK] / 2);
+               spa_txg_history_set(spa, txg, TXG_STATE_SYNCED, gethrtime());
        }
 }
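
As a closing illustration, a minimal userspace consumer of the new
kstat might look like the sketch below. This is not part of the
commit; the pool name "tank" and the /proc/spl/kstat path are
assumptions, and the column order simply mirrors what
spa_txg_history_data() emits:

#include <stdio.h>

int
main(void)
{
	unsigned long long txg, birth, nreserved, nread, nwritten;
	unsigned long long reads, writes, otime, qtime, stime;
	char line[256], state;
	FILE *fp;

	fp = fopen("/proc/spl/kstat/zfs/tank/txgs", "r");
	if (fp == NULL) {
		perror("fopen");
		return (1);
	}

	/* Skip the header line produced by spa_txg_history_headers(). */
	if (fgets(line, sizeof (line), fp) != NULL) {
		/* One row per tracked txg; all times are in nanoseconds. */
		while (fscanf(fp, "%llu %llu %c %llu %llu %llu %llu %llu "
		    "%llu %llu %llu", &txg, &birth, &state, &nreserved,
		    &nread, &nwritten, &reads, &writes, &otime, &qtime,
		    &stime) == 11) {
			printf("txg %llu (%c): open %llu ns, quiesce %llu "
			    "ns, sync %llu ns\n", txg, state, otime,
			    qtime, stime);
		}
	}

	(void) fclose(fp);
	return (0);
}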