granicus.if.org Git - zfs/commitdiff
Fix random ztest_deadman_thread failures
author Tom Caputi <tcaputi@datto.com>
Wed, 10 Oct 2018 20:48:33 +0000 (16:48 -0400)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Wed, 24 Oct 2018 21:36:21 +0000 (14:36 -0700)
The zloop test has been failing in buildbot for the last few weeks
with various failures in ztest_deadman_thread(). This is due to the
fact that this thread is not stopped when performing pool import /
export tests as it should be. This patch simply corrects this.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #8010

cmd/ztest/ztest.c
module/zfs/spa_misc.c

index f277312c663ba1982cb62c60d8bdea6c75054fe4..47f3cdf126c17a20870c3cb1c8cace6a71dbf5b9 100644 (file)
@@ -6495,13 +6495,20 @@ ztest_deadman_thread(void *arg)
 {
        ztest_shared_t *zs = arg;
        spa_t *spa = ztest_spa;
-       hrtime_t delta, overdue, total = 0;
+       hrtime_t delay, overdue, last_run = gethrtime();
 
-       for (;;) {
-               delta = zs->zs_thread_stop - zs->zs_thread_start +
-                   MSEC2NSEC(zfs_deadman_synctime_ms);
+       delay = (zs->zs_thread_stop - zs->zs_thread_start) +
+           MSEC2NSEC(zfs_deadman_synctime_ms);
 
-               (void) poll(NULL, 0, (int)NSEC2MSEC(delta));
+       while (!ztest_exiting) {
+               /*
+                * Wait for the delay timer while checking occasionally
+                * if we should stop.
+                */
+               if (gethrtime() < last_run + delay) {
+                       (void) poll(NULL, 0, 1000);
+                       continue;
+               }
 
                /*
                 * If the pool is suspended then fail immediately. Otherwise,
@@ -6522,15 +6529,20 @@ ztest_deadman_thread(void *arg)
                 * then it may be hung and is terminated.
                 */
                overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms);
-               total += zfs_deadman_synctime_ms / 1000;
                if (gethrtime() > overdue) {
                        fatal(0, "aborting test after %llu seconds because "
-                           "the process is overdue for termination.", total);
+                           "the process is overdue for termination.",
+                           (gethrtime() - zs->zs_proc_start) / NANOSEC);
                }
 
                (void) printf("ztest has been running for %lld seconds\n",
-                   total);
+                   (gethrtime() - zs->zs_proc_start) / NANOSEC);
+
+               last_run = gethrtime();
+               delay = MSEC2NSEC(zfs_deadman_checktime_ms);
        }
+
+       thread_exit();
 }
 
 static void
@@ -6724,7 +6736,7 @@ ztest_run(ztest_shared_t *zs)
 {
        spa_t *spa;
        objset_t *os;
-       kthread_t *resume_thread;
+       kthread_t *resume_thread, *deadman_thread;
        kthread_t **run_threads;
        uint64_t object;
        int error;
@@ -6782,7 +6794,7 @@ ztest_run(ztest_shared_t *zs)
        /*
         * Create a deadman thread and set to panic if we hang.
         */
-       (void) thread_create(NULL, 0, ztest_deadman_thread,
+       deadman_thread = thread_create(NULL, 0, ztest_deadman_thread,
            zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri);
 
        spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC;
@@ -6849,9 +6861,10 @@ ztest_run(ztest_shared_t *zs)
 
        umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *));
 
-       /* Kill the resume thread */
+       /* Kill the resume and deadman threads */
        ztest_exiting = B_TRUE;
        VERIFY0(thread_join(resume_thread));
+       VERIFY0(thread_join(deadman_thread));
        ztest_resume(spa);
 
        /*
@@ -7351,6 +7364,7 @@ main(int argc, char **argv)
 
        dprintf_setup(&argc, argv);
        zfs_deadman_synctime_ms = 300000;
+       zfs_deadman_checktime_ms = 30000;
        /*
         * As two-word space map entries may not come up often (especially
         * if pool and vdev sizes are small) we want to force at least some
index ae9eb4de7be7c02385a39962183bd7d766dc2301..a3ac70f07ae28e035cafaeae4ab95bbd388d15b1 100644 (file)
@@ -312,7 +312,7 @@ unsigned long zfs_deadman_ziotime_ms = 300000ULL;
  * Check time in milliseconds. This defines the frequency at which we check
  * for hung I/O.
  */
-unsigned long  zfs_deadman_checktime_ms = 60000ULL;
+unsigned long zfs_deadman_checktime_ms = 60000ULL;
 
 /*
  * By default the deadman is enabled.