From 99573cc05339ee04e80eb49c68f62d310da92ec7 Mon Sep 17 00:00:00 2001 From: Prakash Surya Date: Tue, 1 Oct 2019 12:33:12 -0700 Subject: [PATCH] Timeout waiting for ZVOL device to be created We've seen cases where after creating a ZVOL, the ZVOL device node in "/dev" isn't generated after 20 seconds of waiting, which is the point at which our applications gives up on waiting and reports an error. The workload when this occurs is to "refresh" 400+ ZVOLs roughly at the same time, based on a policy set by the user. This refresh operation will destroy the ZVOL, and re-create it based on a snapshot. When this occurs, we see many hundreds of entries on the "z_zvol" taskq (based on inspection of the /proc/spl/taskq-all file). Many of the entries on the taskq end up in the "zvol_remove_minors_impl" function, and I've measured the latency of that function: Function = zvol_remove_minors_impl msecs : count distribution 0 -> 1 : 0 | | 2 -> 3 : 0 | | 4 -> 7 : 1 | | 8 -> 15 : 0 | | 16 -> 31 : 0 | | 32 -> 63 : 0 | | 64 -> 127 : 1 | | 128 -> 255 : 45 |****************************************| 256 -> 511 : 5 |**** | That data is from a 10 second sample, using the BCC "funclatency" tool. As we can see, in this 10 second sample, most calls took 128ms at a minimum. Thus, some basic math tells us that in any 20 second interval, we could only process at most about 150 removals, which is much less than the 400+ that'll occur based on the workload. As a result of this, and since all ZVOL minor operations will go through the single threaded "z_zvol" taskq, the latency for creating a single ZVOL device can be unreasonably large due to other ZVOL activity on the system. In our case, it's large enough to cause the application to generate an error and fail the operation. When profiling the "zvol_remove_minors_impl" function, I saw that most of the time in the function was spent off-cpu, blocked in the function "taskq_wait_outstanding". How this works, is "zvol_remove_minors_impl" will dispatch calls to "zvol_free" using the "system_taskq", and then the "taskq_wait_outstanding" function is used to wait for all of those dispatched calls to occur before "zvol_remove_minors_impl" will return. As far as I can tell, "zvol_remove_minors_impl" doesn't necessarily have to wait for all calls to "zvol_free" to occur before it returns. Thus, this change removes the call to "taskq_wait_oustanding", so that calls to "zvol_free" don't affect the latency of "zvol_remove_minors_impl". Reviewed-by: Brian Behlendorf Reviewed-by: John Gallagher Signed-off-by: Prakash Surya Closes #9380 --- module/os/linux/zfs/zvol_os.c | 13 +++++++++++++ module/zfs/zvol.c | 7 +------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index b89028dc3..2cb0cfca7 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -845,6 +845,11 @@ out_kmem: * At this time, the structure is not opened by anyone, is taken off * the zvol_state_list, and has its private data set to NULL. * The zvol_state_lock is dropped. + * + * This function may take many milliseconds to complete (e.g. we've seen + * it take over 256ms), due to the calls to "blk_cleanup_queue" and + * "del_gendisk". Thus, consumers need to be careful to account for this + * latency when calling this function. */ static void zvol_free(zvol_state_t *zv) @@ -1089,6 +1094,14 @@ zvol_fini(void) { zvol_remove_minors_impl(NULL); + /* + * The call to "zvol_remove_minors_impl" may dispatch entries to + * the system_taskq, but it doesn't wait for those entires to + * complete before it returns. Thus, we must wait for all of the + * removals to finish, before we can continue. + */ + taskq_wait_outstanding(system_taskq, 0); + zvol_fini_impl(); blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS); unregister_blkdev(zvol_major, ZVOL_DRIVER); diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 9b26084bb..e76a1c61b 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -1170,7 +1170,7 @@ zvol_remove_minors_impl(const char *name) { zvol_state_t *zv, *zv_next; int namelen = ((name) ? strlen(name) : 0); - taskqid_t t, tid = TASKQID_INVALID; + taskqid_t t; list_t free_list; if (zvol_inhibit_dev) @@ -1217,8 +1217,6 @@ zvol_remove_minors_impl(const char *name) (task_func_t *)ops->zv_free, zv, TQ_SLEEP); if (t == TASKQID_INVALID) list_insert_head(&free_list, zv); - else - tid = t; } else { mutex_exit(&zv->zv_state_lock); } @@ -1230,9 +1228,6 @@ zvol_remove_minors_impl(const char *name) list_remove(&free_list, zv); ops->zv_free(zv); } - - if (tid != TASKQID_INVALID) - taskq_wait_outstanding(system_taskq, tid); } /* Remove minor for this specific volume only */ -- 2.40.0