Linux 5.0 compat: SIMD compatibility
author    Brian Behlendorf <behlendorf1@llnl.gov>
Fri, 12 Jul 2019 16:31:20 +0000 (09:31 -0700)
committer GitHub <noreply@github.com>
Fri, 12 Jul 2019 16:31:20 +0000 (09:31 -0700)
Restore the SIMD optimization for 4.19.38 LTS, 4.14.120 LTS,
and 5.0 and newer kernels.  This is accomplished by leveraging
the fact that by definition dedicated kernel threads never need
to concern themselves with saving and restoring the user FPU state.
Therefore, they may use the FPU as long as we can guarantee user
tasks always restore their FPU state before context switching back
to user space.
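
In outline, the context check added as kfpu_allowed() in
include/linux/simd_x86.h (full diff below) reduces to a
kernel-thread test combined with the kernel's per-task FPU
tracking:

  #if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
  #define kfpu_allowed()    ((current->flags & PF_KTHREAD) && \
                              test_thread_flag(TIF_NEED_FPU_LOAD))
  #elif defined(HAVE_KERNEL_FPU_INITIALIZED)
  #define kfpu_allowed()    ((current->flags & PF_KTHREAD) && \
                              current->thread.fpu.initialized)
  #else
  #define kfpu_allowed()    0
  #endif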

For the 5.0 and 5.1 kernels, disabling preemption and local
interrupts is sufficient to allow the FPU to be used.  All
non-kernel threads will restore the preserved user FPU state.
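
A minimal sketch of the resulting critical section (condensed from
kfpu_begin()/kfpu_end() in include/linux/simd_x86.h below, with the
sanity checks and the 5.2+ register handling omitted):

  static inline void
  kfpu_begin(void)
  {
          /* The FPU may only be touched with preemption and IRQs off. */
          preempt_disable();
          local_irq_disable();
  }

  static inline void
  kfpu_end(void)
  {
          local_irq_enable();
          preempt_enable();
  }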

For 5.2 and later kernels the user FPU state restoration will be
skipped if the kernel determines the registers have not changed.
Therefore, for these kernels we need to perform the additional
step of saving and restoring the FPU registers.  Invalidating the
per-cpu global that tracks the FPU state would force a restore,
but that functionality is private to the core x86 FPU
implementation and unavailable.
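
Condensed from the HAVE_KERNEL_TIF_NEED_FPU_LOAD branches of
kfpu_begin()/kfpu_end() in simd_x86.h below; only the xsave
restore path is shown:

  static inline void
  kfpu_begin(void)
  {
          preempt_disable();
          local_irq_disable();
          /* Preserve the task's registers so kfpu_end() can restore them. */
          copy_fpregs_to_fpstate(&current->thread.fpu);
  }

  static inline void
  kfpu_end(void)
  {
          /* Restore the registers preserved by kfpu_begin(). */
          copy_kernel_to_xregs_err(&current->thread.fpu.state.xsave, -1);
          local_irq_enable();
          preempt_enable();
  }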

In practice, limiting SIMD to kernel threads is not a major
restriction for ZFS.  The vast majority of SIMD operations are
already performed by the IO pipeline.  The remaining cases are
relatively infrequent and can be handled by the generic code
without significant impact.  The two most noteworthy cases are:

  1) Decrypting the wrapping key for an encrypted dataset,
     i.e. `zfs load-key`.  All other encryption and decryption
     operations will use the SIMD optimized implementations.

  2) Generating the payload checksums for a `zfs send` stream.

In order to avoid making any changes to the higher layers of ZFS,
all of the `*_get_ops()` functions were updated to take the
calling context into consideration.  This allows the fastest
implementation to be used as appropriate (see kfpu_allowed()).
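
For example, the AES selector now short-circuits to the generic
implementation whenever SIMD is not permitted; the GCM, Fletcher 4,
and RAIDZ selectors follow the same pattern (see the respective
hunks below):

  const aes_impl_ops_t *
  aes_impl_get_ops(void)
  {
          /* SIMD is not allowed in this context, use the generic code. */
          if (!kfpu_allowed())
                  return (&aes_generic_impl);

          /* ... otherwise select the fastest/cycle/user-chosen ops ... */
  }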

The only other notable instance of SIMD operations being used
outside a kernel thread was at module load time.  This code was
moved into a taskq in order to accommodate the new kernel thread
restriction.
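
The dispatch pattern used at module load (see aes_mod_init(),
fletcher_4_init(), and vdev_raidz_math_init() below) falls back
to a direct call when the taskq dispatch fails:

  taskqid_t id = taskq_dispatch(system_taskq, aes_impl_init,
      NULL, TQ_SLEEP);
  if (id != TASKQID_INVALID)
          taskq_wait_id(system_taskq, id);
  else
          aes_impl_init(NULL);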

Finally, a few other modifications were made in order to further
harden this code and facilitate testing.  They include declaring
each implementation's operations structure as a constant, and
allowing "cycle" to be set when selecting the preferred ops in
the kernel as well as in user space.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #8754
Closes #8793
Closes #8965

30 files changed:
cmd/ztest/ztest.c
config/kernel-fpu.m4
include/linux/Makefile.am
include/linux/simd.h [new file with mode: 0644]
include/linux/simd_aarch64.h
include/linux/simd_x86.h
include/sys/vdev_raidz.h
include/sys/vdev_raidz_impl.h
module/icp/algs/aes/aes_impl.c
module/icp/algs/aes/aes_impl_aesni.c
module/icp/algs/modes/gcm.c
module/icp/algs/modes/gcm_pclmulqdq.c
module/icp/include/aes/aes_impl.h
module/icp/include/modes/gcm_impl.h
module/icp/io/aes.c
module/spl/spl-taskq.c
module/spl/spl-thread.c
module/zcommon/zfs_fletcher.c
module/zcommon/zfs_fletcher_aarch64_neon.c
module/zcommon/zfs_fletcher_avx512.c
module/zcommon/zfs_fletcher_intel.c
module/zcommon/zfs_fletcher_sse.c
module/zfs/vdev_raidz_math.c
module/zfs/vdev_raidz_math_aarch64_neon.c
module/zfs/vdev_raidz_math_aarch64_neonx2.c
module/zfs/vdev_raidz_math_avx2.c
module/zfs/vdev_raidz_math_avx512bw.c
module/zfs/vdev_raidz_math_avx512f.c
module/zfs/vdev_raidz_math_sse2.c
module/zfs/vdev_raidz_math_ssse3.c

diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index a6820dede11380beda84c03023b9f8dd37281512..f67d94fa8302d5415fa87c80be74a9daa12056ac 100644 (file)
 #include <sys/vdev_impl.h>
 #include <sys/vdev_file.h>
 #include <sys/vdev_initialize.h>
+#include <sys/vdev_raidz.h>
 #include <sys/vdev_trim.h>
 #include <sys/spa_impl.h>
 #include <sys/metaslab_impl.h>
@@ -6934,6 +6935,8 @@ ztest_run(ztest_shared_t *zs)
        metaslab_preload_limit = ztest_random(20) + 1;
        ztest_spa = spa;
 
+       VERIFY0(vdev_raidz_impl_set("cycle"));
+
        dmu_objset_stats_t dds;
        VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool,
            DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os));
diff --git a/config/kernel-fpu.m4 b/config/kernel-fpu.m4
index 5fff79a74c708c34ef0231597717b0c5dee6472e..31bf35f837ed74f577bc523447f42cff6a1c23b1 100644 (file)
@@ -2,8 +2,15 @@ dnl #
 dnl # Handle differences in kernel FPU code.
 dnl #
 dnl # Kernel
-dnl # 5.0:     All kernel fpu functions are GPL only, so we can't use them.
-dnl #          (nothing defined)
+dnl # 5.2:     The fpu->initialized flag was replaced by TIF_NEED_FPU_LOAD.
+dnl #          HAVE_KERNEL_TIF_NEED_FPU_LOAD
+dnl #
+dnl # 5.0:     As an optimization SIMD operations performed by kernel
+dnl #          threads can skip saving and restoring their FPU context.
+dnl #          Wrappers have been introduced to determine the running
+dnl #          context and use either the SIMD or generic implementation.
+dnl #          This change was made to the 4.19.38 and 4.14.120 LTS kernels.
+dnl #          HAVE_KERNEL_FPU_INITIALIZED
 dnl #
 dnl # 4.2:     Use __kernel_fpu_{begin,end}()
 dnl #          HAVE_UNDERSCORE_KERNEL_FPU & KERNEL_EXPORTS_X86_FPU
@@ -56,10 +63,39 @@ AC_DEFUN([ZFS_AC_KERNEL_FPU], [
                        __kernel_fpu_end();
                ], [__kernel_fpu_begin], [arch/x86/kernel/fpu/core.c arch/x86/kernel/i387.c], [
                        AC_MSG_RESULT(__kernel_fpu_*)
-                       AC_DEFINE(HAVE_UNDERSCORE_KERNEL_FPU, 1, [kernel has __kernel_fpu_* functions])
-                       AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1, [kernel exports FPU functions])
+                       AC_DEFINE(HAVE_UNDERSCORE_KERNEL_FPU, 1,
+                           [kernel has __kernel_fpu_* functions])
+                       AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1,
+                           [kernel exports FPU functions])
                ],[
-                       AC_MSG_RESULT(not exported)
+                       ZFS_LINUX_TRY_COMPILE([
+                               #include <linux/module.h>
+                               #include <linux/sched.h>
+                       ],[
+                               struct fpu *fpu = &current->thread.fpu;
+                               if (fpu->initialized) { return (0); };
+                       ],[
+                               AC_MSG_RESULT(fpu.initialized)
+                               AC_DEFINE(HAVE_KERNEL_FPU_INITIALIZED, 1,
+                                   [kernel fpu.initialized exists])
+                       ],[
+                               ZFS_LINUX_TRY_COMPILE([
+                                       #include <linux/module.h>
+                                       #include <asm/thread_info.h>
+
+                                       #if !defined(TIF_NEED_FPU_LOAD)
+                                       #error "TIF_NEED_FPU_LOAD undefined"
+                                       #endif
+                               ],[
+                               ],[
+                                       AC_MSG_RESULT(TIF_NEED_FPU_LOAD)
+                                       AC_DEFINE(
+                                           HAVE_KERNEL_TIF_NEED_FPU_LOAD, 1,
+                                           [kernel TIF_NEED_FPU_LOAD exists])
+                               ],[
+                                       AC_MSG_RESULT(unavailable)
+                               ])
+                       ])
                ])
        ])
 ])
diff --git a/include/linux/Makefile.am b/include/linux/Makefile.am
index efb49520e63cb99d8f7d0b73c72649753578dfeb..2455759e8138dda27029dfa8d963e76ddc0241d1 100644 (file)
@@ -7,6 +7,7 @@ KERNEL_H = \
        $(top_srcdir)/include/linux/blkdev_compat.h \
        $(top_srcdir)/include/linux/utsname_compat.h \
        $(top_srcdir)/include/linux/kmap_compat.h \
+       $(top_srcdir)/include/linux/simd.h \
        $(top_srcdir)/include/linux/simd_x86.h \
        $(top_srcdir)/include/linux/simd_aarch64.h \
        $(top_srcdir)/include/linux/mod_compat.h \
diff --git a/include/linux/simd.h b/include/linux/simd.h
new file mode 100644 (file)
index 0000000..d2b6099
--- /dev/null
@@ -0,0 +1,41 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2019 Lawrence Livermore National Security, LLC.
+ */
+
+#ifndef _SIMD_H
+#define        _SIMD_H
+
+#if defined(__x86)
+#include <linux/simd_x86.h>
+
+#elif defined(__aarch64__)
+#include <linux/simd_aarch64.h>
+#else
+
+#define        kfpu_allowed()          1
+#define        kfpu_initialize(tsk)    do {} while (0)
+#define        kfpu_begin()            do {} while (0)
+#define        kfpu_end()              do {} while (0)
+
+#endif
+#endif /* _SIMD_H */
diff --git a/include/linux/simd_aarch64.h b/include/linux/simd_aarch64.h
index 155ef62055993eba7ef67040683fa2bb7519acb2..1cfcd01e45eaf812a2b39bf95e6022faf635bc8d 100644 (file)
 
 #if defined(_KERNEL)
 #include <asm/neon.h>
-#define        kfpu_begin()            \
-{                                      \
-       kernel_neon_begin();            \
-}
-#define        kfpu_end()                      \
-{                                      \
-       kernel_neon_end();              \
-}
+#define        kfpu_allowed()          1
+#define        kfpu_initialize(tsk)    do {} while (0)
+#define        kfpu_begin()            kernel_neon_begin()
+#define        kfpu_end()              kernel_neon_end()
 #else
 /*
  * fpu dummy methods for userspace
  */
-#define        kfpu_begin()    do {} while (0)
-#define        kfpu_end()              do {} while (0)
+#define        kfpu_allowed()          1
+#define        kfpu_initialize(tsk)    do {} while (0)
+#define        kfpu_begin()            do {} while (0)
+#define        kfpu_end()              do {} while (0)
 #endif /* defined(_KERNEL) */
 
 #endif /* __aarch64__ */
diff --git a/include/linux/simd_x86.h b/include/linux/simd_x86.h
index 12cd7467788eb174af5437edf25458c2f349077b..2d7a1c3a5209ed1864b5a619ff09c9531cea2991 100644 (file)
 #include <asm/xcr.h>
 #endif
 
+/*
+ * The following cases are for kernels which export either the
+ * kernel_fpu_* or __kernel_fpu_* functions.
+ */
+#if defined(KERNEL_EXPORTS_X86_FPU)
+
+#define        kfpu_allowed()          1
+#define        kfpu_initialize(tsk)    do {} while (0)
+
 #if defined(HAVE_UNDERSCORE_KERNEL_FPU)
 #define        kfpu_begin()            \
-{                                                      \
-       preempt_disable();              \
+{                              \
+       preempt_disable();      \
        __kernel_fpu_begin();   \
 }
-#define        kfpu_end()                      \
-{                                                      \
-       __kernel_fpu_end();             \
-       preempt_enable();               \
+#define        kfpu_end()              \
+{                              \
+       __kernel_fpu_end();     \
+       preempt_enable();       \
 }
+
 #elif defined(HAVE_KERNEL_FPU)
-#define        kfpu_begin()    kernel_fpu_begin()
+#define        kfpu_begin()            kernel_fpu_begin()
 #define        kfpu_end()              kernel_fpu_end()
+
 #else
-/* Kernel doesn't export any kernel_fpu_* functions */
-#include <asm/fpu/internal.h>  /* For kernel xgetbv() */
-#define        kfpu_begin()    panic("This code should never run")
-#define        kfpu_end()      panic("This code should never run")
-#endif /* defined(HAVE_KERNEL_FPU) */
+/*
+ * This case is unreachable.  When KERNEL_EXPORTS_X86_FPU is defined then
+ * either HAVE_UNDERSCORE_KERNEL_FPU or HAVE_KERNEL_FPU must be defined.
+ */
+#error "Unreachable kernel configuration"
+#endif
+
+#else /* defined(KERNEL_EXPORTS_X86_FPU) */
+/*
+ * When the kernel_fpu_* symbols are unavailable then provide our own
+ * versions which allow the FPU to be safely used in kernel threads.
+ * In practice, this is not a significant restriction for ZFS since the
+ * vast majority of SIMD operations are performed by the IO pipeline.
+ */
 
+/*
+ * Returns non-zero if FPU operations are allowed in the current context.
+ */
+#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
+#define        kfpu_allowed()          ((current->flags & PF_KTHREAD) && \
+                               test_thread_flag(TIF_NEED_FPU_LOAD))
+#elif defined(HAVE_KERNEL_FPU_INITIALIZED)
+#define        kfpu_allowed()          ((current->flags & PF_KTHREAD) && \
+                               current->thread.fpu.initialized)
 #else
+#define        kfpu_allowed()          0
+#endif
+
+static inline void
+kfpu_initialize(void)
+{
+       WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
+
+#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
+       __fpu_invalidate_fpregs_state(&current->thread.fpu);
+       set_thread_flag(TIF_NEED_FPU_LOAD);
+#elif defined(HAVE_KERNEL_FPU_INITIALIZED)
+       __fpu_invalidate_fpregs_state(&current->thread.fpu);
+       current->thread.fpu.initialized = 1;
+#endif
+}
+
+static inline void
+kfpu_begin(void)
+{
+       WARN_ON_ONCE(!kfpu_allowed());
+
+       /*
+        * Preemption and interrupts must be disabled for the critical
+        * region where the FPU state is being modified.
+        */
+       preempt_disable();
+       local_irq_disable();
+
+#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
+       /*
+        * The current FPU registers need to be preserved by kfpu_begin()
+        * and restored by kfpu_end().  This is required because we can
+        * not call __cpu_invalidate_fpregs_state() to invalidate the
+        * per-cpu FPU state and force them to be restored during a
+        * context switch.
+        */
+       copy_fpregs_to_fpstate(&current->thread.fpu);
+#elif defined(HAVE_KERNEL_FPU_INITIALIZED)
+       /*
+        * There is no need to preserve and restore the FPU registers.
+        * They will always be restored from the task's stored FPU state
+        * when switching contexts.
+        */
+       WARN_ON_ONCE(current->thread.fpu.initialized == 0);
+#endif
+}
+
+static inline void
+kfpu_end(void)
+{
+#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
+       union fpregs_state *state = &current->thread.fpu.state;
+       int error;
+
+       if (use_xsave()) {
+               error = copy_kernel_to_xregs_err(&state->xsave, -1);
+       } else if (use_fxsr()) {
+               error = copy_kernel_to_fxregs_err(&state->fxsave);
+       } else {
+               error = copy_kernel_to_fregs_err(&state->fsave);
+       }
+       WARN_ON_ONCE(error);
+#endif
+
+       local_irq_enable();
+       preempt_enable();
+}
+#endif /* defined(HAVE_KERNEL_FPU) */
+
+#else /* defined(_KERNEL) */
 /*
- * fpu dummy methods for userspace
+ * FPU dummy methods for user space.
  */
-#define        kfpu_begin()    do {} while (0)
-#define        kfpu_end()              do {} while (0)
+#define        kfpu_allowed()          1
+#define        kfpu_initialize(tsk)    do {} while (0)
+#define        kfpu_begin()            do {} while (0)
+#define        kfpu_end()              do {} while (0)
 #endif /* defined(_KERNEL) */
 
 /*
@@ -298,7 +400,7 @@ __simd_state_enabled(const uint64_t state)
        uint64_t xcr0;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_OSXSAVE) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_OSXSAVE)
        has_osxsave = !!boot_cpu_has(X86_FEATURE_OSXSAVE);
 #else
        has_osxsave = B_FALSE;
@@ -328,11 +430,7 @@ static inline boolean_t
 zfs_sse_available(void)
 {
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
        return (!!boot_cpu_has(X86_FEATURE_XMM));
-#else
-       return (B_FALSE);
-#endif
 #elif !defined(_KERNEL)
        return (__cpuid_has_sse());
 #endif
@@ -345,11 +443,7 @@ static inline boolean_t
 zfs_sse2_available(void)
 {
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
        return (!!boot_cpu_has(X86_FEATURE_XMM2));
-#else
-       return (B_FALSE);
-#endif
 #elif !defined(_KERNEL)
        return (__cpuid_has_sse2());
 #endif
@@ -362,11 +456,7 @@ static inline boolean_t
 zfs_sse3_available(void)
 {
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
        return (!!boot_cpu_has(X86_FEATURE_XMM3));
-#else
-       return (B_FALSE);
-#endif
 #elif !defined(_KERNEL)
        return (__cpuid_has_sse3());
 #endif
@@ -379,11 +469,7 @@ static inline boolean_t
 zfs_ssse3_available(void)
 {
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
        return (!!boot_cpu_has(X86_FEATURE_SSSE3));
-#else
-       return (B_FALSE);
-#endif
 #elif !defined(_KERNEL)
        return (__cpuid_has_ssse3());
 #endif
@@ -396,11 +482,7 @@ static inline boolean_t
 zfs_sse4_1_available(void)
 {
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
        return (!!boot_cpu_has(X86_FEATURE_XMM4_1));
-#else
-       return (B_FALSE);
-#endif
 #elif !defined(_KERNEL)
        return (__cpuid_has_sse4_1());
 #endif
@@ -413,11 +495,7 @@ static inline boolean_t
 zfs_sse4_2_available(void)
 {
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
        return (!!boot_cpu_has(X86_FEATURE_XMM4_2));
-#else
-       return (B_FALSE);
-#endif
 #elif !defined(_KERNEL)
        return (__cpuid_has_sse4_2());
 #endif
@@ -431,11 +509,7 @@ zfs_avx_available(void)
 {
        boolean_t has_avx;
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
        has_avx = !!boot_cpu_has(X86_FEATURE_AVX);
-#else
-       has_avx = B_FALSE;
-#endif
 #elif !defined(_KERNEL)
        has_avx = __cpuid_has_avx();
 #endif
@@ -451,11 +525,7 @@ zfs_avx2_available(void)
 {
        boolean_t has_avx2;
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX2) && defined(KERNEL_EXPORTS_X86_FPU)
        has_avx2 = !!boot_cpu_has(X86_FEATURE_AVX2);
-#else
-       has_avx2 = B_FALSE;
-#endif
 #elif !defined(_KERNEL)
        has_avx2 = __cpuid_has_avx2();
 #endif
@@ -470,7 +540,7 @@ static inline boolean_t
 zfs_bmi1_available(void)
 {
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_BMI1) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_BMI1)
        return (!!boot_cpu_has(X86_FEATURE_BMI1));
 #else
        return (B_FALSE);
@@ -487,7 +557,7 @@ static inline boolean_t
 zfs_bmi2_available(void)
 {
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_BMI2) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_BMI2)
        return (!!boot_cpu_has(X86_FEATURE_BMI2));
 #else
        return (B_FALSE);
@@ -504,7 +574,7 @@ static inline boolean_t
 zfs_aes_available(void)
 {
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AES) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AES)
        return (!!boot_cpu_has(X86_FEATURE_AES));
 #else
        return (B_FALSE);
@@ -521,7 +591,7 @@ static inline boolean_t
 zfs_pclmulqdq_available(void)
 {
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_PCLMULQDQ) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_PCLMULQDQ)
        return (!!boot_cpu_has(X86_FEATURE_PCLMULQDQ));
 #else
        return (B_FALSE);
@@ -555,7 +625,7 @@ zfs_avx512f_available(void)
        boolean_t has_avx512 = B_FALSE;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512F) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512F)
        has_avx512 = !!boot_cpu_has(X86_FEATURE_AVX512F);
 #else
        has_avx512 = B_FALSE;
@@ -574,7 +644,7 @@ zfs_avx512cd_available(void)
        boolean_t has_avx512 = B_FALSE;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512CD) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512CD)
        has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
            boot_cpu_has(X86_FEATURE_AVX512CD);
 #else
@@ -594,7 +664,7 @@ zfs_avx512er_available(void)
        boolean_t has_avx512 = B_FALSE;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512ER) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512ER)
        has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
            boot_cpu_has(X86_FEATURE_AVX512ER);
 #else
@@ -614,7 +684,7 @@ zfs_avx512pf_available(void)
        boolean_t has_avx512 = B_FALSE;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512PF) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512PF)
        has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
            boot_cpu_has(X86_FEATURE_AVX512PF);
 #else
@@ -634,7 +704,7 @@ zfs_avx512bw_available(void)
        boolean_t has_avx512 = B_FALSE;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512BW) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512BW)
        has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
            boot_cpu_has(X86_FEATURE_AVX512BW);
 #else
@@ -654,7 +724,7 @@ zfs_avx512dq_available(void)
        boolean_t has_avx512 = B_FALSE;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512DQ) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512DQ)
        has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
            boot_cpu_has(X86_FEATURE_AVX512DQ);
 #else
@@ -674,7 +744,7 @@ zfs_avx512vl_available(void)
        boolean_t has_avx512 = B_FALSE;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512VL) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512VL)
        has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
            boot_cpu_has(X86_FEATURE_AVX512VL);
 #else
@@ -694,7 +764,7 @@ zfs_avx512ifma_available(void)
        boolean_t has_avx512 = B_FALSE;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512IFMA) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512IFMA)
        has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
            boot_cpu_has(X86_FEATURE_AVX512IFMA);
 #else
@@ -714,7 +784,7 @@ zfs_avx512vbmi_available(void)
        boolean_t has_avx512 = B_FALSE;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512VBMI) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512VBMI)
        has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
            boot_cpu_has(X86_FEATURE_AVX512VBMI);
 #else
diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h
index 2ce32469d4719e1689a51dbbd6e4bc653d92150c..0ce2b5ea1d67b0aa823652941c40192726967677 100644 (file)
@@ -51,7 +51,7 @@ int vdev_raidz_reconstruct(struct raidz_map *, const int *, int);
  */
 void vdev_raidz_math_init(void);
 void vdev_raidz_math_fini(void);
-struct raidz_impl_ops *vdev_raidz_math_get_ops(void);
+const struct raidz_impl_ops *vdev_raidz_math_get_ops(void);
 int vdev_raidz_math_generate(struct raidz_map *);
 int vdev_raidz_math_reconstruct(struct raidz_map *, const int *, const int *,
     const int);
diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h
index 0799ed19dfc88bf68d67c4e1b087584008571fd2..4969d110b8586485b47a42c0d8321e18ef9ad721 100644 (file)
@@ -126,7 +126,7 @@ typedef struct raidz_map {
        uintptr_t rm_reports;           /* # of referencing checksum reports */
        uint8_t rm_freed;               /* map no longer has referencing ZIO */
        uint8_t rm_ecksuminjected;      /* checksum error was injected */
-       raidz_impl_ops_t *rm_ops;       /* RAIDZ math operations */
+       const raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */
        raidz_col_t rm_col[1];          /* Flexible array of I/O columns */
 } raidz_map_t;
 
diff --git a/module/icp/algs/aes/aes_impl.c b/module/icp/algs/aes/aes_impl.c
index e1505063574108dce77e705b5f2fce7142e15c81..457b9e45c662e0a5a3e7d8e6d6a7e927996c8a9d 100644 (file)
@@ -27,6 +27,7 @@
 #include <sys/crypto/spi.h>
 #include <modes/modes.h>
 #include <aes/aes_impl.h>
+#include <linux/simd.h>
 
 /*
  * Initialize AES encryption and decryption key schedules.
@@ -40,9 +41,9 @@
 void
 aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits, void *keysched)
 {
-       aes_impl_ops_t  *ops = aes_impl_get_ops();
-       aes_key_t       *newbie = keysched;
-       uint_t          keysize, i, j;
+       const aes_impl_ops_t *ops = aes_impl_get_ops();
+       aes_key_t *newbie = keysched;
+       uint_t keysize, i, j;
        union {
                uint64_t        ka64[4];
                uint32_t        ka32[8];
@@ -252,12 +253,17 @@ static size_t aes_supp_impl_cnt = 0;
 static aes_impl_ops_t *aes_supp_impl[ARRAY_SIZE(aes_all_impl)];
 
 /*
- * Selects the aes operations for encrypt/decrypt/key setup
+ * Returns the AES operations for encrypt/decrypt/key setup.  When a
+ * SIMD implementation is not allowed in the current context, fall
+ * back to the fastest generic implementation.
  */
-aes_impl_ops_t *
-aes_impl_get_ops()
+const aes_impl_ops_t *
+aes_impl_get_ops(void)
 {
-       aes_impl_ops_t *ops = NULL;
+       if (!kfpu_allowed())
+               return (&aes_generic_impl);
+
+       const aes_impl_ops_t *ops = NULL;
        const uint32_t impl = AES_IMPL_READ(icp_aes_impl);
 
        switch (impl) {
@@ -266,15 +272,13 @@ aes_impl_get_ops()
                ops = &aes_fastest_impl;
                break;
        case IMPL_CYCLE:
-       {
+               /* Cycle through supported implementations */
                ASSERT(aes_impl_initialized);
                ASSERT3U(aes_supp_impl_cnt, >, 0);
-               /* Cycle through supported implementations */
                static size_t cycle_impl_idx = 0;
                size_t idx = (++cycle_impl_idx) % aes_supp_impl_cnt;
                ops = aes_supp_impl[idx];
-       }
-       break;
+               break;
        default:
                ASSERT3U(impl, <, aes_supp_impl_cnt);
                ASSERT3U(aes_supp_impl_cnt, >, 0);
@@ -288,13 +292,17 @@ aes_impl_get_ops()
        return (ops);
 }
 
+/*
+ * Initialize all supported implementations.
+ */
+/* ARGSUSED */
 void
-aes_impl_init(void)
+aes_impl_init(void *arg)
 {
        aes_impl_ops_t *curr_impl;
        int i, c;
 
-       /* move supported impl into aes_supp_impls */
+       /* Move supported implementations into aes_supp_impls */
        for (i = 0, c = 0; i < ARRAY_SIZE(aes_all_impl); i++) {
                curr_impl = (aes_impl_ops_t *)aes_all_impl[i];
 
diff --git a/module/icp/algs/aes/aes_impl_aesni.c b/module/icp/algs/aes/aes_impl_aesni.c
index 97f7c3a4781bfbe9c2ba42ec0bc9d68e75d9b727..222c176aabab4248b74ef8d7ca6b2fa6d8f1cb1b 100644 (file)
@@ -108,7 +108,7 @@ aes_aesni_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4],
 static boolean_t
 aes_aesni_will_work(void)
 {
-       return (zfs_aes_available());
+       return (kfpu_allowed() && zfs_aes_available());
 }
 
 const aes_impl_ops_t aes_aesni_impl = {
diff --git a/module/icp/algs/modes/gcm.c b/module/icp/algs/modes/gcm.c
index 13bceef0f1706111bd07f95a0cbaf191a191be7a..f6f8434de772c5c939152676d19ad7cd161f1d37 100644 (file)
@@ -29,6 +29,7 @@
 #include <sys/crypto/impl.h>
 #include <sys/byteorder.h>
 #include <modes/gcm_impl.h>
+#include <linux/simd.h>
 
 #define        GHASH(c, d, t, o) \
        xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
@@ -46,7 +47,7 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
     void (*copy_block)(uint8_t *, uint8_t *),
     void (*xor_block)(uint8_t *, uint8_t *))
 {
-       gcm_impl_ops_t *gops;
+       const gcm_impl_ops_t *gops;
        size_t remainder = length;
        size_t need = 0;
        uint8_t *datap = (uint8_t *)data;
@@ -168,7 +169,7 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
     void (*copy_block)(uint8_t *, uint8_t *),
     void (*xor_block)(uint8_t *, uint8_t *))
 {
-       gcm_impl_ops_t *gops;
+       const gcm_impl_ops_t *gops;
        uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
        uint8_t *ghash, *macp = NULL;
        int i, rv;
@@ -320,7 +321,7 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
     void (*xor_block)(uint8_t *, uint8_t *))
 {
-       gcm_impl_ops_t *gops;
+       const gcm_impl_ops_t *gops;
        size_t pt_len;
        size_t remainder;
        uint8_t *ghash;
@@ -427,7 +428,7 @@ gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len,
     void (*copy_block)(uint8_t *, uint8_t *),
     void (*xor_block)(uint8_t *, uint8_t *))
 {
-       gcm_impl_ops_t *gops;
+       const gcm_impl_ops_t *gops;
        uint8_t *cb;
        ulong_t remainder = iv_len;
        ulong_t processed = 0;
@@ -481,7 +482,7 @@ gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
     void (*copy_block)(uint8_t *, uint8_t *),
     void (*xor_block)(uint8_t *, uint8_t *))
 {
-       gcm_impl_ops_t *gops;
+       const gcm_impl_ops_t *gops;
        uint8_t *ghash, *datap, *authp;
        size_t remainder, processed;
 
@@ -660,12 +661,17 @@ static size_t gcm_supp_impl_cnt = 0;
 static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];
 
 /*
- * Selects the gcm operation
+ * Returns the GCM operations for encrypt/decrypt/key setup.  When a
+ * SIMD implementation is not allowed in the current context, fall
+ * back to the fastest generic implementation.
  */
-gcm_impl_ops_t *
+const gcm_impl_ops_t *
 gcm_impl_get_ops()
 {
-       gcm_impl_ops_t *ops = NULL;
+       if (!kfpu_allowed())
+               return (&gcm_generic_impl);
+
+       const gcm_impl_ops_t *ops = NULL;
        const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
 
        switch (impl) {
@@ -674,15 +680,13 @@ gcm_impl_get_ops()
                ops = &gcm_fastest_impl;
                break;
        case IMPL_CYCLE:
-       {
+               /* Cycle through supported implementations */
                ASSERT(gcm_impl_initialized);
                ASSERT3U(gcm_supp_impl_cnt, >, 0);
-               /* Cycle through supported implementations */
                static size_t cycle_impl_idx = 0;
                size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
                ops = gcm_supp_impl[idx];
-       }
-       break;
+               break;
        default:
                ASSERT3U(impl, <, gcm_supp_impl_cnt);
                ASSERT3U(gcm_supp_impl_cnt, >, 0);
@@ -696,13 +700,17 @@ gcm_impl_get_ops()
        return (ops);
 }
 
+/*
+ * Initialize all supported implementations.
+ */
+/* ARGSUSED */
 void
-gcm_impl_init(void)
+gcm_impl_init(void *arg)
 {
        gcm_impl_ops_t *curr_impl;
        int i, c;
 
-       /* move supported impl into aes_supp_impls */
+       /* Move supported implementations into gcm_supp_impls */
        for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
                curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];
 
@@ -711,7 +719,10 @@ gcm_impl_init(void)
        }
        gcm_supp_impl_cnt = c;
 
-       /* set fastest implementation. assume hardware accelerated is fastest */
+       /*
+        * Set the fastest implementation given the assumption that the
+        * hardware accelerated version is the fastest.
+        */
 #if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
        if (gcm_pclmulqdq_impl.is_supported())
                memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
diff --git a/module/icp/algs/modes/gcm_pclmulqdq.c b/module/icp/algs/modes/gcm_pclmulqdq.c
index be00ba37b6a6374d49c09fbeb0da9cf8090fc4ef..8a43ba33a6e5c8b73c66957fb9a5147e969efe81 100644 (file)
@@ -52,7 +52,7 @@ gcm_pclmulqdq_mul(uint64_t *x_in, uint64_t *y, uint64_t *res)
 static boolean_t
 gcm_pclmulqdq_will_work(void)
 {
-       return (zfs_pclmulqdq_available());
+       return (kfpu_allowed() && zfs_pclmulqdq_available());
 }
 
 const gcm_impl_ops_t gcm_pclmulqdq_impl = {
diff --git a/module/icp/include/aes/aes_impl.h b/module/icp/include/aes/aes_impl.h
index 95cfddf9e0a4b8707a240678cc2bbd14850df663..9fd9c1bd1a9a6b54edf4408976c79612c66b6ef4 100644 (file)
@@ -198,12 +198,12 @@ extern const aes_impl_ops_t aes_aesni_impl;
 /*
  * Initializes fastest implementation
  */
-void aes_impl_init(void);
+void aes_impl_init(void *arg);
 
 /*
- * Get selected aes implementation
+ * Returns optimal allowed AES implementation
  */
-struct aes_impl_ops *aes_impl_get_ops(void);
+const struct aes_impl_ops *aes_impl_get_ops(void);
 
 #ifdef __cplusplus
 }
diff --git a/module/icp/include/modes/gcm_impl.h b/module/icp/include/modes/gcm_impl.h
index cbb904c059b71eb40922f828f2c5335c003a0748..1380904873ab4fe0382fdf225dbb7ab9813f98ce 100644 (file)
@@ -61,12 +61,12 @@ extern const gcm_impl_ops_t gcm_pclmulqdq_impl;
 /*
  * Initializes fastest implementation
  */
-void gcm_impl_init(void);
+void gcm_impl_init(void *arg);
 
 /*
- * Get selected aes implementation
+ * Returns optimal allowed GCM implementation
  */
-struct gcm_impl_ops *gcm_impl_get_ops(void);
+const struct gcm_impl_ops *gcm_impl_get_ops(void);
 
 #ifdef __cplusplus
 }
diff --git a/module/icp/io/aes.c b/module/icp/io/aes.c
index 53b1936938696f0fe13e0280610448734727ae73..51538bc607805c51b6ec53c3e1da1e612c87a871 100644 (file)
@@ -206,9 +206,35 @@ aes_mod_init(void)
 {
        int ret;
 
-       /* find fastest implementations and set any requested implementations */
-       aes_impl_init();
-       gcm_impl_init();
+#if defined(_KERNEL)
+       /*
+        * Determine the fastest available implementation.  The benchmarks
+        * are run in dedicated kernel threads to allow Linux 5.0+ kernels
+        * to use SIMD operations.  If for some reason this isn't possible,
+        * fall back to the generic implementations.  See the comment in
+        * include/linux/simd_x86.h for additional details.  Additionally,
+        * this has the benefit of allowing them to be run in parallel.
+        */
+       taskqid_t aes_id = taskq_dispatch(system_taskq, aes_impl_init,
+           NULL, TQ_SLEEP);
+       taskqid_t gcm_id = taskq_dispatch(system_taskq, gcm_impl_init,
+           NULL, TQ_SLEEP);
+
+       if (aes_id != TASKQID_INVALID) {
+               taskq_wait_id(system_taskq, aes_id);
+       } else {
+               aes_impl_init(NULL);
+       }
+
+       if (gcm_id != TASKQID_INVALID) {
+               taskq_wait_id(system_taskq, gcm_id);
+       } else {
+               gcm_impl_init(NULL);
+       }
+#else
+       aes_impl_init(NULL);
+       gcm_impl_init(NULL);
+#endif
 
        if ((ret = mod_install(&modlinkage)) != 0)
                return (ret);
diff --git a/module/spl/spl-taskq.c b/module/spl/spl-taskq.c
index a39f94e4cc20611a1537c64879b8f33d8d223c6c..69d591ff74af16dfca6a3aeb581a7ace331f03da 100644 (file)
@@ -28,6 +28,7 @@
 #include <sys/taskq.h>
 #include <sys/kmem.h>
 #include <sys/tsd.h>
+#include <linux/simd.h>
 
 int spl_taskq_thread_bind = 0;
 module_param(spl_taskq_thread_bind, int, 0644);
@@ -853,6 +854,7 @@ taskq_thread(void *args)
        sigfillset(&blocked);
        sigprocmask(SIG_BLOCK, &blocked, NULL);
        flush_signals(current);
+       kfpu_initialize();
 
        tsd_set(taskq_tsd, tq);
        spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
diff --git a/module/spl/spl-thread.c b/module/spl/spl-thread.c
index d441ad65f3174a7e7d0c07266ea4193240f54d72..c4977bcf26cf131ccaa795db8d890a6aeeee3874 100644 (file)
@@ -27,6 +27,7 @@
 #include <sys/thread.h>
 #include <sys/kmem.h>
 #include <sys/tsd.h>
+#include <linux/simd.h>
 
 /*
  * Thread interfaces
@@ -54,6 +55,7 @@ thread_generic_wrapper(void *arg)
        args = tp->tp_args;
        set_current_state(tp->tp_state);
        set_user_nice((kthread_t *)current, PRIO_TO_NICE(tp->tp_pri));
+       kfpu_initialize();
        kmem_free(tp->tp_name, tp->tp_name_size);
        kmem_free(tp, sizeof (thread_priv_t));
 
diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c
index 5a991ba6073a6ccdd76c791423a757e068b6d068..b75d8ab003b06aa1bdbf20c671dd1d4dc5e41407 100644 (file)
 #include <sys/zio_checksum.h>
 #include <sys/zfs_context.h>
 #include <zfs_fletcher.h>
+#include <linux/simd.h>
 
 #define        FLETCHER_MIN_SIMD_SIZE  64
 
@@ -205,21 +206,19 @@ static struct fletcher_4_impl_selector {
        const char      *fis_name;
        uint32_t        fis_sel;
 } fletcher_4_impl_selectors[] = {
-#if !defined(_KERNEL)
        { "cycle",      IMPL_CYCLE },
-#endif
        { "fastest",    IMPL_FASTEST },
        { "scalar",     IMPL_SCALAR }
 };
 
 #if defined(_KERNEL)
 static kstat_t *fletcher_4_kstat;
-#endif
 
 static struct fletcher_4_kstat {
        uint64_t native;
        uint64_t byteswap;
 } fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];
+#endif
 
 /* Indicate that benchmark has been completed */
 static boolean_t fletcher_4_initialized = B_FALSE;
@@ -408,32 +407,36 @@ fletcher_4_impl_set(const char *val)
        return (err);
 }
 
+/*
+ * Returns the Fletcher 4 operations for checksums.  When a SIMD
+ * implementation is not allowed in the current context, fall back
+ * to the fastest generic implementation.
+ */
 static inline const fletcher_4_ops_t *
 fletcher_4_impl_get(void)
 {
-       fletcher_4_ops_t *ops = NULL;
-       const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
+       if (!kfpu_allowed())
+               return (&fletcher_4_superscalar4_ops);
+
+       const fletcher_4_ops_t *ops = NULL;
+       uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
 
        switch (impl) {
        case IMPL_FASTEST:
                ASSERT(fletcher_4_initialized);
                ops = &fletcher_4_fastest_impl;
                break;
-#if !defined(_KERNEL)
-       case IMPL_CYCLE: {
+       case IMPL_CYCLE:
+               /* Cycle through supported implementations */
                ASSERT(fletcher_4_initialized);
                ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
-
                static uint32_t cycle_count = 0;
                uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;
                ops = fletcher_4_supp_impls[idx];
-       }
-       break;
-#endif
+               break;
        default:
                ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
                ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);
-
                ops = fletcher_4_supp_impls[impl];
                break;
        }
@@ -658,6 +661,7 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
 typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *,
                                        zio_cksum_t *);
 
+#if defined(_KERNEL)
 static void
 fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
 {
@@ -716,16 +720,18 @@ fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
        /* restore original selection */
        atomic_swap_32(&fletcher_4_impl_chosen, sel_save);
 }
+#endif /* _KERNEL */
 
-void
-fletcher_4_init(void)
+/*
+ * Initialize and benchmark all supported implementations.
+ */
+static void
+fletcher_4_benchmark(void *arg)
 {
-       static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
        fletcher_4_ops_t *curr_impl;
-       char *databuf;
        int i, c;
 
-       /* move supported impl into fletcher_4_supp_impls */
+       /* Move supported implementations into fletcher_4_supp_impls */
        for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {
                curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i];
 
@@ -735,19 +741,10 @@ fletcher_4_init(void)
        membar_producer();      /* complete fletcher_4_supp_impls[] init */
        fletcher_4_supp_impls_cnt = c;  /* number of supported impl */
 
-#if !defined(_KERNEL)
-       /* Skip benchmarking and use last implementation as fastest */
-       memcpy(&fletcher_4_fastest_impl,
-           fletcher_4_supp_impls[fletcher_4_supp_impls_cnt-1],
-           sizeof (fletcher_4_fastest_impl));
-       fletcher_4_fastest_impl.name = "fastest";
-       membar_producer();
+#if defined(_KERNEL)
+       static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
+       char *databuf = vmem_alloc(data_size, KM_SLEEP);
 
-       fletcher_4_initialized = B_TRUE;
-       return;
-#endif
-       /* Benchmark all supported implementations */
-       databuf = vmem_alloc(data_size, KM_SLEEP);
        for (i = 0; i < data_size / sizeof (uint64_t); i++)
                ((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */
 
@@ -755,9 +752,38 @@ fletcher_4_init(void)
        fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);
 
        vmem_free(databuf, data_size);
+#else
+       /*
+        * Skip the benchmark in user space to avoid impacting libzpool
+        * consumers (zdb, zhack, zinject, ztest).  The last implementation
+        * is assumed to be the fastest and used by default.
+        */
+       memcpy(&fletcher_4_fastest_impl,
+           fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1],
+           sizeof (fletcher_4_fastest_impl));
+       fletcher_4_fastest_impl.name = "fastest";
+       membar_producer();
+#endif /* _KERNEL */
+}
 
+void
+fletcher_4_init(void)
+{
 #if defined(_KERNEL)
-       /* install kstats for all implementations */
+       /*
+        * For 5.0 and later Linux kernels the fletcher 4 benchmarks are
+        * run in a kernel thread.  This is needed to take advantage of the
+        * SIMD functionality; see include/linux/simd_x86.h for details.
+        */
+       taskqid_t id = taskq_dispatch(system_taskq, fletcher_4_benchmark,
+           NULL, TQ_SLEEP);
+       if (id != TASKQID_INVALID) {
+               taskq_wait_id(system_taskq, id);
+       } else {
+               fletcher_4_benchmark(NULL);
+       }
+
+       /* Install kstats for all implementations */
        fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
            KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
        if (fletcher_4_kstat != NULL) {
@@ -769,6 +795,8 @@ fletcher_4_init(void)
                    fletcher_4_kstat_addr);
                kstat_install(fletcher_4_kstat);
        }
+#else
+       fletcher_4_benchmark(NULL);
 #endif
 
        /* Finish initialization */
diff --git a/module/zcommon/zfs_fletcher_aarch64_neon.c b/module/zcommon/zfs_fletcher_aarch64_neon.c
index bd2db2b20fe23ad178a201890c506ad3dbc11ff9..3b3c1b52b804b0d7e75ee366d20de39940699935 100644 (file)
@@ -198,7 +198,7 @@ unsigned char SRC __attribute__((vector_size(16)));
 
 static boolean_t fletcher_4_aarch64_neon_valid(void)
 {
-       return (B_TRUE);
+       return (kfpu_allowed());
 }
 
 const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = {
diff --git a/module/zcommon/zfs_fletcher_avx512.c b/module/zcommon/zfs_fletcher_avx512.c
index 7260a9864be105d68c20aff69a66bbc0598b9631..0d4cff21a5064b47f549786cfdfd1a2c5628675e 100644 (file)
@@ -157,7 +157,7 @@ STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_byteswap);
 static boolean_t
 fletcher_4_avx512f_valid(void)
 {
-       return (zfs_avx512f_available());
+       return (kfpu_allowed() && zfs_avx512f_available());
 }
 
 const fletcher_4_ops_t fletcher_4_avx512f_ops = {
diff --git a/module/zcommon/zfs_fletcher_intel.c b/module/zcommon/zfs_fletcher_intel.c
index 6dac047dad0e3d15074043de664501f9c4ceccbd..7f12efe6d8c5c25c64eccd6484edf3065cdee511 100644 (file)
@@ -156,7 +156,7 @@ fletcher_4_avx2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
 
 static boolean_t fletcher_4_avx2_valid(void)
 {
-       return (zfs_avx_available() && zfs_avx2_available());
+       return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available());
 }
 
 const fletcher_4_ops_t fletcher_4_avx2_ops = {
diff --git a/module/zcommon/zfs_fletcher_sse.c b/module/zcommon/zfs_fletcher_sse.c
index a0b42e5f5fa8c25f4113598bbb7de985f950659d..e6389d6e5db86d4087ae326b1f0485fcf8f6d9ec 100644 (file)
@@ -157,7 +157,7 @@ fletcher_4_sse2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
 
 static boolean_t fletcher_4_sse2_valid(void)
 {
-       return (zfs_sse2_available());
+       return (kfpu_allowed() && zfs_sse2_available());
 }
 
 const fletcher_4_ops_t fletcher_4_sse2_ops = {
@@ -214,7 +214,8 @@ fletcher_4_ssse3_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
 
 static boolean_t fletcher_4_ssse3_valid(void)
 {
-       return (zfs_sse2_available() && zfs_ssse3_available());
+       return (kfpu_allowed() && zfs_sse2_available() &&
+           zfs_ssse3_available());
 }
 
 const fletcher_4_ops_t fletcher_4_ssse3_ops = {
diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c
index 3ef67768f9161356f4c591be50ffa1b541e19913..ef514e9e102a357eae9cd6fd51472011b76b6180 100644 (file)
@@ -27,9 +27,9 @@
 #include <sys/zio.h>
 #include <sys/debug.h>
 #include <sys/zfs_debug.h>
-
 #include <sys/vdev_raidz.h>
 #include <sys/vdev_raidz_impl.h>
+#include <linux/simd.h>
 
 extern boolean_t raidz_will_scalar_work(void);
 
@@ -87,6 +87,7 @@ static uint32_t user_sel_impl = IMPL_FASTEST;
 static size_t raidz_supp_impl_cnt = 0;
 static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)];
 
+#if defined(_KERNEL)
 /*
  * kstats values for supported implementations
  * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s]
@@ -95,14 +96,19 @@ static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1];
 
 /* kstat for benchmarked implementations */
 static kstat_t *raidz_math_kstat = NULL;
+#endif
 
 /*
- * Selects the raidz operation for raidz_map
- * If rm_ops is set to NULL original raidz implementation will be used
+ * Returns the RAIDZ operations for raidz_map() parity calculations.  When
+ * a SIMD implementation is not allowed in the current context, fall back
+ * to the fastest generic implementation.
  */
-raidz_impl_ops_t *
-vdev_raidz_math_get_ops()
+const raidz_impl_ops_t *
+vdev_raidz_math_get_ops(void)
 {
+       if (!kfpu_allowed())
+               return (&vdev_raidz_scalar_impl);
+
        raidz_impl_ops_t *ops = NULL;
        const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);
 
@@ -111,18 +117,14 @@ vdev_raidz_math_get_ops()
                ASSERT(raidz_math_initialized);
                ops = &vdev_raidz_fastest_impl;
                break;
-#if !defined(_KERNEL)
        case IMPL_CYCLE:
-       {
+               /* Cycle through all supported implementations */
                ASSERT(raidz_math_initialized);
                ASSERT3U(raidz_supp_impl_cnt, >, 0);
-               /* Cycle through all supported implementations */
                static size_t cycle_impl_idx = 0;
                size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt;
                ops = raidz_supp_impl[idx];
-       }
-       break;
-#endif
+               break;
        case IMPL_ORIGINAL:
                ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl;
                break;
@@ -273,6 +275,8 @@ const char *raidz_rec_name[] = {
        "rec_pq", "rec_pr", "rec_qr", "rec_pqr"
 };
 
+#if defined(_KERNEL)
+
 #define        RAIDZ_KSTAT_LINE_LEN    (17 + 10*12 + 1)
 
 static int
@@ -435,21 +439,21 @@ benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn)
                }
        }
 }
+#endif
 
-void
-vdev_raidz_math_init(void)
+/*
+ * Initialize and benchmark all supported implementations.
+ */
+static void
+benchmark_raidz(void *arg)
 {
        raidz_impl_ops_t *curr_impl;
-       zio_t *bench_zio = NULL;
-       raidz_map_t *bench_rm = NULL;
-       uint64_t bench_parity;
-       int i, c, fn;
+       int i, c;
 
-       /* move supported impl into raidz_supp_impl */
+       /* Move supported impl into raidz_supp_impl */
        for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
                curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i];
 
-               /* initialize impl */
                if (curr_impl->init)
                        curr_impl->init();
 
@@ -459,18 +463,10 @@ vdev_raidz_math_init(void)
        membar_producer();              /* complete raidz_supp_impl[] init */
        raidz_supp_impl_cnt = c;        /* number of supported impl */
 
-#if !defined(_KERNEL)
-       /* Skip benchmarking and use last implementation as fastest */
-       memcpy(&vdev_raidz_fastest_impl, raidz_supp_impl[raidz_supp_impl_cnt-1],
-           sizeof (vdev_raidz_fastest_impl));
-       strcpy(vdev_raidz_fastest_impl.name, "fastest");
-
-       raidz_math_initialized = B_TRUE;
-
-       /* Use 'cycle' math selection method for userspace */
-       VERIFY0(vdev_raidz_impl_set("cycle"));
-       return;
-#endif
+#if defined(_KERNEL)
+       zio_t *bench_zio = NULL;
+       raidz_map_t *bench_rm = NULL;
+       uint64_t bench_parity;
 
        /* Fake a zio and run the benchmark on a warmed up buffer */
        bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
@@ -480,7 +476,7 @@ vdev_raidz_math_init(void)
        memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE);
 
        /* Benchmark parity generation methods */
-       for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
+       for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
                bench_parity = fn + 1;
                /* New raidz_map is needed for each generate_p/q/r */
                bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
@@ -495,7 +491,7 @@ vdev_raidz_math_init(void)
        bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
            BENCH_COLS, PARITY_PQR);
 
-       for (fn = 0; fn < RAIDZ_REC_NUM; fn++)
+       for (int fn = 0; fn < RAIDZ_REC_NUM; fn++)
                benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl);
 
        vdev_raidz_map_free(bench_rm);
@@ -503,11 +499,39 @@ vdev_raidz_math_init(void)
        /* cleanup the bench zio */
        abd_free(bench_zio->io_abd);
        kmem_free(bench_zio, sizeof (zio_t));
+#else
+       /*
+        * Skip the benchmark in user space to avoid impacting libzpool
+        * consumers (zdb, zhack, zinject, ztest).  The last implementation
+        * is assumed to be the fastest and used by default.
+        */
+       memcpy(&vdev_raidz_fastest_impl,
+           raidz_supp_impl[raidz_supp_impl_cnt - 1],
+           sizeof (vdev_raidz_fastest_impl));
+       strcpy(vdev_raidz_fastest_impl.name, "fastest");
+#endif /* _KERNEL */
+}
 
-       /* install kstats for all impl */
+void
+vdev_raidz_math_init(void)
+{
+#if defined(_KERNEL)
+       /*
+        * For 5.0 and later Linux kernels the raidz benchmarks are
+        * run in a kernel thread.  This is needed to take advantage of the
+        * SIMD functionality; see include/linux/simd_x86.h for details.
+        */
+       taskqid_t id = taskq_dispatch(system_taskq, benchmark_raidz,
+           NULL, TQ_SLEEP);
+       if (id != TASKQID_INVALID) {
+               taskq_wait_id(system_taskq, id);
+       } else {
+               benchmark_raidz(NULL);
+       }
+
+       /* Install kstats for all implementations */
        raidz_math_kstat = kstat_create("zfs", 0, "vdev_raidz_bench", "misc",
            KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
-
        if (raidz_math_kstat != NULL) {
                raidz_math_kstat->ks_data = NULL;
                raidz_math_kstat->ks_ndata = UINT32_MAX;
@@ -517,6 +541,9 @@ vdev_raidz_math_init(void)
                    raidz_math_kstat_addr);
                kstat_install(raidz_math_kstat);
        }
+#else
+       benchmark_raidz(NULL);
+#endif
 
        /* Finish initialization */
        atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl);
@@ -527,15 +554,15 @@ void
 vdev_raidz_math_fini(void)
 {
        raidz_impl_ops_t const *curr_impl;
-       int i;
 
+#if defined(_KERNEL)
        if (raidz_math_kstat != NULL) {
                kstat_delete(raidz_math_kstat);
                raidz_math_kstat = NULL;
        }
+#endif
 
-       /* fini impl */
-       for (i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
+       for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
                curr_impl = raidz_all_maths[i];
                if (curr_impl->fini)
                        curr_impl->fini();
@@ -546,9 +573,7 @@ static const struct {
        char *name;
        uint32_t sel;
 } math_impl_opts[] = {
-#if !defined(_KERNEL)
                { "cycle",      IMPL_CYCLE },
-#endif
                { "fastest",    IMPL_FASTEST },
                { "original",   IMPL_ORIGINAL },
                { "scalar",     IMPL_SCALAR }
diff --git a/module/zfs/vdev_raidz_math_aarch64_neon.c b/module/zfs/vdev_raidz_math_aarch64_neon.c
index e3ad06776503415f4f74265180a257d89eb7038a..0a67ceb8492053849ec4d902892f1a89682fa9ae 100644 (file)
@@ -207,7 +207,7 @@ DEFINE_REC_METHODS(aarch64_neon);
 static boolean_t
 raidz_will_aarch64_neon_work(void)
 {
-       return (B_TRUE); // __arch64__ requires NEON
+       return (kfpu_allowed());
 }
 
 const raidz_impl_ops_t vdev_raidz_aarch64_neon_impl = {
diff --git a/module/zfs/vdev_raidz_math_aarch64_neonx2.c b/module/zfs/vdev_raidz_math_aarch64_neonx2.c
index f8688a06a8f6b452ef2bc7c3013cfd097e3726db..e072f51cd6356c19390885975d9de71bb5fbf1a9 100644 (file)
@@ -217,7 +217,7 @@ DEFINE_REC_METHODS(aarch64_neonx2);
 static boolean_t
 raidz_will_aarch64_neonx2_work(void)
 {
-       return (B_TRUE); // __arch64__ requires NEON
+       return (kfpu_allowed());
 }
 
 const raidz_impl_ops_t vdev_raidz_aarch64_neonx2_impl = {
diff --git a/module/zfs/vdev_raidz_math_avx2.c b/module/zfs/vdev_raidz_math_avx2.c
index 063d29bcd8bf45dd40bc75350bf3a56b2d8ef7b1..a12eb672081fb87fc7fdd0fa654fd43e7c255ef2 100644 (file)
@@ -396,7 +396,7 @@ DEFINE_REC_METHODS(avx2);
 static boolean_t
 raidz_will_avx2_work(void)
 {
-       return (zfs_avx_available() && zfs_avx2_available());
+       return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available());
 }
 
 const raidz_impl_ops_t vdev_raidz_avx2_impl = {
diff --git a/module/zfs/vdev_raidz_math_avx512bw.c b/module/zfs/vdev_raidz_math_avx512bw.c
index d605653db3f1dd45c6f8d3339c4af87aa54c99b8..2f545c9ec07835655581fdfb330c4f764bf93541 100644 (file)
@@ -393,9 +393,8 @@ DEFINE_REC_METHODS(avx512bw);
 static boolean_t
 raidz_will_avx512bw_work(void)
 {
-       return (zfs_avx_available() &&
-           zfs_avx512f_available() &&
-           zfs_avx512bw_available());
+       return (kfpu_allowed() && zfs_avx_available() &&
+           zfs_avx512f_available() && zfs_avx512bw_available());
 }
 
 const raidz_impl_ops_t vdev_raidz_avx512bw_impl = {
diff --git a/module/zfs/vdev_raidz_math_avx512f.c b/module/zfs/vdev_raidz_math_avx512f.c
index f4e4560ced836c50f4f23cab86794df738303b0e..75af7a8eea96709cac8472e8b32cebb365e56483 100644 (file)
@@ -470,9 +470,8 @@ DEFINE_REC_METHODS(avx512f);
 static boolean_t
 raidz_will_avx512f_work(void)
 {
-       return (zfs_avx_available() &&
-           zfs_avx2_available() &&
-           zfs_avx512f_available());
+       return (kfpu_allowed() && zfs_avx_available() &&
+           zfs_avx2_available() && zfs_avx512f_available());
 }
 
 const raidz_impl_ops_t vdev_raidz_avx512f_impl = {
diff --git a/module/zfs/vdev_raidz_math_sse2.c b/module/zfs/vdev_raidz_math_sse2.c
index 9985da273643d95c3cf685b3e37eea2ab121d433..5b3a9385c9d88ff7788471c4ca84408a722a699e 100644 (file)
@@ -607,7 +607,7 @@ DEFINE_REC_METHODS(sse2);
 static boolean_t
 raidz_will_sse2_work(void)
 {
-       return (zfs_sse_available() && zfs_sse2_available());
+       return (kfpu_allowed() && zfs_sse_available() && zfs_sse2_available());
 }
 
 const raidz_impl_ops_t vdev_raidz_sse2_impl = {
diff --git a/module/zfs/vdev_raidz_math_ssse3.c b/module/zfs/vdev_raidz_math_ssse3.c
index 047a48d544f1fed804da46baac3a44d29258c918..62247cf8eb8d9093349f579081746b26c7cb384a 100644 (file)
@@ -399,8 +399,8 @@ DEFINE_REC_METHODS(ssse3);
 static boolean_t
 raidz_will_ssse3_work(void)
 {
-       return (zfs_sse_available() && zfs_sse2_available() &&
-           zfs_ssse3_available());
+       return (kfpu_allowed() && zfs_sse_available() &&
+           zfs_sse2_available() && zfs_ssse3_available());
 }
 
 const raidz_impl_ops_t vdev_raidz_ssse3_impl = {