#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/vdev_initialize.h>
+#include <sys/vdev_raidz.h>
#include <sys/vdev_trim.h>
#include <sys/spa_impl.h>
#include <sys/metaslab_impl.h>
metaslab_preload_limit = ztest_random(20) + 1;
ztest_spa = spa;
+ VERIFY0(vdev_raidz_impl_set("cycle"));
+
dmu_objset_stats_t dds;
VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool,
DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os));
dnl # Handle differences in kernel FPU code.
dnl #
dnl # Kernel
-dnl # 5.0: All kernel fpu functions are GPL only, so we can't use them.
-dnl # (nothing defined)
+dnl # 5.2: The fpu->initialized flag was replaced by TIF_NEED_FPU_LOAD.
+dnl # HAVE_KERNEL_TIF_NEED_FPU_LOAD
+dnl #
+dnl # 5.0: As an optimization SIMD operations performed by kernel
+dnl # threads can skip saving and restoring their FPU context.
+dnl # Wrappers have been introduced to determine the running
+dnl # context and use either the SIMD or generic implementation.
+dnl # This change was made to the 4.19.38 and 4.14.120 LTS kernels.
+dnl # HAVE_KERNEL_FPU_INITIALIZED
dnl #
dnl # 4.2: Use __kernel_fpu_{begin,end}()
dnl # HAVE_UNDERSCORE_KERNEL_FPU & KERNEL_EXPORTS_X86_FPU
__kernel_fpu_end();
], [__kernel_fpu_begin], [arch/x86/kernel/fpu/core.c arch/x86/kernel/i387.c], [
AC_MSG_RESULT(__kernel_fpu_*)
- AC_DEFINE(HAVE_UNDERSCORE_KERNEL_FPU, 1, [kernel has __kernel_fpu_* functions])
- AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1, [kernel exports FPU functions])
+ AC_DEFINE(HAVE_UNDERSCORE_KERNEL_FPU, 1,
+ [kernel has __kernel_fpu_* functions])
+ AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1,
+ [kernel exports FPU functions])
],[
- AC_MSG_RESULT(not exported)
+ ZFS_LINUX_TRY_COMPILE([
+ #include <linux/module.h>
+ #include <linux/sched.h>
+ ],[
+	struct fpu *fpu = &current->thread.fpu;
+	if (fpu->initialized) { return (0); }
+ ],[
+ AC_MSG_RESULT(fpu.initialized)
+ AC_DEFINE(HAVE_KERNEL_FPU_INITIALIZED, 1,
+ [kernel fpu.initialized exists])
+ ],[
+ ZFS_LINUX_TRY_COMPILE([
+ #include <linux/module.h>
+ #include <asm/thread_info.h>
+
+ #if !defined(TIF_NEED_FPU_LOAD)
+ #error "TIF_NEED_FPU_LOAD undefined"
+ #endif
+ ],[
+ ],[
+ AC_MSG_RESULT(TIF_NEED_FPU_LOAD)
+ AC_DEFINE(
+ HAVE_KERNEL_TIF_NEED_FPU_LOAD, 1,
+ [kernel TIF_NEED_FPU_LOAD exists])
+ ],[
+ AC_MSG_RESULT(unavailable)
+ ])
+ ])
])
])
])
$(top_srcdir)/include/linux/blkdev_compat.h \
$(top_srcdir)/include/linux/utsname_compat.h \
$(top_srcdir)/include/linux/kmap_compat.h \
+ $(top_srcdir)/include/linux/simd.h \
$(top_srcdir)/include/linux/simd_x86.h \
$(top_srcdir)/include/linux/simd_aarch64.h \
$(top_srcdir)/include/linux/mod_compat.h \
--- /dev/null
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2019 Lawrence Livermore National Security, LLC.
+ */
+
+#ifndef _SIMD_H
+#define _SIMD_H
+
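+/*
+ * Common kfpu_* interface: dedicated kernel threads call kfpu_initialize()
+ * once at thread start, consumers check kfpu_allowed() before selecting a
+ * SIMD implementation, and the implementation brackets its vectorized code
+ * with kfpu_begin()/kfpu_end().  A typical caller (sketch):
+ *
+ *	if (kfpu_allowed()) {
+ *		kfpu_begin();
+ *		... SIMD optimized code ...
+ *		kfpu_end();
+ *	} else {
+ *		... generic scalar fallback ...
+ *	}
+ */
+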
+#if defined(__x86)
+#include <linux/simd_x86.h>
+
+#elif defined(__aarch64__)
+#include <linux/simd_aarch64.h>
+#else
+
+#define kfpu_allowed() 1
+#define kfpu_initialize(tsk) do {} while (0)
+#define kfpu_begin() do {} while (0)
+#define kfpu_end() do {} while (0)
+
+#endif
+#endif /* _SIMD_H */
#if defined(_KERNEL)
#include <asm/neon.h>
-#define kfpu_begin() \
-{ \
- kernel_neon_begin(); \
-}
-#define kfpu_end() \
-{ \
- kernel_neon_end(); \
-}
+#define kfpu_allowed() 1
+#define kfpu_initialize(tsk) do {} while (0)
+#define kfpu_begin() kernel_neon_begin()
+#define kfpu_end() kernel_neon_end()
#else
/*
* fpu dummy methods for userspace
*/
-#define kfpu_begin() do {} while (0)
-#define kfpu_end() do {} while (0)
+#define kfpu_allowed() 1
+#define kfpu_initialize(tsk) do {} while (0)
+#define kfpu_begin() do {} while (0)
+#define kfpu_end() do {} while (0)
#endif /* defined(_KERNEL) */
#endif /* __aarch64__ */
#include <asm/xcr.h>
#endif
+/*
+ * The following cases are for kernels which export either the
+ * kernel_fpu_* or __kernel_fpu_* functions.
+ */
+#if defined(KERNEL_EXPORTS_X86_FPU)
+
+#define kfpu_allowed() 1
+#define kfpu_initialize(tsk) do {} while (0)
+
#if defined(HAVE_UNDERSCORE_KERNEL_FPU)
#define kfpu_begin() \
-{ \
- preempt_disable(); \
+{ \
+ preempt_disable(); \
__kernel_fpu_begin(); \
}
-#define kfpu_end() \
-{ \
- __kernel_fpu_end(); \
- preempt_enable(); \
+#define kfpu_end() \
+{ \
+ __kernel_fpu_end(); \
+ preempt_enable(); \
}
+
#elif defined(HAVE_KERNEL_FPU)
-#define kfpu_begin() kernel_fpu_begin()
+#define kfpu_begin() kernel_fpu_begin()
#define kfpu_end() kernel_fpu_end()
+
#else
-/* Kernel doesn't export any kernel_fpu_* functions */
-#include <asm/fpu/internal.h> /* For kernel xgetbv() */
-#define kfpu_begin() panic("This code should never run")
-#define kfpu_end() panic("This code should never run")
-#endif /* defined(HAVE_KERNEL_FPU) */
+/*
+ * This case is unreachable. When KERNEL_EXPORTS_X86_FPU is defined then
+ * either HAVE_UNDERSCORE_KERNEL_FPU or HAVE_KERNEL_FPU must be defined.
+ */
+#error "Unreachable kernel configuration"
+#endif
+
+#else /* defined(KERNEL_EXPORTS_X86_FPU) */
+/*
+ * When the kernel_fpu_* symbols are unavailable then provide our own
+ * versions which allow the FPU to be safely used in kernel threads.
+ * In practice, this is not a significant restriction for ZFS since the
+ * vast majority of SIMD operations are performed by the IO pipeline.
+ */
+/*
+ * Returns non-zero if FPU operations are allowed in the current context.
+ */
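+/*
+ * FPU use is restricted to dedicated kernel threads (PF_KTHREAD) since,
+ * by definition, they never return to user space and therefore have no
+ * user FPU state of their own which must be saved and restored.
+ */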
+#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
+#define kfpu_allowed() ((current->flags & PF_KTHREAD) && \
+ test_thread_flag(TIF_NEED_FPU_LOAD))
+#elif defined(HAVE_KERNEL_FPU_INITIALIZED)
+#define kfpu_allowed() ((current->flags & PF_KTHREAD) && \
+ current->thread.fpu.initialized)
#else
+#define kfpu_allowed() 0
+#endif
+
+static inline void
+kfpu_initialize(void)
+{
+ WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
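+
+	/*
+	 * Invalidate the task's cached FPU register state and flag the
+	 * thread accordingly (TIF_NEED_FPU_LOAD or fpu.initialized,
+	 * depending on the kernel) so it may safely use the FPU.
+	 */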
+
+#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
+	__fpu_invalidate_fpregs_state(&current->thread.fpu);
+ set_thread_flag(TIF_NEED_FPU_LOAD);
+#elif defined(HAVE_KERNEL_FPU_INITIALIZED)
+	__fpu_invalidate_fpregs_state(&current->thread.fpu);
+ current->thread.fpu.initialized = 1;
+#endif
+}
+
+static inline void
+kfpu_begin(void)
+{
+ WARN_ON_ONCE(!kfpu_allowed());
+
+ /*
+ * Preemption and interrupts must be disabled for the critical
+ * region where the FPU state is being modified.
+ */
+ preempt_disable();
+ local_irq_disable();
+
+#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
+ /*
+ * The current FPU registers need to be preserved by kfpu_begin()
+	 * and restored by kfpu_end(). This is required because we cannot
+	 * call __cpu_invalidate_fpregs_state() to invalidate the
+ * per-cpu FPU state and force them to be restored during a
+ * context switch.
+ */
+	copy_fpregs_to_fpstate(&current->thread.fpu);
+#elif defined(HAVE_KERNEL_FPU_INITIALIZED)
+ /*
+ * There is no need to preserve and restore the FPU registers.
+ * They will always be restored from the task's stored FPU state
+ * when switching contexts.
+ */
+ WARN_ON_ONCE(current->thread.fpu.initialized == 0);
+#endif
+}
+
+static inline void
+kfpu_end(void)
+{
+#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
+	union fpregs_state *state = &current->thread.fpu.state;
+ int error;
+
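+
+	/*
+	 * Restore the FPU registers preserved by kfpu_begin() using the
+	 * save format supported by the CPU (xsave, fxsr, or fsave).
+	 */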
+ if (use_xsave()) {
+ error = copy_kernel_to_xregs_err(&state->xsave, -1);
+ } else if (use_fxsr()) {
+ error = copy_kernel_to_fxregs_err(&state->fxsave);
+ } else {
+ error = copy_kernel_to_fregs_err(&state->fsave);
+ }
+ WARN_ON_ONCE(error);
+#endif
+
+ local_irq_enable();
+ preempt_enable();
+}
+#endif /* defined(KERNEL_EXPORTS_X86_FPU) */
+
+#else /* defined(_KERNEL) */
/*
- * fpu dummy methods for userspace
+ * FPU dummy methods for user space.
*/
-#define kfpu_begin() do {} while (0)
-#define kfpu_end() do {} while (0)
+#define kfpu_allowed() 1
+#define kfpu_initialize(tsk) do {} while (0)
+#define kfpu_begin() do {} while (0)
+#define kfpu_end() do {} while (0)
#endif /* defined(_KERNEL) */
/*
uint64_t xcr0;
#if defined(_KERNEL)
-#if defined(X86_FEATURE_OSXSAVE) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_OSXSAVE)
has_osxsave = !!boot_cpu_has(X86_FEATURE_OSXSAVE);
#else
has_osxsave = B_FALSE;
zfs_sse_available(void)
{
#if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
return (!!boot_cpu_has(X86_FEATURE_XMM));
-#else
- return (B_FALSE);
-#endif
#elif !defined(_KERNEL)
return (__cpuid_has_sse());
#endif
zfs_sse2_available(void)
{
#if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
return (!!boot_cpu_has(X86_FEATURE_XMM2));
-#else
- return (B_FALSE);
-#endif
#elif !defined(_KERNEL)
return (__cpuid_has_sse2());
#endif
zfs_sse3_available(void)
{
#if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
return (!!boot_cpu_has(X86_FEATURE_XMM3));
-#else
- return (B_FALSE);
-#endif
#elif !defined(_KERNEL)
return (__cpuid_has_sse3());
#endif
zfs_ssse3_available(void)
{
#if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
return (!!boot_cpu_has(X86_FEATURE_SSSE3));
-#else
- return (B_FALSE);
-#endif
#elif !defined(_KERNEL)
return (__cpuid_has_ssse3());
#endif
zfs_sse4_1_available(void)
{
#if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
return (!!boot_cpu_has(X86_FEATURE_XMM4_1));
-#else
- return (B_FALSE);
-#endif
#elif !defined(_KERNEL)
return (__cpuid_has_sse4_1());
#endif
zfs_sse4_2_available(void)
{
#if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
return (!!boot_cpu_has(X86_FEATURE_XMM4_2));
-#else
- return (B_FALSE);
-#endif
#elif !defined(_KERNEL)
return (__cpuid_has_sse4_2());
#endif
{
boolean_t has_avx;
#if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
has_avx = !!boot_cpu_has(X86_FEATURE_AVX);
-#else
- has_avx = B_FALSE;
-#endif
#elif !defined(_KERNEL)
has_avx = __cpuid_has_avx();
#endif
{
boolean_t has_avx2;
#if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX2) && defined(KERNEL_EXPORTS_X86_FPU)
has_avx2 = !!boot_cpu_has(X86_FEATURE_AVX2);
-#else
- has_avx2 = B_FALSE;
-#endif
#elif !defined(_KERNEL)
has_avx2 = __cpuid_has_avx2();
#endif
zfs_bmi1_available(void)
{
#if defined(_KERNEL)
-#if defined(X86_FEATURE_BMI1) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_BMI1)
return (!!boot_cpu_has(X86_FEATURE_BMI1));
#else
return (B_FALSE);
zfs_bmi2_available(void)
{
#if defined(_KERNEL)
-#if defined(X86_FEATURE_BMI2) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_BMI2)
return (!!boot_cpu_has(X86_FEATURE_BMI2));
#else
return (B_FALSE);
zfs_aes_available(void)
{
#if defined(_KERNEL)
-#if defined(X86_FEATURE_AES) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AES)
return (!!boot_cpu_has(X86_FEATURE_AES));
#else
return (B_FALSE);
zfs_pclmulqdq_available(void)
{
#if defined(_KERNEL)
-#if defined(X86_FEATURE_PCLMULQDQ) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_PCLMULQDQ)
return (!!boot_cpu_has(X86_FEATURE_PCLMULQDQ));
#else
return (B_FALSE);
boolean_t has_avx512 = B_FALSE;
#if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512F) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512F)
has_avx512 = !!boot_cpu_has(X86_FEATURE_AVX512F);
#else
has_avx512 = B_FALSE;
boolean_t has_avx512 = B_FALSE;
#if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512CD) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512CD)
has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
boot_cpu_has(X86_FEATURE_AVX512CD);
#else
boolean_t has_avx512 = B_FALSE;
#if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512ER) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512ER)
has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
boot_cpu_has(X86_FEATURE_AVX512ER);
#else
boolean_t has_avx512 = B_FALSE;
#if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512PF) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512PF)
has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
boot_cpu_has(X86_FEATURE_AVX512PF);
#else
boolean_t has_avx512 = B_FALSE;
#if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512BW) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512BW)
has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
boot_cpu_has(X86_FEATURE_AVX512BW);
#else
boolean_t has_avx512 = B_FALSE;
#if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512DQ) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512DQ)
has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
boot_cpu_has(X86_FEATURE_AVX512DQ);
#else
boolean_t has_avx512 = B_FALSE;
#if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512VL) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512VL)
has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
boot_cpu_has(X86_FEATURE_AVX512VL);
#else
boolean_t has_avx512 = B_FALSE;
#if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512IFMA) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512IFMA)
has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
boot_cpu_has(X86_FEATURE_AVX512IFMA);
#else
boolean_t has_avx512 = B_FALSE;
#if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512VBMI) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512VBMI)
has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
boot_cpu_has(X86_FEATURE_AVX512VBMI);
#else
*/
void vdev_raidz_math_init(void);
void vdev_raidz_math_fini(void);
-struct raidz_impl_ops *vdev_raidz_math_get_ops(void);
+const struct raidz_impl_ops *vdev_raidz_math_get_ops(void);
int vdev_raidz_math_generate(struct raidz_map *);
int vdev_raidz_math_reconstruct(struct raidz_map *, const int *, const int *,
const int);
uintptr_t rm_reports; /* # of referencing checksum reports */
uint8_t rm_freed; /* map no longer has referencing ZIO */
uint8_t rm_ecksuminjected; /* checksum error was injected */
- raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */
+ const raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */
raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
} raidz_map_t;
#include <sys/crypto/spi.h>
#include <modes/modes.h>
#include <aes/aes_impl.h>
+#include <linux/simd.h>
/*
* Initialize AES encryption and decryption key schedules.
void
aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits, void *keysched)
{
- aes_impl_ops_t *ops = aes_impl_get_ops();
- aes_key_t *newbie = keysched;
- uint_t keysize, i, j;
+ const aes_impl_ops_t *ops = aes_impl_get_ops();
+ aes_key_t *newbie = keysched;
+ uint_t keysize, i, j;
union {
uint64_t ka64[4];
uint32_t ka32[8];
static aes_impl_ops_t *aes_supp_impl[ARRAY_SIZE(aes_all_impl)];
/*
- * Selects the aes operations for encrypt/decrypt/key setup
+ * Returns the AES operations for encrypt/decrypt/key setup. When a
+ * SIMD implementation is not allowed in the current context, fall
+ * back to the fastest generic implementation.
*/
-aes_impl_ops_t *
-aes_impl_get_ops()
+const aes_impl_ops_t *
+aes_impl_get_ops(void)
{
- aes_impl_ops_t *ops = NULL;
+ if (!kfpu_allowed())
+ return (&aes_generic_impl);
+
+ const aes_impl_ops_t *ops = NULL;
const uint32_t impl = AES_IMPL_READ(icp_aes_impl);
switch (impl) {
ops = &aes_fastest_impl;
break;
case IMPL_CYCLE:
- {
+ /* Cycle through supported implementations */
ASSERT(aes_impl_initialized);
ASSERT3U(aes_supp_impl_cnt, >, 0);
- /* Cycle through supported implementations */
static size_t cycle_impl_idx = 0;
size_t idx = (++cycle_impl_idx) % aes_supp_impl_cnt;
ops = aes_supp_impl[idx];
- }
- break;
+ break;
default:
ASSERT3U(impl, <, aes_supp_impl_cnt);
ASSERT3U(aes_supp_impl_cnt, >, 0);
return (ops);
}
+/*
+ * Initialize all supported implementations.
+ */
+/* ARGSUSED */
void
-aes_impl_init(void)
+aes_impl_init(void *arg)
{
aes_impl_ops_t *curr_impl;
int i, c;
- /* move supported impl into aes_supp_impls */
+ /* Move supported implementations into aes_supp_impls */
for (i = 0, c = 0; i < ARRAY_SIZE(aes_all_impl); i++) {
curr_impl = (aes_impl_ops_t *)aes_all_impl[i];
static boolean_t
aes_aesni_will_work(void)
{
- return (zfs_aes_available());
+ return (kfpu_allowed() && zfs_aes_available());
}
const aes_impl_ops_t aes_aesni_impl = {
#include <sys/crypto/impl.h>
#include <sys/byteorder.h>
#include <modes/gcm_impl.h>
+#include <linux/simd.h>
#define GHASH(c, d, t, o) \
xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
void (*copy_block)(uint8_t *, uint8_t *),
void (*xor_block)(uint8_t *, uint8_t *))
{
- gcm_impl_ops_t *gops;
+ const gcm_impl_ops_t *gops;
size_t remainder = length;
size_t need = 0;
uint8_t *datap = (uint8_t *)data;
void (*copy_block)(uint8_t *, uint8_t *),
void (*xor_block)(uint8_t *, uint8_t *))
{
- gcm_impl_ops_t *gops;
+ const gcm_impl_ops_t *gops;
uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
uint8_t *ghash, *macp = NULL;
int i, rv;
int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
void (*xor_block)(uint8_t *, uint8_t *))
{
- gcm_impl_ops_t *gops;
+ const gcm_impl_ops_t *gops;
size_t pt_len;
size_t remainder;
uint8_t *ghash;
void (*copy_block)(uint8_t *, uint8_t *),
void (*xor_block)(uint8_t *, uint8_t *))
{
- gcm_impl_ops_t *gops;
+ const gcm_impl_ops_t *gops;
uint8_t *cb;
ulong_t remainder = iv_len;
ulong_t processed = 0;
void (*copy_block)(uint8_t *, uint8_t *),
void (*xor_block)(uint8_t *, uint8_t *))
{
- gcm_impl_ops_t *gops;
+ const gcm_impl_ops_t *gops;
uint8_t *ghash, *datap, *authp;
size_t remainder, processed;
static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];
/*
- * Selects the gcm operation
+ * Returns the GCM operations for encrypt/decrypt/key setup. When a
+ * SIMD implementation is not allowed in the current context, fall
+ * back to the fastest generic implementation.
*/
-gcm_impl_ops_t *
+const gcm_impl_ops_t *
gcm_impl_get_ops()
{
- gcm_impl_ops_t *ops = NULL;
+ if (!kfpu_allowed())
+ return (&gcm_generic_impl);
+
+ const gcm_impl_ops_t *ops = NULL;
const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
switch (impl) {
ops = &gcm_fastest_impl;
break;
case IMPL_CYCLE:
- {
+ /* Cycle through supported implementations */
ASSERT(gcm_impl_initialized);
ASSERT3U(gcm_supp_impl_cnt, >, 0);
- /* Cycle through supported implementations */
static size_t cycle_impl_idx = 0;
size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
ops = gcm_supp_impl[idx];
- }
- break;
+ break;
default:
ASSERT3U(impl, <, gcm_supp_impl_cnt);
ASSERT3U(gcm_supp_impl_cnt, >, 0);
return (ops);
}
+/*
+ * Initialize all supported implementations.
+ */
+/* ARGSUSED */
void
-gcm_impl_init(void)
+gcm_impl_init(void *arg)
{
gcm_impl_ops_t *curr_impl;
int i, c;
- /* move supported impl into aes_supp_impls */
+ /* Move supported implementations into gcm_supp_impls */
for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];
}
gcm_supp_impl_cnt = c;
- /* set fastest implementation. assume hardware accelerated is fastest */
+ /*
+ * Set the fastest implementation given the assumption that the
+ * hardware accelerated version is the fastest.
+ */
#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
if (gcm_pclmulqdq_impl.is_supported())
memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
static boolean_t
gcm_pclmulqdq_will_work(void)
{
- return (zfs_pclmulqdq_available());
+ return (kfpu_allowed() && zfs_pclmulqdq_available());
}
const gcm_impl_ops_t gcm_pclmulqdq_impl = {
/*
* Initializes fastest implementation
*/
-void aes_impl_init(void);
+void aes_impl_init(void *arg);
/*
- * Get selected aes implementation
+ * Returns the optimal allowed AES implementation
*/
-struct aes_impl_ops *aes_impl_get_ops(void);
+const struct aes_impl_ops *aes_impl_get_ops(void);
#ifdef __cplusplus
}
/*
* Initializes fastest implementation
*/
-void gcm_impl_init(void);
+void gcm_impl_init(void *arg);
/*
- * Get selected aes implementation
+ * Returns the optimal allowed GCM implementation
*/
-struct gcm_impl_ops *gcm_impl_get_ops(void);
+const struct gcm_impl_ops *gcm_impl_get_ops(void);
#ifdef __cplusplus
}
{
int ret;
- /* find fastest implementations and set any requested implementations */
- aes_impl_init();
- gcm_impl_init();
+#if defined(_KERNEL)
+ /*
+ * Determine the fastest available implementation. The benchmarks
+ * are run in dedicated kernel threads to allow Linux 5.0+ kernels
+ * to use SIMD operations. If for some reason this isn't possible,
+ * fallback to the generic implementations. See the comment in
+	 * fall back to the generic implementations. See the comment in
+ * this has the benefit of allowing them to be run in parallel.
+ */
+ taskqid_t aes_id = taskq_dispatch(system_taskq, aes_impl_init,
+ NULL, TQ_SLEEP);
+ taskqid_t gcm_id = taskq_dispatch(system_taskq, gcm_impl_init,
+ NULL, TQ_SLEEP);
+
+ if (aes_id != TASKQID_INVALID) {
+ taskq_wait_id(system_taskq, aes_id);
+ } else {
+ aes_impl_init(NULL);
+ }
+
+ if (gcm_id != TASKQID_INVALID) {
+ taskq_wait_id(system_taskq, gcm_id);
+ } else {
+ gcm_impl_init(NULL);
+ }
+#else
+ aes_impl_init(NULL);
+ gcm_impl_init(NULL);
+#endif
if ((ret = mod_install(&modlinkage)) != 0)
return (ret);
#include <sys/taskq.h>
#include <sys/kmem.h>
#include <sys/tsd.h>
+#include <linux/simd.h>
int spl_taskq_thread_bind = 0;
module_param(spl_taskq_thread_bind, int, 0644);
sigfillset(&blocked);
sigprocmask(SIG_BLOCK, &blocked, NULL);
flush_signals(current);
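+	/* Allow SIMD (FPU) usage in this dedicated taskq kernel thread. */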
+ kfpu_initialize();
tsd_set(taskq_tsd, tq);
spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
#include <sys/thread.h>
#include <sys/kmem.h>
#include <sys/tsd.h>
+#include <linux/simd.h>
/*
* Thread interfaces
args = tp->tp_args;
set_current_state(tp->tp_state);
set_user_nice((kthread_t *)current, PRIO_TO_NICE(tp->tp_pri));
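+	/* Allow SIMD (FPU) usage in this dedicated kernel thread. */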
+ kfpu_initialize();
kmem_free(tp->tp_name, tp->tp_name_size);
kmem_free(tp, sizeof (thread_priv_t));
#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <zfs_fletcher.h>
+#include <linux/simd.h>
#define FLETCHER_MIN_SIMD_SIZE 64
const char *fis_name;
uint32_t fis_sel;
} fletcher_4_impl_selectors[] = {
-#if !defined(_KERNEL)
{ "cycle", IMPL_CYCLE },
-#endif
{ "fastest", IMPL_FASTEST },
{ "scalar", IMPL_SCALAR }
};
#if defined(_KERNEL)
static kstat_t *fletcher_4_kstat;
-#endif
static struct fletcher_4_kstat {
uint64_t native;
uint64_t byteswap;
} fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];
+#endif
/* Indicate that benchmark has been completed */
static boolean_t fletcher_4_initialized = B_FALSE;
return (err);
}
+/*
+ * Returns the Fletcher 4 operations for checksums. When a SIMD
+ * implementation is not allowed in the current context, fall back
+ * to the fastest generic implementation.
+ */
static inline const fletcher_4_ops_t *
fletcher_4_impl_get(void)
{
- fletcher_4_ops_t *ops = NULL;
- const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
+ if (!kfpu_allowed())
+ return (&fletcher_4_superscalar4_ops);
+
+ const fletcher_4_ops_t *ops = NULL;
+ uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
switch (impl) {
case IMPL_FASTEST:
ASSERT(fletcher_4_initialized);
ops = &fletcher_4_fastest_impl;
break;
-#if !defined(_KERNEL)
- case IMPL_CYCLE: {
+ case IMPL_CYCLE:
+ /* Cycle through supported implementations */
ASSERT(fletcher_4_initialized);
ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
-
static uint32_t cycle_count = 0;
uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;
ops = fletcher_4_supp_impls[idx];
- }
- break;
-#endif
+ break;
default:
ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);
-
ops = fletcher_4_supp_impls[impl];
break;
}
typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *,
zio_cksum_t *);
+#if defined(_KERNEL)
static void
fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
{
/* restore original selection */
atomic_swap_32(&fletcher_4_impl_chosen, sel_save);
}
+#endif /* _KERNEL */
-void
-fletcher_4_init(void)
+/*
+ * Initialize and benchmark all supported implementations.
+ */
+static void
+fletcher_4_benchmark(void *arg)
{
- static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
fletcher_4_ops_t *curr_impl;
- char *databuf;
int i, c;
- /* move supported impl into fletcher_4_supp_impls */
+ /* Move supported implementations into fletcher_4_supp_impls */
for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {
curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i];
membar_producer(); /* complete fletcher_4_supp_impls[] init */
fletcher_4_supp_impls_cnt = c; /* number of supported impl */
-#if !defined(_KERNEL)
- /* Skip benchmarking and use last implementation as fastest */
- memcpy(&fletcher_4_fastest_impl,
- fletcher_4_supp_impls[fletcher_4_supp_impls_cnt-1],
- sizeof (fletcher_4_fastest_impl));
- fletcher_4_fastest_impl.name = "fastest";
- membar_producer();
+#if defined(_KERNEL)
+ static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
+ char *databuf = vmem_alloc(data_size, KM_SLEEP);
- fletcher_4_initialized = B_TRUE;
- return;
-#endif
- /* Benchmark all supported implementations */
- databuf = vmem_alloc(data_size, KM_SLEEP);
for (i = 0; i < data_size / sizeof (uint64_t); i++)
((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */
fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);
vmem_free(databuf, data_size);
+#else
+ /*
+ * Skip the benchmark in user space to avoid impacting libzpool
+ * consumers (zdb, zhack, zinject, ztest). The last implementation
+ * is assumed to be the fastest and used by default.
+ */
+ memcpy(&fletcher_4_fastest_impl,
+ fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1],
+ sizeof (fletcher_4_fastest_impl));
+ fletcher_4_fastest_impl.name = "fastest";
+ membar_producer();
+#endif /* _KERNEL */
+}
+void
+fletcher_4_init(void)
+{
#if defined(_KERNEL)
- /* install kstats for all implementations */
+ /*
+	 * For 5.0 and later Linux kernels the Fletcher 4 benchmarks are
+	 * run in a kernel thread. This is needed to take advantage of the
+	 * SIMD functionality; see include/linux/simd_x86.h for details.
+ */
+ taskqid_t id = taskq_dispatch(system_taskq, fletcher_4_benchmark,
+ NULL, TQ_SLEEP);
+ if (id != TASKQID_INVALID) {
+ taskq_wait_id(system_taskq, id);
+ } else {
+ fletcher_4_benchmark(NULL);
+ }
+
+ /* Install kstats for all implementations */
fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
if (fletcher_4_kstat != NULL) {
fletcher_4_kstat_addr);
kstat_install(fletcher_4_kstat);
}
+#else
+ fletcher_4_benchmark(NULL);
#endif
/* Finish initialization */
static boolean_t fletcher_4_aarch64_neon_valid(void)
{
- return (B_TRUE);
+ return (kfpu_allowed());
}
const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = {
static boolean_t
fletcher_4_avx512f_valid(void)
{
- return (zfs_avx512f_available());
+ return (kfpu_allowed() && zfs_avx512f_available());
}
const fletcher_4_ops_t fletcher_4_avx512f_ops = {
static boolean_t fletcher_4_avx2_valid(void)
{
- return (zfs_avx_available() && zfs_avx2_available());
+ return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available());
}
const fletcher_4_ops_t fletcher_4_avx2_ops = {
static boolean_t fletcher_4_sse2_valid(void)
{
- return (zfs_sse2_available());
+ return (kfpu_allowed() && zfs_sse2_available());
}
const fletcher_4_ops_t fletcher_4_sse2_ops = {
static boolean_t fletcher_4_ssse3_valid(void)
{
- return (zfs_sse2_available() && zfs_ssse3_available());
+ return (kfpu_allowed() && zfs_sse2_available() &&
+ zfs_ssse3_available());
}
const fletcher_4_ops_t fletcher_4_ssse3_ops = {
#include <sys/zio.h>
#include <sys/debug.h>
#include <sys/zfs_debug.h>
-
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>
+#include <linux/simd.h>
extern boolean_t raidz_will_scalar_work(void);
static size_t raidz_supp_impl_cnt = 0;
static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)];
+#if defined(_KERNEL)
/*
* kstats values for supported implementations
* Values represent per disk throughput of 8 disk+parity raidz vdev [B/s]
/* kstat for benchmarked implementations */
static kstat_t *raidz_math_kstat = NULL;
+#endif
/*
- * Selects the raidz operation for raidz_map
- * If rm_ops is set to NULL original raidz implementation will be used
+ * Returns the RAIDZ operations for raidz_map parity calculations. When
+ * a SIMD implementation is not allowed in the current context, fall back
+ * to the fastest generic implementation.
*/
-raidz_impl_ops_t *
-vdev_raidz_math_get_ops()
+const raidz_impl_ops_t *
+vdev_raidz_math_get_ops(void)
{
+ if (!kfpu_allowed())
+ return (&vdev_raidz_scalar_impl);
+
raidz_impl_ops_t *ops = NULL;
const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);
ASSERT(raidz_math_initialized);
ops = &vdev_raidz_fastest_impl;
break;
-#if !defined(_KERNEL)
case IMPL_CYCLE:
- {
+ /* Cycle through all supported implementations */
ASSERT(raidz_math_initialized);
ASSERT3U(raidz_supp_impl_cnt, >, 0);
- /* Cycle through all supported implementations */
static size_t cycle_impl_idx = 0;
size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt;
ops = raidz_supp_impl[idx];
- }
- break;
-#endif
+ break;
case IMPL_ORIGINAL:
ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl;
break;
"rec_pq", "rec_pr", "rec_qr", "rec_pqr"
};
+#if defined(_KERNEL)
+
#define RAIDZ_KSTAT_LINE_LEN (17 + 10*12 + 1)
static int
}
}
}
+#endif
-void
-vdev_raidz_math_init(void)
+/*
+ * Initialize and benchmark all supported implementations.
+ */
+static void
+benchmark_raidz(void *arg)
{
raidz_impl_ops_t *curr_impl;
- zio_t *bench_zio = NULL;
- raidz_map_t *bench_rm = NULL;
- uint64_t bench_parity;
- int i, c, fn;
+ int i, c;
- /* move supported impl into raidz_supp_impl */
+ /* Move supported impl into raidz_supp_impl */
for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i];
- /* initialize impl */
if (curr_impl->init)
curr_impl->init();
membar_producer(); /* complete raidz_supp_impl[] init */
raidz_supp_impl_cnt = c; /* number of supported impl */
-#if !defined(_KERNEL)
- /* Skip benchmarking and use last implementation as fastest */
- memcpy(&vdev_raidz_fastest_impl, raidz_supp_impl[raidz_supp_impl_cnt-1],
- sizeof (vdev_raidz_fastest_impl));
- strcpy(vdev_raidz_fastest_impl.name, "fastest");
-
- raidz_math_initialized = B_TRUE;
-
- /* Use 'cycle' math selection method for userspace */
- VERIFY0(vdev_raidz_impl_set("cycle"));
- return;
-#endif
+#if defined(_KERNEL)
+ zio_t *bench_zio = NULL;
+ raidz_map_t *bench_rm = NULL;
+ uint64_t bench_parity;
/* Fake a zio and run the benchmark on a warmed up buffer */
bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE);
/* Benchmark parity generation methods */
- for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
+ for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
bench_parity = fn + 1;
/* New raidz_map is needed for each generate_p/q/r */
bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
BENCH_COLS, PARITY_PQR);
- for (fn = 0; fn < RAIDZ_REC_NUM; fn++)
+ for (int fn = 0; fn < RAIDZ_REC_NUM; fn++)
benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl);
vdev_raidz_map_free(bench_rm);
/* cleanup the bench zio */
abd_free(bench_zio->io_abd);
kmem_free(bench_zio, sizeof (zio_t));
+#else
+ /*
+ * Skip the benchmark in user space to avoid impacting libzpool
+ * consumers (zdb, zhack, zinject, ztest). The last implementation
+ * is assumed to be the fastest and used by default.
+ */
+ memcpy(&vdev_raidz_fastest_impl,
+ raidz_supp_impl[raidz_supp_impl_cnt - 1],
+ sizeof (vdev_raidz_fastest_impl));
+ strcpy(vdev_raidz_fastest_impl.name, "fastest");
+#endif /* _KERNEL */
+}
- /* install kstats for all impl */
+void
+vdev_raidz_math_init(void)
+{
+#if defined(_KERNEL)
+ /*
+	 * For 5.0 and later Linux kernels the RAIDZ benchmarks are run
+	 * in a kernel thread. This is needed to take advantage of the
+	 * SIMD functionality; see include/linux/simd_x86.h for details.
+ */
+ taskqid_t id = taskq_dispatch(system_taskq, benchmark_raidz,
+ NULL, TQ_SLEEP);
+ if (id != TASKQID_INVALID) {
+ taskq_wait_id(system_taskq, id);
+ } else {
+ benchmark_raidz(NULL);
+ }
+
+ /* Install kstats for all implementations */
raidz_math_kstat = kstat_create("zfs", 0, "vdev_raidz_bench", "misc",
KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
-
if (raidz_math_kstat != NULL) {
raidz_math_kstat->ks_data = NULL;
raidz_math_kstat->ks_ndata = UINT32_MAX;
raidz_math_kstat_addr);
kstat_install(raidz_math_kstat);
}
+#else
+ benchmark_raidz(NULL);
+#endif
/* Finish initialization */
atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl);
vdev_raidz_math_fini(void)
{
raidz_impl_ops_t const *curr_impl;
- int i;
+#if defined(_KERNEL)
if (raidz_math_kstat != NULL) {
kstat_delete(raidz_math_kstat);
raidz_math_kstat = NULL;
}
+#endif
- /* fini impl */
- for (i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
+ for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
curr_impl = raidz_all_maths[i];
if (curr_impl->fini)
curr_impl->fini();
char *name;
uint32_t sel;
} math_impl_opts[] = {
-#if !defined(_KERNEL)
{ "cycle", IMPL_CYCLE },
-#endif
{ "fastest", IMPL_FASTEST },
{ "original", IMPL_ORIGINAL },
{ "scalar", IMPL_SCALAR }
static boolean_t
raidz_will_aarch64_neon_work(void)
{
- return (B_TRUE); // __arch64__ requires NEON
+ return (kfpu_allowed());
}
const raidz_impl_ops_t vdev_raidz_aarch64_neon_impl = {
static boolean_t
raidz_will_aarch64_neonx2_work(void)
{
- return (B_TRUE); // __arch64__ requires NEON
+ return (kfpu_allowed());
}
const raidz_impl_ops_t vdev_raidz_aarch64_neonx2_impl = {
static boolean_t
raidz_will_avx2_work(void)
{
- return (zfs_avx_available() && zfs_avx2_available());
+ return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available());
}
const raidz_impl_ops_t vdev_raidz_avx2_impl = {
static boolean_t
raidz_will_avx512bw_work(void)
{
- return (zfs_avx_available() &&
- zfs_avx512f_available() &&
- zfs_avx512bw_available());
+ return (kfpu_allowed() && zfs_avx_available() &&
+ zfs_avx512f_available() && zfs_avx512bw_available());
}
const raidz_impl_ops_t vdev_raidz_avx512bw_impl = {
static boolean_t
raidz_will_avx512f_work(void)
{
- return (zfs_avx_available() &&
- zfs_avx2_available() &&
- zfs_avx512f_available());
+ return (kfpu_allowed() && zfs_avx_available() &&
+ zfs_avx2_available() && zfs_avx512f_available());
}
const raidz_impl_ops_t vdev_raidz_avx512f_impl = {
static boolean_t
raidz_will_sse2_work(void)
{
- return (zfs_sse_available() && zfs_sse2_available());
+ return (kfpu_allowed() && zfs_sse_available() && zfs_sse2_available());
}
const raidz_impl_ops_t vdev_raidz_sse2_impl = {
static boolean_t
raidz_will_ssse3_work(void)
{
- return (zfs_sse_available() && zfs_sse2_available() &&
- zfs_ssse3_available());
+ return (kfpu_allowed() && zfs_sse_available() &&
+ zfs_sse2_available() && zfs_ssse3_available());
}
const raidz_impl_ops_t vdev_raidz_ssse3_impl = {