From 32ffaa3de58981814342fe6d3556c03d41d121f8 Mon Sep 17 00:00:00 2001 From: Gvozden Neskovic Date: Fri, 1 Jul 2016 18:33:04 +0200 Subject: [PATCH] Add support for AVX-512 family of instruction sets This patch adds compiler and runtime tests (user and kernel) for following instruction sets: avx512f, avx512cd, avx512er, avx512pf, avx512bw, avx512dq, avx512vl, avx512ifma, avx512vbmi. note: Linux support for AVX-512F (Foundation) instruction set started with linux v3.15 Signed-off-by: Gvozden Neskovic Signed-off-by: Brian Behlendorf Issue #4952 --- config/toolchain-simd.m4 | 189 ++++++++++++++++++++++++++++++ include/linux/simd_x86.h | 245 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 422 insertions(+), 12 deletions(-) diff --git a/config/toolchain-simd.m4 b/config/toolchain-simd.m4 index 0f8c1f2d9..29abbbb5b 100644 --- a/config/toolchain-simd.m4 +++ b/config/toolchain-simd.m4 @@ -12,6 +12,15 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_TOOLCHAIN_SIMD], [ ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE4_2 ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX2 + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512F + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512CD + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512DQ + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512BW + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512IFMA + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512VBMI + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512PF + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512ER + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512VL ;; esac ]) @@ -170,3 +179,183 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX2], [ AC_MSG_RESULT([no]) ]) ]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512F +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512F], [ + AC_MSG_CHECKING([whether host toolchain supports AVX512F]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([ + [ + void main() + { + __asm__ __volatile__("vpandd %zmm0,%zmm1,%zmm2"); + } + ]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_AVX512F], 1, [Define if host toolchain 
supports AVX512F]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512CD +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512CD], [ + AC_MSG_CHECKING([whether host toolchain supports AVX512CD]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([ + [ + void main() + { + __asm__ __volatile__("vplzcntd %zmm0,%zmm1"); + } + ]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_AVX512CD], 1, [Define if host toolchain supports AVX512CD]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512DQ +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512DQ], [ + AC_MSG_CHECKING([whether host toolchain supports AVX512DQ]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([ + [ + void main() + { + __asm__ __volatile__("vandpd %zmm0,%zmm1,%zmm2"); + } + ]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_AVX512DQ], 1, [Define if host toolchain supports AVX512DQ]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512BW +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512BW], [ + AC_MSG_CHECKING([whether host toolchain supports AVX512BW]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([ + [ + void main() + { + __asm__ __volatile__("vpshufb %zmm0,%zmm1,%zmm2"); + } + ]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_AVX512BW], 1, [Define if host toolchain supports AVX512BW]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512IFMA +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512IFMA], [ + AC_MSG_CHECKING([whether host toolchain supports AVX512IFMA]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([ + [ + void main() + { + __asm__ __volatile__("vpmadd52luq %zmm0,%zmm1,%zmm2"); + } + ]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_AVX512IFMA], 1, [Define if host toolchain supports AVX512IFMA]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512VBMI +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512VBMI], [ + 
AC_MSG_CHECKING([whether host toolchain supports AVX512VBMI]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([ + [ + void main() + { + __asm__ __volatile__("vpermb %zmm0,%zmm1,%zmm2"); + } + ]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_AVX512VBMI], 1, [Define if host toolchain supports AVX512VBMI]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512PF +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512PF], [ + AC_MSG_CHECKING([whether host toolchain supports AVX512PF]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([ + [ + void main() + { + __asm__ __volatile__("vgatherpf0dps (%rsi,%zmm0,4){%k1}"); + } + ]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_AVX512PF], 1, [Define if host toolchain supports AVX512PF]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512ER +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512ER], [ + AC_MSG_CHECKING([whether host toolchain supports AVX512ER]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([ + [ + void main() + { + __asm__ __volatile__("vexp2pd %zmm0,%zmm1"); + } + ]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_AVX512ER], 1, [Define if host toolchain supports AVX512ER]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512VL +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512VL], [ + AC_MSG_CHECKING([whether host toolchain supports AVX512VL]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([ + [ + void main() + { + __asm__ __volatile__("vpabsq %ymm0,%ymm1"); + } + ]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_AVX512VL], 1, [Define if host toolchain supports AVX512VL]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) diff --git a/include/linux/simd_x86.h b/include/linux/simd_x86.h index 6aa51144c..216dbed62 100644 --- a/include/linux/simd_x86.h +++ b/include/linux/simd_x86.h @@ -43,10 +43,25 @@ * zfs_ssse3_available() * zfs_sse4_1_available() * zfs_sse4_2_available() + * * zfs_avx_available() * zfs_avx2_available() + * * 
zfs_bmi1_available() * zfs_bmi2_available() + * + * zfs_avx512f_available() + * zfs_avx512cd_available() + * zfs_avx512er_available() + * zfs_avx512pf_available() + * zfs_avx512bw_available() + * zfs_avx512dq_available() + * zfs_avx512vl_available() + * zfs_avx512ifma_available() + * zfs_avx512vbmi_available() + * + * NOTE(AVX-512VL): If using AVX-512 instructions with 128Bit registers + * also add zfs_avx512vl_available() to feature check. */ #ifndef _SIMD_X86_H @@ -124,7 +139,16 @@ typedef enum cpuid_inst_sets { AVX, AVX2, BMI1, - BMI2 + BMI2, + AVX512F, + AVX512CD, + AVX512DQ, + AVX512BW, + AVX512IFMA, + AVX512VBMI, + AVX512PF, + AVX512ER, + AVX512VL } cpuid_inst_sets_t; /* @@ -132,11 +156,21 @@ typedef enum cpuid_inst_sets { */ typedef struct cpuid_feature_desc { uint32_t leaf; /* CPUID leaf */ - uint32_t subleaf; /* CPUID subleaf */ + uint32_t subleaf; /* CPUID sub-leaf */ uint32_t flag; /* bit mask of the feature */ cpuid_regs_t reg; /* which CPUID return register to test */ } cpuid_feature_desc_t; +#define _AVX512F_BIT (1U << 16) +#define _AVX512CD_BIT (_AVX512F_BIT | (1U << 28)) +#define _AVX512DQ_BIT (_AVX512F_BIT | (1U << 17)) +#define _AVX512BW_BIT (_AVX512F_BIT | (1U << 30)) +#define _AVX512IFMA_BIT (_AVX512F_BIT | (1U << 21)) +#define _AVX512VBMI_BIT (1U << 1) /* AVX512F_BIT is on another leaf */ +#define _AVX512PF_BIT (_AVX512F_BIT | (1U << 26)) +#define _AVX512ER_BIT (_AVX512F_BIT | (1U << 27)) +#define _AVX512VL_BIT (1U << 31) /* if used also check other levels */ + /* * Descriptions of supported instruction sets */ @@ -151,7 +185,16 @@ static const cpuid_feature_desc_t cpuid_features[] = { [AVX] = {1U, 0U, 1U << 28, ECX }, [AVX2] = {7U, 0U, 1U << 5, EBX }, [BMI1] = {7U, 0U, 1U << 3, EBX }, - [BMI2] = {7U, 0U, 1U << 8, EBX } + [BMI2] = {7U, 0U, 1U << 8, EBX }, + [AVX512F] = {7U, 0U, _AVX512F_BIT, EBX }, + [AVX512CD] = {7U, 0U, _AVX512CD_BIT, EBX }, + [AVX512DQ] = {7U, 0U, _AVX512DQ_BIT, EBX }, + [AVX512BW] = {7U, 0U, _AVX512BW_BIT, EBX }, + 
[AVX512IFMA] = {7U, 0U, _AVX512IFMA_BIT, EBX }, + [AVX512VBMI] = {7U, 0U, _AVX512VBMI_BIT, ECX }, + [AVX512PF] = {7U, 0U, _AVX512PF_BIT, EBX }, + [AVX512ER] = {7U, 0U, _AVX512ER_BIT, EBX }, + [AVX512VL] = {7U, 0U, _AVX512VL_BIT, EBX } }; /* @@ -187,15 +230,15 @@ __cpuid_check_feature(const cpuid_feature_desc_t *desc) */ __cpuid_count(desc->leaf, desc->subleaf, r[EAX], r[EBX], r[ECX], r[EDX]); - return (!!(r[desc->reg] & desc->flag)); + return ((r[desc->reg] & desc->flag) == desc->flag); } return (B_FALSE); } -#define CPUID_FEATURE_CHECK(name, id) \ -static inline boolean_t \ -__cpuid_has_ ## name(void)\ -{ \ +#define CPUID_FEATURE_CHECK(name, id) \ +static inline boolean_t \ +__cpuid_has_ ## name(void) \ +{ \ return (__cpuid_check_feature(&cpuid_features[id])); \ } @@ -213,16 +256,25 @@ CPUID_FEATURE_CHECK(avx2, AVX2); CPUID_FEATURE_CHECK(osxsave, OSXSAVE); CPUID_FEATURE_CHECK(bmi1, BMI1); CPUID_FEATURE_CHECK(bmi2, BMI2); +CPUID_FEATURE_CHECK(avx512f, AVX512F); +CPUID_FEATURE_CHECK(avx512cd, AVX512CD); +CPUID_FEATURE_CHECK(avx512dq, AVX512DQ); +CPUID_FEATURE_CHECK(avx512bw, AVX512BW); +CPUID_FEATURE_CHECK(avx512ifma, AVX512IFMA); +CPUID_FEATURE_CHECK(avx512vbmi, AVX512VBMI); +CPUID_FEATURE_CHECK(avx512pf, AVX512PF); +CPUID_FEATURE_CHECK(avx512er, AVX512ER); +CPUID_FEATURE_CHECK(avx512vl, AVX512VL); #endif /* !defined(_KERNEL) */ + /* - * Detect ymm register set support + * Detect register set support */ static inline boolean_t -__ymm_enabled(void) +__simd_state_enabled(const uint64_t state) { - static const uint64_t XSTATE_SSE_AVX = 0x2 | 0x4; boolean_t has_osxsave; uint64_t xcr0; @@ -238,9 +290,16 @@ __ymm_enabled(void) return (B_FALSE); xcr0 = xgetbv(0); - return ((xcr0 & XSTATE_SSE_AVX) == XSTATE_SSE_AVX); + return ((xcr0 & state) == state); } +#define _XSTATE_SSE_AVX (0x2 | 0x4) +#define _XSTATE_AVX512 (0xE0 | _XSTATE_SSE_AVX) + +#define __ymm_enabled() __simd_state_enabled(_XSTATE_SSE_AVX) +#define __zmm_enabled() __simd_state_enabled(_XSTATE_AVX512) + + /* * 
Check if SSE instruction set is available */ @@ -383,6 +442,168 @@ zfs_bmi2_available(void) #endif } + +/* + * AVX-512 family of instruction sets: + * + * AVX512F Foundation + * AVX512CD Conflict Detection Instructions + * AVX512ER Exponential and Reciprocal Instructions + * AVX512PF Prefetch Instructions + * + * AVX512BW Byte and Word Instructions + * AVX512DQ Double-word and Quadword Instructions + * AVX512VL Vector Length Extensions + * + * AVX512IFMA Integer Fused Multiply Add (Not supported by kernel 4.4) + * AVX512VBMI Vector Byte Manipulation Instructions + */ + + +/* Check if AVX512F instruction set is available */ +static inline boolean_t +zfs_avx512f_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(_KERNEL) && defined(X86_FEATURE_AVX512F) + has_avx512 = !!boot_cpu_has(X86_FEATURE_AVX512F); +#elif !defined(_KERNEL) + has_avx512 = __cpuid_has_avx512f(); +#endif + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512CD instruction set is available */ +static inline boolean_t +zfs_avx512cd_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(_KERNEL) && defined(X86_FEATURE_AVX512CD) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512CD); +#elif !defined(_KERNEL) + has_avx512 = __cpuid_has_avx512cd(); +#endif + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512ER instruction set is available */ +static inline boolean_t +zfs_avx512er_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(_KERNEL) && defined(X86_FEATURE_AVX512ER) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512ER); +#elif !defined(_KERNEL) + has_avx512 = __cpuid_has_avx512er(); +#endif + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512PF instruction set is available */ +static inline boolean_t +zfs_avx512pf_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(_KERNEL) && defined(X86_FEATURE_AVX512PF) + has_avx512 = 
boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512PF); +#elif !defined(_KERNEL) + has_avx512 = __cpuid_has_avx512pf(); +#endif + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512BW instruction set is available */ +static inline boolean_t +zfs_avx512bw_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(_KERNEL) && defined(X86_FEATURE_AVX512BW) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512BW); +#elif !defined(_KERNEL) + has_avx512 = __cpuid_has_avx512bw(); +#endif + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512DQ instruction set is available */ +static inline boolean_t +zfs_avx512dq_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(_KERNEL) && defined(X86_FEATURE_AVX512DQ) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512DQ); +#elif !defined(_KERNEL) + has_avx512 = __cpuid_has_avx512dq(); +#endif + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512VL instruction set is available */ +static inline boolean_t +zfs_avx512vl_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(_KERNEL) && defined(X86_FEATURE_AVX512VL) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512VL); +#elif !defined(_KERNEL) + has_avx512 = __cpuid_has_avx512vl(); +#endif + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512IFMA instruction set is available */ +static inline boolean_t +zfs_avx512ifma_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(_KERNEL) && defined(X86_FEATURE_AVX512IFMA) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512IFMA); +#elif !defined(_KERNEL) + has_avx512 = __cpuid_has_avx512ifma(); +#endif + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512VBMI instruction set is available */ +static inline boolean_t +zfs_avx512vbmi_available(void) +{ + boolean_t 
has_avx512 = B_FALSE; + +#if defined(_KERNEL) && defined(X86_FEATURE_AVX512VBMI) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512VBMI); +#elif !defined(_KERNEL) + has_avx512 = __cpuid_has_avx512f() && + __cpuid_has_avx512vbmi(); +#endif + + return (has_avx512 && __zmm_enabled()); +} + #endif /* defined(__x86) */ #endif /* _SIMD_X86_H */ -- 2.40.0