]> granicus.if.org Git - zfs/blobdiff - module/zfs/vdev_raidz_math_avx512bw.c
OpenZFS 9682 - page fault in dsl_async_clone_destroy() while opening pool
[zfs] / module / zfs / vdev_raidz_math_avx512bw.c
index 465d1e56935c9ca7b55a14368a1057a276820281..3d5326b9e6e161aae77b61461d1af7e300f267df 100644 (file)
  */
 /*
  * Copyright (C) 2016 Romain Dolbeau. All rights reserved.
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
  */
 
 #include <sys/isa_defs.h>
 
-#if 0 // defined(__x86_64) && defined(HAVE_AVX512BW)
+#if defined(__x86_64) && defined(HAVE_AVX512BW)
 
 #include <sys/types.h>
 #include <linux/simd_x86.h>
@@ -66,20 +67,6 @@ typedef struct v {
        uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
 } v_t;
 
-#define        PREFETCHNTA(ptr, offset)                                        \
-{                                                                      \
-       __asm(                                                          \
-           "prefetchnta " #offset "(%[MEM])\n"                         \
-           : : [MEM] "r" (ptr));                                       \
-}
-
-#define        PREFETCH(ptr, offset)                                           \
-{                                                                      \
-       __asm(                                                          \
-           "prefetcht0 " #offset "(%[MEM])\n"                          \
-           : : [MEM] "r" (ptr));                                       \
-}
-
 #define        XOR_ACC(src, r...)                                              \
 {                                                                      \
        switch (REG_CNT(r)) {                                           \
@@ -122,27 +109,9 @@ typedef struct v {
        }                                                               \
 }
 
-#define        ZERO(r...)                                                      \
-{                                                                      \
-       switch (REG_CNT(r)) {                                           \
-       case 4:                                                         \
-               __asm(                                                  \
-                   "vpxorq %" VR0(r) ", %" VR0(r)", %" VR0(r) "\n"     \
-                   "vpxorq %" VR1(r) ", %" VR1(r)", %" VR1(r) "\n"     \
-                   "vpxorq %" VR2(r) ", %" VR2(r)", %" VR2(r) "\n"     \
-                   "vpxorq %" VR3(r) ", %" VR3(r)", %" VR3(r));        \
-               break;                                                  \
-       case 2:                                                         \
-               __asm(                                                  \
-                   "vpxorq %" VR0(r) ", %" VR0(r)", %" VR0(r) "\n"     \
-                   "vpxorq %" VR1(r) ", %" VR1(r)", %" VR1(r));        \
-               break;                                                  \
-       default:                                                        \
-               ASM_BUG();                                              \
-       }                                                               \
-}
+#define        ZERO(r...)      XOR(r, r)
 
-#define        COPY(r...)                                                      \
+#define        COPY(r...)                                                      \
 {                                                                      \
        switch (REG_CNT(r)) {                                           \
        case 8:                                                         \
@@ -162,7 +131,7 @@ typedef struct v {
        }                                                               \
 }
 
-#define        LOAD(src, r...)                                                 \
+#define        LOAD(src, r...)                                                 \
 {                                                                      \
        switch (REG_CNT(r)) {                                           \
        case 4:                                                         \
@@ -184,7 +153,7 @@ typedef struct v {
        }                                                               \
 }
 
-#define        STORE(dst, r...)                                                \
+#define        STORE(dst, r...)                                                \
 {                                                                      \
        switch (REG_CNT(r)) {                                           \
        case 4:                                                         \
@@ -206,41 +175,26 @@ typedef struct v {
        }                                                               \
 }
 
-#define        FLUSH()                                                         \
-{                                                                      \
-       __asm("vzeroupper");                                            \
-}
-
 #define        MUL2_SETUP()                                                    \
 {                                                                      \
-       __asm("vmovq %0,   %%xmm14" :: "r"(0x1d1d1d1d1d1d1d1d));        \
-       __asm("vpbroadcastq %xmm14, %zmm14");                           \
-       __asm("vmovq %0,   %%xmm13" :: "r"(0x8080808080808080));        \
-       __asm("vpbroadcastq %xmm13, %zmm13");                           \
-       __asm("vmovq %0,   %%xmm12" :: "r"(0xfefefefefefefefe));        \
-       __asm("vpbroadcastq %xmm12, %zmm12");                           \
-       __asm("vpxorq       %zmm15, %zmm15 ,%zmm15");                   \
+       __asm("vmovq %0,    %%xmm22" :: "r"(0x1d1d1d1d1d1d1d1d));       \
+       __asm("vpbroadcastq  %xmm22, %zmm22");                          \
+       __asm("vpxord        %zmm23, %zmm23 ,%zmm23");                  \
 }
 
-#define        _MUL2(r...)                                                     \
+#define        _MUL2(r...)                                                     \
 {                                                                      \
        switch  (REG_CNT(r)) {                                          \
        case 2:                                                         \
                __asm(                                                  \
-                   "vpandq   %" VR0(r)", %zmm13, %zmm10\n"             \
-                   "vpandq   %" VR1(r)", %zmm13, %zmm11\n"             \
-                   "vpsrlq   $7, %zmm10, %zmm8\n"                      \
-                   "vpsrlq   $7, %zmm11, %zmm9\n"                      \
-                   "vpsllq   $1, %zmm10, %zmm10\n"                     \
-                   "vpsllq   $1, %zmm11, %zmm11\n"                     \
-                   "vpsubq   %zmm8, %zmm10, %zmm10\n"                  \
-                   "vpsubq   %zmm9, %zmm11, %zmm11\n"                  \
-                   "vpsllq   $1, %" VR0(r)", %" VR0(r) "\n"            \
-                   "vpsllq   $1, %" VR1(r)", %" VR1(r) "\n"            \
-                   "vpandq   %zmm10, %zmm14, %zmm10\n"                 \
-                   "vpandq   %zmm11, %zmm14, %zmm11\n"                 \
-                   "vpternlogd $0x6c,%zmm12, %zmm10, %" VR0(r) "\n"    \
-                   "vpternlogd $0x6c,%zmm12, %zmm11, %" VR1(r));       \
+                   "vpcmpb $1, %zmm23,     %" VR0(r)", %k1\n"          \
+                   "vpcmpb $1, %zmm23,     %" VR1(r)", %k2\n"          \
+                   "vpaddb     %" VR0(r)", %" VR0(r)", %" VR0(r) "\n"  \
+                   "vpaddb     %" VR1(r)", %" VR1(r)", %" VR1(r) "\n"  \
+                   "vpxord     %zmm22,     %" VR0(r)", %zmm12\n"       \
+                   "vpxord     %zmm22,     %" VR1(r)", %zmm13\n"       \
+                   "vmovdqu8   %zmm12,     %" VR0(r) "{%k1}\n"         \
+                   "vmovdqu8   %zmm13,     %" VR1(r) "{%k2}");         \
                break;                                                  \
        default:                                                        \
                ASM_BUG();                                              \
@@ -276,7 +230,7 @@ typedef struct v {
 #define        _ta             "zmm10"
 #define        _tb             "zmm15"
 
-static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F;
+static const uint8_t __attribute__((aligned(64))) _mul_mask = 0x0F;
 
 #define        _MULx2(c, r...)                                                 \
 {                                                                      \
@@ -339,11 +293,15 @@ static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F;
 }
 
 #define        raidz_math_begin()      kfpu_begin()
-#define        raidz_math_end()                                                \
-{                                                                      \
-       FLUSH();                                                        \
-       kfpu_end();                                                     \
-}
+#define        raidz_math_end()        kfpu_end()
+
+/*
+ * ZERO, COPY, and MUL operations are already 2x unrolled, which means that
+ * the stride of these operations for avx512 must not exceed 4. Otherwise, a
+ * single step would exceed 512B block size.
+ */
+
+#define        SYN_STRIDE              4
 
 #define        ZERO_STRIDE             4
 #define        ZERO_DEFINE()           {}
@@ -355,65 +313,73 @@ static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F;
 
 #define        ADD_STRIDE              4
 #define        ADD_DEFINE()            {}
-#define        ADD_D                   0, 1, 2, 3
+#define        ADD_D                   0, 1, 2, 3
 
 #define        MUL_STRIDE              4
-#define        MUL_DEFINE()            {}
+#define        MUL_DEFINE()            {}
 #define        MUL_D                   0, 1, 2, 3
 
-#define        GEN_P_DEFINE()          {}
 #define        GEN_P_STRIDE            4
+#define        GEN_P_DEFINE()          {}
 #define        GEN_P_P                 0, 1, 2, 3
 
-#define        GEN_PQ_DEFINE()         {}
 #define        GEN_PQ_STRIDE           4
+#define        GEN_PQ_DEFINE()         {}
 #define        GEN_PQ_D                0, 1, 2, 3
-#define        GEN_PQ_P                4, 5, 6, 7
-#define        GEN_PQ_Q                20, 21, 22, 23
+#define        GEN_PQ_C                4, 5, 6, 7
 
+#define        GEN_PQR_STRIDE          4
 #define        GEN_PQR_DEFINE()        {}
-#define        GEN_PQR_STRIDE          2
-#define        GEN_PQR_D               0, 1
-#define        GEN_PQR_P               2, 3
-#define        GEN_PQR_Q               4, 5
-#define        GEN_PQR_R               6, 7
+#define        GEN_PQR_D               0, 1, 2, 3
+#define        GEN_PQR_C               4, 5, 6, 7
 
-#define        REC_P_DEFINE()          {}
-#define        REC_P_STRIDE            4
-#define        REC_P_X                 0, 1, 2, 3
+#define        SYN_Q_DEFINE()          {}
+#define        SYN_Q_D                 0, 1, 2, 3
+#define        SYN_Q_X                 4, 5, 6, 7
 
-#define        REC_Q_DEFINE()          {}
-#define        REC_Q_STRIDE            4
-#define        REC_Q_X                 0, 1, 2, 3
+#define        SYN_R_DEFINE()          {}
+#define        SYN_R_D                 0, 1, 2, 3
+#define        SYN_R_X                 4, 5, 6, 7
 
-#define        REC_R_DEFINE()          {}
-#define        REC_R_STRIDE            4
-#define        REC_R_X                 0, 1, 2, 3
+#define        SYN_PQ_DEFINE()         {}
+#define        SYN_PQ_D                0, 1, 2, 3
+#define        SYN_PQ_X                4, 5, 6, 7
 
+#define        REC_PQ_STRIDE           2
 #define        REC_PQ_DEFINE()         {}
-#define        REC_PQ_STRIDE           4
-#define        REC_PQ_X                0, 1, 2, 3
-#define        REC_PQ_Y                4, 5, 6, 7
-#define        REC_PQ_D                20, 21, 22, 23
+#define        REC_PQ_X                0, 1
+#define        REC_PQ_Y                2, 3
+#define        REC_PQ_T                4, 5
+
+#define        SYN_PR_DEFINE()         {}
+#define        SYN_PR_D                0, 1, 2, 3
+#define        SYN_PR_X                4, 5, 6, 7
 
+#define        REC_PR_STRIDE           2
 #define        REC_PR_DEFINE()         {}
-#define        REC_PR_STRIDE           4
-#define        REC_PR_X                0, 1, 2, 3
-#define        REC_PR_Y                4, 5, 6, 7
-#define        REC_PR_D                20, 21, 22, 23
+#define        REC_PR_X                0, 1
+#define        REC_PR_Y                2, 3
+#define        REC_PR_T                4, 5
 
+#define        SYN_QR_DEFINE()         {}
+#define        SYN_QR_D                0, 1, 2, 3
+#define        SYN_QR_X                4, 5, 6, 7
+
+#define        REC_QR_STRIDE           2
 #define        REC_QR_DEFINE()         {}
-#define        REC_QR_STRIDE           4
-#define        REC_QR_X                0, 1, 2, 3
-#define        REC_QR_Y                4, 5, 6, 7
-#define        REC_QR_D                20, 21, 22, 23
+#define        REC_QR_X                0, 1
+#define        REC_QR_Y                2, 3
+#define        REC_QR_T                4, 5
+
+#define        SYN_PQR_DEFINE()        {}
+#define        SYN_PQR_D               0, 1, 2, 3
+#define        SYN_PQR_X               4, 5, 6, 7
 
-#define        REC_PQR_DEFINE()        {}
 #define        REC_PQR_STRIDE          2
+#define        REC_PQR_DEFINE()        {}
 #define        REC_PQR_X               0, 1
 #define        REC_PQR_Y               2, 3
 #define        REC_PQR_Z               4, 5
-#define        REC_PQR_D               6, 7
 #define        REC_PQR_XS              6, 7
 #define        REC_PQR_YS              8, 9
 
@@ -428,8 +394,8 @@ static boolean_t
 raidz_will_avx512bw_work(void)
 {
        return (zfs_avx_available() &&
-               zfs_avx512f_available() &&
-               zfs_avx512bw_available());
+           zfs_avx512f_available() &&
+           zfs_avx512bw_available());
 }
 
 const raidz_impl_ops_t vdev_raidz_avx512bw_impl = {