*/
/*
* Copyright (C) 2016 Romain Dolbeau. All rights reserved.
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
*/
#include <sys/isa_defs.h>
-#if 0 // defined(__x86_64) && defined(HAVE_AVX512BW)
+#if defined(__x86_64) && defined(HAVE_AVX512BW)
#include <sys/types.h>
#include <linux/simd_x86.h>
uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
} v_t;
-#define PREFETCHNTA(ptr, offset) \
-{ \
- __asm( \
- "prefetchnta " #offset "(%[MEM])\n" \
- : : [MEM] "r" (ptr)); \
-}
-
-#define PREFETCH(ptr, offset) \
-{ \
- __asm( \
- "prefetcht0 " #offset "(%[MEM])\n" \
- : : [MEM] "r" (ptr)); \
-}
-
#define XOR_ACC(src, r...) \
{ \
switch (REG_CNT(r)) { \
} \
}
-#define ZERO(r...) \
-{ \
- switch (REG_CNT(r)) { \
- case 4: \
- __asm( \
- "vpxorq %" VR0(r) ", %" VR0(r)", %" VR0(r) "\n" \
- "vpxorq %" VR1(r) ", %" VR1(r)", %" VR1(r) "\n" \
- "vpxorq %" VR2(r) ", %" VR2(r)", %" VR2(r) "\n" \
- "vpxorq %" VR3(r) ", %" VR3(r)", %" VR3(r)); \
- break; \
- case 2: \
- __asm( \
- "vpxorq %" VR0(r) ", %" VR0(r)", %" VR0(r) "\n" \
- "vpxorq %" VR1(r) ", %" VR1(r)", %" VR1(r)); \
- break; \
- default: \
- ASM_BUG(); \
- } \
-}
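+/* Zeroing a set of registers is just XOR-ing each register with itself. */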
+#define ZERO(r...) XOR(r, r)
#define COPY(r...) \
{ \
} \
}
-#define FLUSH() \
-{ \
- __asm("vzeroupper"); \
-}
-
-#define MUL2_SETUP() \
-{ \
- __asm("vmovq %0, %%xmm14" :: "r"(0x1d1d1d1d1d1d1d1d)); \
- __asm("vpbroadcastq %xmm14, %zmm14"); \
- __asm("vmovq %0, %%xmm13" :: "r"(0x8080808080808080)); \
- __asm("vpbroadcastq %xmm13, %zmm13"); \
- __asm("vmovq %0, %%xmm12" :: "r"(0xfefefefefefefefe)); \
- __asm("vpbroadcastq %xmm12, %zmm12"); \
- __asm("vpxorq %zmm15, %zmm15 ,%zmm15"); \
+#define MUL2_SETUP() \
+{ \
+ __asm("vmovq %0, %%xmm22" :: "r"(0x1d1d1d1d1d1d1d1d)); \
+ __asm("vpbroadcastq %xmm22, %zmm22"); \
+ __asm("vpxord %zmm23, %zmm23 ,%zmm23"); \
}
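+/*
+ * GF(2^8) multiply-by-two: vpcmpb $1 (signed less-than zero) flags bytes
+ * whose high bit is set, vpaddb doubles every byte, and the merge-masked
+ * vmovdqu8 stores the 0x1d-reduced copy only into the flagged bytes.
+ */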
#define _MUL2(r...) \
switch (REG_CNT(r)) { \
case 2: \
__asm( \
- "vpandq %" VR0(r)", %zmm13, %zmm10\n" \
- "vpandq %" VR1(r)", %zmm13, %zmm11\n" \
- "vpsrlq $7, %zmm10, %zmm8\n" \
- "vpsrlq $7, %zmm11, %zmm9\n" \
- "vpsllq $1, %zmm10, %zmm10\n" \
- "vpsllq $1, %zmm11, %zmm11\n" \
- "vpsubq %zmm8, %zmm10, %zmm10\n" \
- "vpsubq %zmm9, %zmm11, %zmm11\n" \
- "vpsllq $1, %" VR0(r)", %" VR0(r) "\n" \
- "vpsllq $1, %" VR1(r)", %" VR1(r) "\n" \
- "vpandq %zmm10, %zmm14, %zmm10\n" \
- "vpandq %zmm11, %zmm14, %zmm11\n" \
- "vpternlogd $0x6c,%zmm12, %zmm10, %" VR0(r) "\n" \
- "vpternlogd $0x6c,%zmm12, %zmm11, %" VR1(r)); \
+ "vpcmpb $1, %zmm23, %" VR0(r)", %k1\n" \
+ "vpcmpb $1, %zmm23, %" VR1(r)", %k2\n" \
+ "vpaddb %" VR0(r)", %" VR0(r)", %" VR0(r) "\n" \
+ "vpaddb %" VR1(r)", %" VR1(r)", %" VR1(r) "\n" \
+ "vpxord %zmm22, %" VR0(r)", %zmm12\n" \
+ "vpxord %zmm22, %" VR1(r)", %zmm13\n" \
+ "vmovdqu8 %zmm12, %" VR0(r) "{%k1}\n" \
+ "vmovdqu8 %zmm13, %" VR1(r) "{%k2}"); \
break; \
default: \
ASM_BUG(); \
#define _ta "zmm10"
#define _tb "zmm15"
-static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F;
+static const uint8_t __attribute__((aligned(64))) _mul_mask = 0x0F;
#define _MULx2(c, r...) \
{ \
}
#define raidz_math_begin() kfpu_begin()
-#define raidz_math_end() \
-{ \
- FLUSH(); \
- kfpu_end(); \
-}
+#define raidz_math_end() kfpu_end()
+
+/*
+ * ZERO, COPY, and MUL operations are already 2x unrolled, which means that
+ * the stride of these operations for avx512 must not exceed 4. Otherwise, a
+ * single step would exceed the 512B block size.
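+ * (stride 4 x 2-way unroll x 64 B per zmm register = 512 B per step).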
+ */
+
+#define SYN_STRIDE 4
#define ZERO_STRIDE 4
#define ZERO_DEFINE() {}
#define MUL_DEFINE() {}
#define MUL_D 0, 1, 2, 3
-#define GEN_P_DEFINE() {}
#define GEN_P_STRIDE 4
+#define GEN_P_DEFINE() {}
#define GEN_P_P 0, 1, 2, 3
-#define GEN_PQ_DEFINE() {}
#define GEN_PQ_STRIDE 4
+#define GEN_PQ_DEFINE() {}
#define GEN_PQ_D 0, 1, 2, 3
-#define GEN_PQ_P 4, 5, 6, 7
-#define GEN_PQ_Q 20, 21, 22, 23
-
-#define GEN_PQR_DEFINE() {}
-#define GEN_PQR_STRIDE 2
-#define GEN_PQR_D 0, 1
-#define GEN_PQR_P 2, 3
-#define GEN_PQR_Q 4, 5
-#define GEN_PQR_R 6, 7
-
-#define REC_P_DEFINE() {}
-#define REC_P_STRIDE 4
-#define REC_P_X 0, 1, 2, 3
-
-#define REC_Q_DEFINE() {}
-#define REC_Q_STRIDE 4
-#define REC_Q_X 0, 1, 2, 3
-
-#define REC_R_DEFINE() {}
-#define REC_R_STRIDE 4
-#define REC_R_X 0, 1, 2, 3
-
-#define REC_PQ_DEFINE() {}
-#define REC_PQ_STRIDE 4
-#define REC_PQ_X 0, 1, 2, 3
-#define REC_PQ_Y 4, 5, 6, 7
-#define REC_PQ_D 20, 21, 22, 23
-
-#define REC_PR_DEFINE() {}
-#define REC_PR_STRIDE 4
-#define REC_PR_X 0, 1, 2, 3
-#define REC_PR_Y 4, 5, 6, 7
-#define REC_PR_D 20, 21, 22, 23
-
-#define REC_QR_DEFINE() {}
-#define REC_QR_STRIDE 4
-#define REC_QR_X 0, 1, 2, 3
-#define REC_QR_Y 4, 5, 6, 7
-#define REC_QR_D 20, 21, 22, 23
-
-#define REC_PQR_DEFINE() {}
+#define GEN_PQ_C 4, 5, 6, 7
+
+#define GEN_PQR_STRIDE 4
+#define GEN_PQR_DEFINE() {}
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
+
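+/*
+ * Register assignments for the syndrome (SYN_*) and reconstruction
+ * (REC_*) kernels.
+ */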
+#define SYN_Q_DEFINE() {}
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
+
+#define SYN_R_DEFINE() {}
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
+
+#define SYN_PQ_DEFINE() {}
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
+
+#define REC_PQ_STRIDE 2
+#define REC_PQ_DEFINE() {}
+#define REC_PQ_X 0, 1
+#define REC_PQ_Y 2, 3
+#define REC_PQ_T 4, 5
+
+#define SYN_PR_DEFINE() {}
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
+
+#define REC_PR_STRIDE 2
+#define REC_PR_DEFINE() {}
+#define REC_PR_X 0, 1
+#define REC_PR_Y 2, 3
+#define REC_PR_T 4, 5
+
+#define SYN_QR_DEFINE() {}
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
+
+#define REC_QR_STRIDE 2
+#define REC_QR_DEFINE() {}
+#define REC_QR_X 0, 1
+#define REC_QR_Y 2, 3
+#define REC_QR_T 4, 5
+
+#define SYN_PQR_DEFINE() {}
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+
#define REC_PQR_STRIDE 2
+#define REC_PQR_DEFINE() {}
#define REC_PQR_X 0, 1
#define REC_PQR_Y 2, 3
#define REC_PQR_Z 4, 5
-#define REC_PQR_D 6, 7
#define REC_PQR_XS 6, 7
#define REC_PQR_YS 8, 9