*/
/*
* Copyright (C) 2016 Romain Dolbeau. All rights reserved.
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
*/
#include <sys/isa_defs.h>
-#if 0 // defined(__x86_64) && defined(HAVE_AVX512BW)
+#if defined(__x86_64) && defined(HAVE_AVX512BW)
#include <sys/types.h>
#include <linux/simd_x86.h>
uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
} v_t;
-#define PREFETCHNTA(ptr, offset) \
-{ \
- __asm( \
- "prefetchnta " #offset "(%[MEM])\n" \
- : : [MEM] "r" (ptr)); \
-}
-
-#define PREFETCH(ptr, offset) \
-{ \
- __asm( \
- "prefetcht0 " #offset "(%[MEM])\n" \
- : : [MEM] "r" (ptr)); \
-}
-
#define XOR_ACC(src, r...) \
{ \
switch (REG_CNT(r)) { \
} \
}
-#define ZERO(r...) \
-{ \
- switch (REG_CNT(r)) { \
- case 4: \
- __asm( \
- "vpxorq %" VR0(r) ", %" VR0(r)", %" VR0(r) "\n" \
- "vpxorq %" VR1(r) ", %" VR1(r)", %" VR1(r) "\n" \
- "vpxorq %" VR2(r) ", %" VR2(r)", %" VR2(r) "\n" \
- "vpxorq %" VR3(r) ", %" VR3(r)", %" VR3(r)); \
- break; \
- case 2: \
- __asm( \
- "vpxorq %" VR0(r) ", %" VR0(r)", %" VR0(r) "\n" \
- "vpxorq %" VR1(r) ", %" VR1(r)", %" VR1(r)); \
- break; \
- default: \
- ASM_BUG(); \
- } \
-}
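+/* Zeroing a set of registers is just XOR-ing each register with itself. */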
+#define ZERO(r...) XOR(r, r)
#define COPY(r...) \
{ \
} \
}
-#define FLUSH() \
-{ \
- __asm("vzeroupper"); \
-}
-
-#define MUL2_SETUP() \
-{ \
- __asm("vmovq %0, %%xmm14" :: "r"(0x1d1d1d1d1d1d1d1d)); \
- __asm("vpbroadcastq %xmm14, %zmm14"); \
- __asm("vmovq %0, %%xmm13" :: "r"(0x8080808080808080)); \
- __asm("vpbroadcastq %xmm13, %zmm13"); \
- __asm("vmovq %0, %%xmm12" :: "r"(0xfefefefefefefefe)); \
- __asm("vpbroadcastq %xmm12, %zmm12"); \
- __asm("vpxorq %zmm15, %zmm15 ,%zmm15"); \
+#define MUL2_SETUP() \
+{ \
+ __asm("vmovq %0, %%xmm22" :: "r"(0x1d1d1d1d1d1d1d1d)); \
+ __asm("vpbroadcastq %xmm22, %zmm22"); \
+ __asm("vpxord %zmm23, %zmm23 ,%zmm23"); \
}
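+/*
+ * GF(2^8) multiply-by-two: vpcmpb $1 (signed less-than zero) flags bytes
+ * whose high bit is set, vpaddb doubles every byte, and the merge-masked
+ * vmovdqu8 stores the 0x1d-reduced copy only into the flagged bytes.
+ */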
#define _MUL2(r...) \
switch (REG_CNT(r)) { \
case 2: \
__asm( \
- "vpandq %" VR0(r)", %zmm13, %zmm10\n" \
- "vpandq %" VR1(r)", %zmm13, %zmm11\n" \
- "vpsrlq $7, %zmm10, %zmm8\n" \
- "vpsrlq $7, %zmm11, %zmm9\n" \
- "vpsllq $1, %zmm10, %zmm10\n" \
- "vpsllq $1, %zmm11, %zmm11\n" \
- "vpsubq %zmm8, %zmm10, %zmm10\n" \
- "vpsubq %zmm9, %zmm11, %zmm11\n" \
- "vpsllq $1, %" VR0(r)", %" VR0(r) "\n" \
- "vpsllq $1, %" VR1(r)", %" VR1(r) "\n" \
- "vpandq %zmm10, %zmm14, %zmm10\n" \
- "vpandq %zmm11, %zmm14, %zmm11\n" \
- "vpternlogd $0x6c,%zmm12, %zmm10, %" VR0(r) "\n" \
- "vpternlogd $0x6c,%zmm12, %zmm11, %" VR1(r)); \
+ "vpcmpb $1, %zmm23, %" VR0(r)", %k1\n" \
+ "vpcmpb $1, %zmm23, %" VR1(r)", %k2\n" \
+ "vpaddb %" VR0(r)", %" VR0(r)", %" VR0(r) "\n" \
+ "vpaddb %" VR1(r)", %" VR1(r)", %" VR1(r) "\n" \
+ "vpxord %zmm22, %" VR0(r)", %zmm12\n" \
+ "vpxord %zmm22, %" VR1(r)", %zmm13\n" \
+ "vmovdqu8 %zmm12, %" VR0(r) "{%k1}\n" \
+ "vmovdqu8 %zmm13, %" VR1(r) "{%k2}"); \
break; \
default: \
ASM_BUG(); \
#define _ta "zmm10"
#define _tb "zmm15"
-static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F;
+static const uint8_t __attribute__((aligned(64))) _mul_mask = 0x0F;
#define _MULx2(c, r...) \
{ \
}
#define raidz_math_begin() kfpu_begin()
-#define raidz_math_end() \
-{ \
- FLUSH(); \
- kfpu_end(); \
-}
+#define raidz_math_end() kfpu_end()
+
+/*
+ * ZERO, COPY, and MUL operations are already 2x unrolled, which means that
+ * the stride of these operations for avx512 must not exceed 4. Otherwise, a
+ * single step would exceed the 512B block size.
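+ * (stride 4 x 2-way unroll x 64 B per zmm register = 512 B per step).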
+ */
+
+#define SYN_STRIDE 4
#define ZERO_STRIDE 4
#define ZERO_DEFINE() {}
#define MUL_DEFINE() {}
#define MUL_D 0, 1, 2, 3
-#define GEN_P_DEFINE() {}
#define GEN_P_STRIDE 4
+#define GEN_P_DEFINE() {}
#define GEN_P_P 0, 1, 2, 3
-#define GEN_PQ_DEFINE() {}
#define GEN_PQ_STRIDE 4
+#define GEN_PQ_DEFINE() {}
#define GEN_PQ_D 0, 1, 2, 3
-#define GEN_PQ_P 4, 5, 6, 7
-#define GEN_PQ_Q 20, 21, 22, 23
-
-#define GEN_PQR_DEFINE() {}
-#define GEN_PQR_STRIDE 2
-#define GEN_PQR_D 0, 1
-#define GEN_PQR_P 2, 3
-#define GEN_PQR_Q 4, 5
-#define GEN_PQR_R 6, 7
-
-#define REC_P_DEFINE() {}
-#define REC_P_STRIDE 4
-#define REC_P_X 0, 1, 2, 3
-
-#define REC_Q_DEFINE() {}
-#define REC_Q_STRIDE 4
-#define REC_Q_X 0, 1, 2, 3
-
-#define REC_R_DEFINE() {}
-#define REC_R_STRIDE 4
-#define REC_R_X 0, 1, 2, 3
-
-#define REC_PQ_DEFINE() {}
-#define REC_PQ_STRIDE 4
-#define REC_PQ_X 0, 1, 2, 3
-#define REC_PQ_Y 4, 5, 6, 7
-#define REC_PQ_D 20, 21, 22, 23
-
-#define REC_PR_DEFINE() {}
-#define REC_PR_STRIDE 4
-#define REC_PR_X 0, 1, 2, 3
-#define REC_PR_Y 4, 5, 6, 7
-#define REC_PR_D 20, 21, 22, 23
-
-#define REC_QR_DEFINE() {}
-#define REC_QR_STRIDE 4
-#define REC_QR_X 0, 1, 2, 3
-#define REC_QR_Y 4, 5, 6, 7
-#define REC_QR_D 20, 21, 22, 23
-
-#define REC_PQR_DEFINE() {}
+#define GEN_PQ_C 4, 5, 6, 7
+
+#define GEN_PQR_STRIDE 4
+#define GEN_PQR_DEFINE() {}
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
+
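+/*
+ * Register assignments for the syndrome (SYN_*) and reconstruction
+ * (REC_*) kernels.
+ */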
+#define SYN_Q_DEFINE() {}
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
+
+#define SYN_R_DEFINE() {}
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
+
+#define SYN_PQ_DEFINE() {}
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
+
+#define REC_PQ_STRIDE 2
+#define REC_PQ_DEFINE() {}
+#define REC_PQ_X 0, 1
+#define REC_PQ_Y 2, 3
+#define REC_PQ_T 4, 5
+
+#define SYN_PR_DEFINE() {}
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
+
+#define REC_PR_STRIDE 2
+#define REC_PR_DEFINE() {}
+#define REC_PR_X 0, 1
+#define REC_PR_Y 2, 3
+#define REC_PR_T 4, 5
+
+#define SYN_QR_DEFINE() {}
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
+
+#define REC_QR_STRIDE 2
+#define REC_QR_DEFINE() {}
+#define REC_QR_X 0, 1
+#define REC_QR_Y 2, 3
+#define REC_QR_T 4, 5
+
+#define SYN_PQR_DEFINE() {}
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+
#define REC_PQR_STRIDE 2
+#define REC_PQR_DEFINE() {}
#define REC_PQR_X 0, 1
#define REC_PQR_Y 2, 3
#define REC_PQR_Z 4, 5
-#define REC_PQR_D 6, 7
#define REC_PQR_XS 6, 7
#define REC_PQR_YS 8, 9