From 01017962908392b83764f29b21e687844636eed6 Mon Sep 17 00:00:00 2001
From: Gvozden Neskovic
Date: Fri, 16 Dec 2016 02:31:33 +0100
Subject: [PATCH] ABD: Adapt avx512bw raidz assembly

Adapt avx512bw implementation for use with abd buffers. Mul2
implementation is rewritten to take advantage of the BW instruction set.

Reviewed-by: Brian Behlendorf
Reviewed-by: Romain Dolbeau
Signed-off-by: Gvozden Neskovic
Closes #5477
---
 module/zfs/vdev_raidz_math.c          |   2 +-
 module/zfs/vdev_raidz_math_avx512bw.c | 190 +++++++++++---------------
 2 files changed, 79 insertions(+), 113 deletions(-)

diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c
index 85dd15cc3..a175bcf77 100644
--- a/module/zfs/vdev_raidz_math.c
+++ b/module/zfs/vdev_raidz_math.c
@@ -61,7 +61,7 @@ const raidz_impl_ops_t *raidz_all_maths[] = {
 	&vdev_raidz_avx512f_impl,
 #endif
 #if defined(__x86_64) && defined(HAVE_AVX512BW)	/* only x86_64 for now */
-	// &vdev_raidz_avx512bw_impl,
+	&vdev_raidz_avx512bw_impl,
 #endif
 #if defined(__aarch64__)
 	&vdev_raidz_aarch64_neon_impl,
diff --git a/module/zfs/vdev_raidz_math_avx512bw.c b/module/zfs/vdev_raidz_math_avx512bw.c
index 33b2d388f..3d5326b9e 100644
--- a/module/zfs/vdev_raidz_math_avx512bw.c
+++ b/module/zfs/vdev_raidz_math_avx512bw.c
@@ -20,11 +20,12 @@
  */
 /*
  * Copyright (C) 2016 Romain Dolbeau. All rights reserved.
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
  */
 
 #include
 
-#if 0 // defined(__x86_64) && defined(HAVE_AVX512BW)
+#if defined(__x86_64) && defined(HAVE_AVX512BW)
 
 #include
 #include
@@ -66,20 +67,6 @@ typedef struct v {
 	uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
 } v_t;
 
-#define PREFETCHNTA(ptr, offset) \
-{ \
-	__asm( \
-	    "prefetchnta " #offset "(%[MEM])\n" \
-	    : : [MEM] "r" (ptr)); \
-}
-
-#define PREFETCH(ptr, offset) \
-{ \
-	__asm( \
-	    "prefetcht0 " #offset "(%[MEM])\n" \
-	    : : [MEM] "r" (ptr)); \
-}
-
 #define XOR_ACC(src, r...) \
 { \
 	switch (REG_CNT(r)) { \
@@ -122,25 +109,7 @@ typedef struct v {
 	} \
 }
 
-#define ZERO(r...) \
-{ \
-	switch (REG_CNT(r)) { \
-	case 4: \
-		__asm( \
-		    "vpxorq %" VR0(r) ", %" VR0(r)", %" VR0(r) "\n" \
-		    "vpxorq %" VR1(r) ", %" VR1(r)", %" VR1(r) "\n" \
-		    "vpxorq %" VR2(r) ", %" VR2(r)", %" VR2(r) "\n" \
-		    "vpxorq %" VR3(r) ", %" VR3(r)", %" VR3(r)); \
-		break; \
-	case 2: \
-		__asm( \
-		    "vpxorq %" VR0(r) ", %" VR0(r)", %" VR0(r) "\n" \
-		    "vpxorq %" VR1(r) ", %" VR1(r)", %" VR1(r)); \
-		break; \
-	default: \
-		ASM_BUG(); \
-	} \
-}
+#define ZERO(r...) XOR(r, r)
 
 #define COPY(r...) \
 { \
@@ -206,20 +175,11 @@ typedef struct v {
 	} \
 }
 
-#define FLUSH() \
-{ \
-	__asm("vzeroupper"); \
-}
-
-#define MUL2_SETUP() \
-{ \
-	__asm("vmovq %0, %%xmm14" :: "r"(0x1d1d1d1d1d1d1d1d)); \
-	__asm("vpbroadcastq %xmm14, %zmm14"); \
-	__asm("vmovq %0, %%xmm13" :: "r"(0x8080808080808080)); \
-	__asm("vpbroadcastq %xmm13, %zmm13"); \
-	__asm("vmovq %0, %%xmm12" :: "r"(0xfefefefefefefefe)); \
-	__asm("vpbroadcastq %xmm12, %zmm12"); \
-	__asm("vpxorq %zmm15, %zmm15 ,%zmm15"); \
+#define MUL2_SETUP() \
+{ \
+	__asm("vmovq %0, %%xmm22" :: "r"(0x1d1d1d1d1d1d1d1d)); \
+	__asm("vpbroadcastq %xmm22, %zmm22"); \
+	__asm("vpxord %zmm23, %zmm23 ,%zmm23"); \
 }
 
 #define _MUL2(r...) \
@@ -227,20 +187,14 @@ typedef struct v {
 	switch (REG_CNT(r)) { \
 	case 2: \
 		__asm( \
-		    "vpandq %" VR0(r)", %zmm13, %zmm10\n" \
-		    "vpandq %" VR1(r)", %zmm13, %zmm11\n" \
-		    "vpsrlq $7, %zmm10, %zmm8\n" \
-		    "vpsrlq $7, %zmm11, %zmm9\n" \
-		    "vpsllq $1, %zmm10, %zmm10\n" \
-		    "vpsllq $1, %zmm11, %zmm11\n" \
-		    "vpsubq %zmm8, %zmm10, %zmm10\n" \
-		    "vpsubq %zmm9, %zmm11, %zmm11\n" \
-		    "vpsllq $1, %" VR0(r)", %" VR0(r) "\n" \
-		    "vpsllq $1, %" VR1(r)", %" VR1(r) "\n" \
-		    "vpandq %zmm10, %zmm14, %zmm10\n" \
-		    "vpandq %zmm11, %zmm14, %zmm11\n" \
-		    "vpternlogd $0x6c,%zmm12, %zmm10, %" VR0(r) "\n" \
-		    "vpternlogd $0x6c,%zmm12, %zmm11, %" VR1(r)); \
+		    "vpcmpb $1, %zmm23, %" VR0(r)", %k1\n" \
+		    "vpcmpb $1, %zmm23, %" VR1(r)", %k2\n" \
+		    "vpaddb %" VR0(r)", %" VR0(r)", %" VR0(r) "\n" \
+		    "vpaddb %" VR1(r)", %" VR1(r)", %" VR1(r) "\n" \
+		    "vpxord %zmm22, %" VR0(r)", %zmm12\n" \
+		    "vpxord %zmm22, %" VR1(r)", %zmm13\n" \
+		    "vmovdqu8 %zmm12, %" VR0(r) "{%k1}\n" \
+		    "vmovdqu8 %zmm13, %" VR1(r) "{%k2}"); \
 		break; \
 	default: \
 		ASM_BUG(); \
 	} \
 }
@@ -276,7 +230,7 @@ typedef struct v {
 #define _ta "zmm10"
 #define _tb "zmm15"
 
-static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F;
+static const uint8_t __attribute__((aligned(64))) _mul_mask = 0x0F;
 
 #define _MULx2(c, r...) \
 { \
@@ -339,11 +293,15 @@ static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F;
 }
 
 #define raidz_math_begin() kfpu_begin()
-#define raidz_math_end() \
-{ \
-	FLUSH(); \
-	kfpu_end(); \
-}
+#define raidz_math_end() kfpu_end()
+
+/*
+ * ZERO, COPY, and MUL operations are already 2x unrolled, which means that
+ * the stride of these operations for avx512 must not exceed 4. Otherwise, a
+ * single step would exceed 512B block size.
+ */
+
+#define SYN_STRIDE 4
 
 #define ZERO_STRIDE 4
 #define ZERO_DEFINE() {}
@@ -361,59 +319,67 @@ static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F;
 #define MUL_DEFINE() {}
 #define MUL_D 0, 1, 2, 3
 
-#define GEN_P_DEFINE() {}
 #define GEN_P_STRIDE 4
+#define GEN_P_DEFINE() {}
 #define GEN_P_P 0, 1, 2, 3
 
-#define GEN_PQ_DEFINE() {}
 #define GEN_PQ_STRIDE 4
+#define GEN_PQ_DEFINE() {}
 #define GEN_PQ_D 0, 1, 2, 3
-#define GEN_PQ_P 4, 5, 6, 7
-#define GEN_PQ_Q 20, 21, 22, 23
-
-#define GEN_PQR_DEFINE() {}
-#define GEN_PQR_STRIDE 2
-#define GEN_PQR_D 0, 1
-#define GEN_PQR_P 2, 3
-#define GEN_PQR_Q 4, 5
-#define GEN_PQR_R 6, 7
-
-#define REC_P_DEFINE() {}
-#define REC_P_STRIDE 4
-#define REC_P_X 0, 1, 2, 3
-
-#define REC_Q_DEFINE() {}
-#define REC_Q_STRIDE 4
-#define REC_Q_X 0, 1, 2, 3
-
-#define REC_R_DEFINE() {}
-#define REC_R_STRIDE 4
-#define REC_R_X 0, 1, 2, 3
-
-#define REC_PQ_DEFINE() {}
-#define REC_PQ_STRIDE 4
-#define REC_PQ_X 0, 1, 2, 3
-#define REC_PQ_Y 4, 5, 6, 7
-#define REC_PQ_D 20, 21, 22, 23
-
-#define REC_PR_DEFINE() {}
-#define REC_PR_STRIDE 4
-#define REC_PR_X 0, 1, 2, 3
-#define REC_PR_Y 4, 5, 6, 7
-#define REC_PR_D 20, 21, 22, 23
-
-#define REC_QR_DEFINE() {}
-#define REC_QR_STRIDE 4
-#define REC_QR_X 0, 1, 2, 3
-#define REC_QR_Y 4, 5, 6, 7
-#define REC_QR_D 20, 21, 22, 23
-
-#define REC_PQR_DEFINE() {}
+#define GEN_PQ_C 4, 5, 6, 7
+
+#define GEN_PQR_STRIDE 4
+#define GEN_PQR_DEFINE() {}
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
+
+#define SYN_Q_DEFINE() {}
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
+
+#define SYN_R_DEFINE() {}
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
+
+#define SYN_PQ_DEFINE() {}
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
+
+#define REC_PQ_STRIDE 2
+#define REC_PQ_DEFINE() {}
+#define REC_PQ_X 0, 1
+#define REC_PQ_Y 2, 3
+#define REC_PQ_T 4, 5
+
+#define SYN_PR_DEFINE() {}
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
+
+#define REC_PR_STRIDE 2
+#define REC_PR_DEFINE() {}
+#define REC_PR_X 0, 1
+#define REC_PR_Y 2, 3
+#define REC_PR_T 4, 5
+
+#define SYN_QR_DEFINE() {}
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
+
+#define REC_QR_STRIDE 2
+#define REC_QR_DEFINE() {}
+#define REC_QR_X 0, 1
+#define REC_QR_Y 2, 3
+#define REC_QR_T 4, 5
+
+#define SYN_PQR_DEFINE() {}
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+
 #define REC_PQR_STRIDE 2
+#define REC_PQR_DEFINE() {}
 #define REC_PQR_X 0, 1
 #define REC_PQR_Y 2, 3
 #define REC_PQR_Z 4, 5
-#define REC_PQR_D 6, 7
 #define REC_PQR_XS 6, 7
 #define REC_PQR_YS 8, 9
-- 
2.40.0
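
For readers less familiar with AVX-512BW, the rewritten _MUL2 block above does
the GF(2^8) multiply-by-2 at byte granularity: "vpcmpb $1" against the zero
register (%zmm23) builds a k-mask of bytes whose high bit is set, "vpaddb"
doubles every byte, "vpxord" with the broadcast 0x1d constant (%zmm22) prepares
the polynomial-reduced values, and a merge-masked "vmovdqu8" writes the reduced
bytes back only where the mask is set. Below is a minimal C intrinsics sketch
of the same idea; it is illustrative only, the function name gf256_mul2_zmm is
made up here and is not part of the patch, which uses the inline assembly shown
in the diff.

#include <immintrin.h>

/*
 * Illustrative sketch: multiply 64 GF(2^8) elements by 2 in one zmm register,
 * mirroring the vpcmpb/vpaddb/vpxord/vmovdqu8 sequence of the new _MUL2 macro.
 * Requires AVX-512BW (e.g. compile with -mavx512bw).
 */
static inline __m512i
gf256_mul2_zmm(__m512i x)
{
	const __m512i poly = _mm512_set1_epi8(0x1d);	/* role of %zmm22 */
	const __m512i zero = _mm512_setzero_si512();	/* role of %zmm23 */

	/* k set for bytes with the high bit set (signed x < 0), as vpcmpb $1 */
	__mmask64 k = _mm512_cmplt_epi8_mask(x, zero);

	/* vpaddb x, x, x: shift every byte left by one without lane crossing */
	__m512i dbl = _mm512_add_epi8(x, x);

	/* vpxord: candidate values reduced by the field polynomial */
	__m512i red = _mm512_xor_si512(dbl, poly);

	/* merge-masked vmovdqu8: keep the reduced byte only where k is set */
	return (_mm512_mask_mov_epi8(dbl, k, red));
}

Per byte b this matches the scalar definition (b << 1) ^ (b & 0x80 ? 0x1d : 0),
which is the same result the replaced vpsllq/vpandq/vpternlogd sequence had to
assemble from 64-bit shifts plus explicit high-bit masking.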