From 88cc2352eaf6bdd87be8349097b4a3784aeafc51 Mon Sep 17 00:00:00 2001 From: Romain Dolbeau Date: Tue, 22 Nov 2016 08:38:34 +0100 Subject: [PATCH] ABD raidz NEON support Port NEON implementation of RAID-Z functions to ABD. Signed-off-by: Romain Dolbeau --- module/zfs/vdev_raidz_math.c | 4 +- module/zfs/vdev_raidz_math_aarch64_neon.c | 134 +++++++++++---- .../zfs/vdev_raidz_math_aarch64_neon_common.h | 29 ++-- module/zfs/vdev_raidz_math_aarch64_neonx2.c | 156 +++++++++++++----- 4 files changed, 229 insertions(+), 94 deletions(-) diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c index 25d25bd27..c050c9099 100644 --- a/module/zfs/vdev_raidz_math.c +++ b/module/zfs/vdev_raidz_math.c @@ -64,8 +64,8 @@ const raidz_impl_ops_t *raidz_all_maths[] = { // &vdev_raidz_avx512bw_impl, #endif #if defined(__aarch64__) - // &vdev_raidz_aarch64_neon_impl, - // &vdev_raidz_aarch64_neonx2_impl, + &vdev_raidz_aarch64_neon_impl, + &vdev_raidz_aarch64_neonx2_impl, #endif }; diff --git a/module/zfs/vdev_raidz_math_aarch64_neon.c b/module/zfs/vdev_raidz_math_aarch64_neon.c index 7ba30ba5e..c7b8afd38 100644 --- a/module/zfs/vdev_raidz_math_aarch64_neon.c +++ b/module/zfs/vdev_raidz_math_aarch64_neon.c @@ -25,10 +25,36 @@ #include #include -#if 0 // defined(__aarch64__) +#if defined(__aarch64__) #include "vdev_raidz_math_aarch64_neon_common.h" +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 4 +#define ZERO_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define ZERO_D 0, 1, 2, 3 + +#define COPY_STRIDE 4 +#define COPY_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define COPY_D 0, 1, 2, 3 + +#define ADD_STRIDE 4 +#define ADD_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define ADD_D 0, 1, 2, 3 + +#define MUL_STRIDE 4 +#define MUL_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define MUL_D 0, 1, 2, 3 + #define GEN_P_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_33_36() @@ -39,15 +65,12 @@ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() 
\ GEN_X_DEFINE_6_7() \ - GEN_X_DEFINE_8_9() \ - GEN_X_DEFINE_10_11() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() #define GEN_PQ_STRIDE 4 #define GEN_PQ_D 0, 1, 2, 3 -#define GEN_PQ_P 4, 5, 6, 7 -#define GEN_PQ_Q 8, 9, 10, 11 +#define GEN_PQ_C 4, 5, 6, 7 #define GEN_PQR_DEFINE() \ GEN_X_DEFINE_0_3() \ @@ -55,69 +78,115 @@ GEN_X_DEFINE_6_7() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ - GEN_X_DEFINE_31() \ - GEN_X_DEFINE_32() \ GEN_X_DEFINE_33_36() -#define GEN_PQR_STRIDE 2 -#define GEN_PQR_D 0, 1 -#define GEN_PQR_P 2, 3 -#define GEN_PQR_Q 4, 5 -#define GEN_PQR_R 6, 7 +#define GEN_PQR_STRIDE 4 +#define GEN_PQR_D 0, 1, 2, 3 +#define GEN_PQR_C 4, 5, 6, 7 -#define REC_P_DEFINE() \ - GEN_X_DEFINE_0_3() \ +#define SYN_Q_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() -#define REC_P_STRIDE 4 -#define REC_P_X 0, 1, 2, 3 +#define SYN_Q_STRIDE 4 +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 -#define REC_Q_DEFINE() \ +#define SYN_R_DEFINE() \ GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() -#define REC_Q_STRIDE 4 -#define REC_Q_X 0, 1, 2, 3 +#define SYN_R_STRIDE 4 +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 -#define REC_R_DEFINE() \ +#define SYN_PQ_DEFINE() \ GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() -#define REC_R_STRIDE 4 -#define REC_R_X 0, 1, 2, 3 +#define SYN_PQ_STRIDE 4 +#define SYN_PQ_D 0, 1, 2, 3 +#define SYN_PQ_X 4, 5, 6, 7 #define REC_PQ_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ - GEN_X_DEFINE_16() \ - GEN_X_DEFINE_17() \ GEN_X_DEFINE_31() \ GEN_X_DEFINE_32() \ GEN_X_DEFINE_33_36() #define REC_PQ_STRIDE 2 #define REC_PQ_X 0, 1 #define REC_PQ_Y 2, 3 -#define REC_PQ_D 4, 5 +#define REC_PQ_T 4, 5 + +#define SYN_PR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + 
GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_PR_STRIDE 4 +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 -#define REC_PR_DEFINE() REC_PQ_DEFINE() +#define REC_PR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + GEN_X_DEFINE_33_36() #define REC_PR_STRIDE 2 #define REC_PR_X 0, 1 #define REC_PR_Y 2, 3 -#define REC_PR_D 4, 5 +#define REC_PR_T 4, 5 + +#define SYN_QR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_QR_STRIDE 4 +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 -#define REC_QR_DEFINE() REC_PQ_DEFINE() +#define REC_QR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + GEN_X_DEFINE_33_36() #define REC_QR_STRIDE 2 #define REC_QR_X 0, 1 #define REC_QR_Y 2, 3 -#define REC_QR_D 4, 5 +#define REC_QR_T 4, 5 -#define REC_PQR_DEFINE() \ +#define SYN_PQR_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() \ - GEN_X_DEFINE_8_9() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_PQR_STRIDE 4 +#define SYN_PQR_D 0, 1, 2, 3 +#define SYN_PQR_X 4, 5, 6, 7 + +#define REC_PQR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_8_9() \ GEN_X_DEFINE_31() \ GEN_X_DEFINE_32() \ GEN_X_DEFINE_33_36() @@ -125,7 +194,6 @@ #define REC_PQR_X 0, 1 #define REC_PQR_Y 2, 3 #define REC_PQR_Z 4, 5 -#define REC_PQR_D 6, 7 #define REC_PQR_XS 6, 7 #define REC_PQR_YS 8, 9 @@ -154,7 +222,7 @@ const raidz_impl_ops_t vdev_raidz_aarch64_neon_impl = { #endif /* defined(__aarch64__) */ -#if 0 // defined(__aarch64__) +#if defined(__aarch64__) const uint8_t __attribute__((aligned(256))) gf_clmul_mod_lt[4*256][16] = { diff --git a/module/zfs/vdev_raidz_math_aarch64_neon_common.h 
b/module/zfs/vdev_raidz_math_aarch64_neon_common.h index 08dbddaea..cb9ff86c1 100644 --- a/module/zfs/vdev_raidz_math_aarch64_neon_common.h +++ b/module/zfs/vdev_raidz_math_aarch64_neon_common.h @@ -125,7 +125,7 @@ #define ASM_BUG() ASSERT(0) -#define OFFSET(ptr, val) (((unsigned char *)ptr)+val) +#define OFFSET(ptr, val) (((unsigned char *)(ptr))+val) extern const uint8_t gf_clmul_mod_lt[4*256][16]; @@ -135,20 +135,6 @@ typedef struct v { uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE))); } v_t; -#define PREFETCHNTA(ptr, offset) \ -{ \ - __asm( \ - "prfm pstl1strm, %[MEM]\n" \ - : : [MEM] "Q" (*(ptr + offset))); \ -} - -#define PREFETCH(ptr, offset) \ -{ \ - __asm( \ - "prfm pldl1keep, %[MEM]\n" \ - : : [MEM] "Q" (*(ptr + offset))); \ -} - #define XOR_ACC(src, r...) \ { \ switch (REG_CNT(r)) { \ @@ -242,6 +228,19 @@ typedef struct v { #define ZERO(r...) \ { \ switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n" \ + "eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n" \ + "eor " VR2(r) ".16b," VR2(r) ".16b," VR2(r) ".16b\n" \ + "eor " VR3(r) ".16b," VR3(r) ".16b," VR3(r) ".16b\n" \ + "eor " VR4(r) ".16b," VR4(r) ".16b," VR4(r) ".16b\n" \ + "eor " VR5(r) ".16b," VR5(r) ".16b," VR5(r) ".16b\n" \ + "eor " VR6(r) ".16b," VR6(r) ".16b," VR6(r) ".16b\n" \ + "eor " VR7(r) ".16b," VR7(r) ".16b," VR7(r) ".16b\n" \ + : WVR0(r), WVR1(r), WVR2(r), WVR3(r), \ + WVR4(r), WVR5(r), WVR6(r), WVR7(r)); \ + break; \ case 4: \ __asm( \ "eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n" \ diff --git a/module/zfs/vdev_raidz_math_aarch64_neonx2.c b/module/zfs/vdev_raidz_math_aarch64_neonx2.c index e05deeb98..f8688a06a 100644 --- a/module/zfs/vdev_raidz_math_aarch64_neonx2.c +++ b/module/zfs/vdev_raidz_math_aarch64_neonx2.c @@ -24,115 +24,183 @@ #include -#if 0 // defined(__aarch64__) +#if defined(__aarch64__) #include "vdev_raidz_math_aarch64_neon_common.h" -#define GEN_P_DEFINE() \ +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 8 
+#define ZERO_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() -#define GEN_P_STRIDE 8 -#define GEN_P_P 0, 1, 2, 3, 4, 5, 6, 7 +#define ZERO_D 0, 1, 2, 3, 4, 5, 6, 7 -#define GEN_PQ_DEFINE() \ +#define COPY_STRIDE 8 +#define COPY_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() +#define COPY_D 0, 1, 2, 3, 4, 5, 6, 7 + +#define ADD_STRIDE 8 +#define ADD_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() +#define ADD_D 0, 1, 2, 3, 4, 5, 6, 7 + +#define MUL_STRIDE 4 +#define MUL_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define MUL_D 0, 1, 2, 3 + +#define GEN_P_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define GEN_P_STRIDE 4 +#define GEN_P_P 0, 1, 2, 3 + +#define GEN_PQ_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() \ - GEN_X_DEFINE_8_9() \ - GEN_X_DEFINE_10_11() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() #define GEN_PQ_STRIDE 4 #define GEN_PQ_D 0, 1, 2, 3 -#define GEN_PQ_P 4, 5, 6, 7 -#define GEN_PQ_Q 8, 9, 10, 11 +#define GEN_PQ_C 4, 5, 6, 7 #define GEN_PQR_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() \ - GEN_X_DEFINE_8_9() \ - GEN_X_DEFINE_22_23() \ - GEN_X_DEFINE_24_27() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() #define GEN_PQR_STRIDE 4 #define GEN_PQR_D 0, 1, 2, 3 -#define GEN_PQR_P 4, 5, 6, 7 -#define GEN_PQR_Q 8, 9, 22, 23 -#define GEN_PQR_R 24, 25, 26, 27 +#define GEN_PQR_C 4, 5, 6, 7 -#define REC_P_DEFINE() \ +#define SYN_Q_DEFINE() \ GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() -#define REC_P_STRIDE 4 -#define REC_P_X 0, 1, 2, 3 +#define SYN_Q_STRIDE 4 +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 -#define REC_Q_DEFINE() \ +#define SYN_R_DEFINE() \ GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ 
GEN_X_DEFINE_33_36() -#define REC_Q_STRIDE 4 -#define REC_Q_X 0, 1, 2, 3 +#define SYN_R_STRIDE 4 +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 -#define REC_R_DEFINE() \ +#define SYN_PQ_DEFINE() \ GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() -#define REC_R_STRIDE 4 -#define REC_R_X 0, 1, 2, 3 +#define SYN_PQ_STRIDE 4 +#define SYN_PQ_D 0, 1, 2, 3 +#define SYN_PQ_X 4, 5, 6, 7 #define REC_PQ_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() \ GEN_X_DEFINE_8_9() \ - GEN_X_DEFINE_16() \ - GEN_X_DEFINE_17() \ GEN_X_DEFINE_22_23() \ GEN_X_DEFINE_33_36() #define REC_PQ_STRIDE 4 #define REC_PQ_X 0, 1, 2, 3 #define REC_PQ_Y 4, 5, 6, 7 -#define REC_PQ_D 8, 9, 22, 23 +#define REC_PQ_T 8, 9, 22, 23 -#define REC_PR_DEFINE() REC_PQ_DEFINE() +#define SYN_PR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_PR_STRIDE 4 +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 + +#define REC_PR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_8_9() \ + GEN_X_DEFINE_22_23() \ + GEN_X_DEFINE_33_36() #define REC_PR_STRIDE 4 #define REC_PR_X 0, 1, 2, 3 #define REC_PR_Y 4, 5, 6, 7 -#define REC_PR_D 8, 9, 22, 23 +#define REC_PR_T 8, 9, 22, 23 -#define REC_QR_DEFINE() REC_PQ_DEFINE() +#define SYN_QR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_QR_STRIDE 4 +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 + +#define REC_QR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_8_9() \ + GEN_X_DEFINE_22_23() \ + GEN_X_DEFINE_33_36() #define REC_QR_STRIDE 4 #define REC_QR_X 0, 1, 2, 3 #define REC_QR_Y 4, 5, 6, 7 -#define REC_QR_D 8, 9, 22, 23 +#define REC_QR_T 8, 9, 
22, 23 -#define REC_PQR_DEFINE() \ +#define SYN_PQR_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() \ - GEN_X_DEFINE_8_9() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ - GEN_X_DEFINE_22_23() \ - GEN_X_DEFINE_24_27() \ - GEN_X_DEFINE_28_30() \ - GEN_X_DEFINE_31() \ GEN_X_DEFINE_33_36() -#define REC_PQR_STRIDE 4 -#define REC_PQR_X 0, 1, 2, 3 -#define REC_PQR_Y 4, 5, 6, 7 -#define REC_PQR_Z 8, 9, 22, 23 -#define REC_PQR_D 24, 25, 26, 27 -#define REC_PQR_XS 24, 25, 26, 27 -#define REC_PQR_YS 28, 29, 30, 31 +#define SYN_PQR_STRIDE 4 +#define SYN_PQR_D 0, 1, 2, 3 +#define SYN_PQR_X 4, 5, 6, 7 +#define REC_PQR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_8_9() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + GEN_X_DEFINE_33_36() +#define REC_PQR_STRIDE 2 +#define REC_PQR_X 0, 1 +#define REC_PQR_Y 2, 3 +#define REC_PQR_Z 4, 5 +#define REC_PQR_XS 6, 7 +#define REC_PQR_YS 8, 9 #include #include "vdev_raidz_math_impl.h" -- 2.40.0