From 3728aa01d8c8ab7b6547023e4c141f10dc7755e7 Mon Sep 17 00:00:00 2001 From: DRC Date: Wed, 23 Jul 2014 14:14:14 +0000 Subject: [PATCH] Fix performance and other issues uncovered in testing with actual ARM64 hardware; formatting tweaks; remove NEON platform check (NEON is always available with ARMv8) git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1333 632fc199-4ca6-4c93-a231-07263d6284db --- simd/jsimd_arm64.c | 83 +-------- simd/jsimd_arm_neon_64.S | 380 +++++++++++++++++++-------------------- 2 files changed, 193 insertions(+), 270 deletions(-) diff --git a/simd/jsimd_arm64.c b/simd/jsimd_arm64.c index 988023a..44225aa 100644 --- a/simd/jsimd_arm64.c +++ b/simd/jsimd_arm64.c @@ -27,98 +27,29 @@ static unsigned int simd_support = ~0; -#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) - -#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024) - -LOCAL(int) -check_feature (char *buffer, char *feature) -{ - char *p; - if (*feature == 0) - return 0; - if (strncmp(buffer, "Features", 8) != 0) - return 0; - buffer += 8; - while (isspace(*buffer)) - buffer++; - - /* Check if 'feature' is present in the buffer as a separate word */ - while ((p = strstr(buffer, feature))) { - if (p > buffer && !isspace(*(p - 1))) { - buffer++; - continue; - } - p += strlen(feature); - if (*p != 0 && !isspace(*p)) { - buffer++; - continue; - } - return 1; - } - return 0; -} - -LOCAL(int) -parse_proc_cpuinfo (int bufsize) -{ - char *buffer = (char *)malloc(bufsize); - FILE *fd; - simd_support = 0; - - if (!buffer) - return 0; - - fd = fopen("/proc/cpuinfo", "r"); - if (fd) { - while (fgets(buffer, bufsize, fd)) { - if (!strchr(buffer, '\n') && !feof(fd)) { - /* "impossible" happened - insufficient size of the buffer! */ - fclose(fd); - free(buffer); - return 0; - } - if (check_feature(buffer, "neon")) - simd_support |= JSIMD_ARM_NEON; - } - fclose(fd); - } - free(buffer); - return 1; -} - -#endif - /* * Check what SIMD accelerations are supported. * * FIXME: This code is racy under a multi-threaded environment. */ + +/* + * ARMv8 architectures support NEON extensions by default. + * It is no longer optional as it was with ARMv7. + */ + + LOCAL(void) init_simd (void) { char *env = NULL; -#if !defined(__ARM_NEON__) && defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) - int bufsize = 1024; /* an initial guess for the line buffer size limit */ -#endif if (simd_support != ~0U) return; simd_support = 0; -#if defined(__ARM_NEON__) simd_support |= JSIMD_ARM_NEON; -#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) - /* We still have a chance to use NEON regardless of globally used - * -mcpu/-mfpu options passed to gcc by performing runtime detection via - * /proc/cpuinfo parsing on linux/android */ - while (!parse_proc_cpuinfo(bufsize)) { - bufsize *= 2; - if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT) - break; - } -#endif /* Force different settings through environment variables */ env = getenv("JSIMD_FORCENEON"); diff --git a/simd/jsimd_arm_neon_64.S b/simd/jsimd_arm_neon_64.S index 8806abc..2c3989c 100644 --- a/simd/jsimd_arm_neon_64.S +++ b/simd/jsimd_arm_neon_64.S @@ -34,7 +34,6 @@ #define RESPECT_STRICT_ALIGNMENT 1 -#define RTSM_SQSHRN_SIM_ISSUE /*****************************************************************************/ @@ -257,8 +256,18 @@ asm_function jsimd_idct_islow_neon ROW6R .req v29 ROW7L .req v30 ROW7R .req v31 - + /* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */ + sub sp, sp, 272 + str x15, [sp], 16 adr x15, jsimd_idct_islow_neon_consts + st1 {v0.8b - v3.8b}, [sp], 32 + st1 {v4.8b - v7.8b}, [sp], 32 + st1 {v8.8b - v11.8b}, [sp], 32 + st1 {v12.8b - v15.8b}, [sp], 32 + st1 {v16.8b - v19.8b}, [sp], 32 + st1 {v20.8b - v23.8b}, [sp], 32 + st1 {v24.8b - v27.8b}, [sp], 32 + st1 {v28.8b - v31.8b}, [sp], 32 ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32 ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32 @@ -277,7 +286,7 @@ asm_function jsimd_idct_islow_neon mul v22.4h, v22.4h, v6.4h mul v23.4h, v23.4h, v7.4h ins v22.2d[1], v23.2d[0] /* 128 bit q11 */ - ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK], 32 + ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK] mul v24.4h, v24.4h, v0.4h mul v25.4h, v25.4h, v1.4h ins v24.2d[1], v25.2d[0] /* 128 bit q12 */ @@ -293,80 +302,79 @@ asm_function jsimd_idct_islow_neon mul v30.4h, v30.4h, v6.4h mul v31.4h, v31.4h, v7.4h ins v30.2d[1], v31.2d[0] /* 128 bit q15 */ - sub sp, sp, #32 - st1 {v8.4h-v11.4h}, [sp] /* save NEON registers */ - sub sp, sp, #32 - st1 {v12.4h-v15.4h}, [sp] + /* Go to the bottom of the stack */ + sub sp, sp, 352 + stp x4, x5, [sp], 16 + st1 {v8.4h - v11.4h}, [sp], 32 /* save NEON registers */ + st1 {v12.4h - v15.4h}, [sp], 32 /* 1-D IDCT, pass 1, left 4x8 half */ add v4.4h, ROW7L.4h, ROW3L.4h add v5.4h, ROW5L.4h, ROW1L.4h smull v12.4s, v4.4h, XFIX_1_175875602_MINUS_1_961570560 smlal v12.4s, v5.4h, XFIX_1_175875602 smull v14.4s, v4.4h, XFIX_1_175875602 - /* Check for the zero coefficients in the right 4x8 half */ - /* push {x4, x5} */ - stp x4, x5, [sp, -16]! - mov x5, #0 + /* Check for the zero coefficients in the right 4x8 half */ smlal v14.4s, v5.4h, XFIX_1_175875602_MINUS_0_390180644 ssubl v6.4s, ROW0L.4h, ROW4L.4h - ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))] + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))] smull v4.4s, ROW2L.4h, XFIX_0_541196100 smlal v4.4s, ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065 - orr x0, x4, x5 + orr x0, x4, x5 mov v8.16b, v12.16b smlsl v12.4s, ROW5L.4h, XFIX_2_562915447 - ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))] + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))] smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447 shl v6.4s, v6.4s, #13 - orr x0, x0, x4 + orr x0, x0, x4 smlsl v8.4s, ROW1L.4h, XFIX_0_899976223 - orr x0, x0 , x5 + orr x0, x0 , x5 add v2.4s, v6.4s, v4.4s - ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))] + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))] mov v10.16b, v14.16b add v2.4s, v2.4s, v12.4s - orr x0, x0, x4 + orr x0, x0, x4 smlsl v14.4s, ROW7L.4h, XFIX_0_899976223 - orr x0, x0, x5 + orr x0, x0, x5 smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223 rshrn ROW1L.4h, v2.4s, #11 - ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))] + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))] sub v2.4s, v2.4s, v12.4s smlal v10.4s, ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447 - orr x0, x0, x4 + orr x0, x0, x4 smlsl v10.4s, ROW3L.4h, XFIX_2_562915447 - orr x0, x0, x5 + orr x0, x0, x5 sub v2.4s, v2.4s, v12.4s smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865 - ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))] + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))] smlal v12.4s, ROW6L.4h, XFIX_0_541196100 sub v6.4s, v6.4s, v4.4s - orr x0, x0, x4 + orr x0, x0, x4 rshrn ROW6L.4h, v2.4s, #11 - orr x0, x0, x5 + orr x0, x0, x5 add v2.4s, v6.4s, v10.4s - ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))] + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))] sub v6.4s, v6.4s, v10.4s saddl v10.4s, ROW0L.4h, ROW4L.4h - orr x0, x0, x4 + orr x0, x0, x4 rshrn ROW2L.4h, v2.4s, #11 - orr x0, x0, x5 + orr x0, x0, x5 rshrn ROW5L.4h, v6.4s, #11 - ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))] + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))] shl v10.4s, v10.4s, #13 smlal v8.4s, ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223 - orr x0, x0, x4 + orr x0, x0, x4 add v4.4s, v10.4s, v12.4s - orr x0, x0, x5 + orr x0, x0, x5 sub v2.4s, v10.4s, v12.4s add v12.4s, v4.4s, v14.4s - ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))] + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))] sub v4.4s, v4.4s, v14.4s add v10.4s, v2.4s, v8.4s - orr x0, x4, x5 + orr x0, x4, x5 sub v6.4s, v2.4s, v8.4s /* pop {x4, x5} */ - ldp x4, x5, [sp], 16 + sub sp, sp, 80 + ldp x4, x5, [sp], 16 rshrn ROW7L.4h, v4.4s, #11 rshrn ROW3L.4h, v10.4s, #11 rshrn ROW0L.4h, v12.4s, #11 @@ -552,48 +560,27 @@ asm_function jsimd_idct_islow_neon ins v18.2d[1], v19.2d[0] ins v20.2d[1], v21.2d[0] ins v22.2d[1], v23.2d[0] -#ifdef RTSM_SQSHRN_SIM_ISSUE sqrshrn v16.8b, v16.8h, #2 sqrshrn2 v16.16b, v18.8h, #2 sqrshrn v18.8b, v20.8h, #2 sqrshrn2 v18.16b, v22.8h, #2 -#else - sqrshrn v16.4h, v16.4s, #2 - sqrshrn2 v16.8h, v18.4s, #2 - sqrshrn v18.4h, v20.4s, #2 - sqrshrn2 v18.8h, v22.4s, #2 -#endif - /* vpop {v8.4h-d15.4h} */ /* restore NEON registers */ - ld1 {v12.4h-v15.4h}, [sp], 32 - ld1 {v8.4h-v11.4h}, [sp], 32 + /* vpop {v8.4h - d15.4h} */ /* restore NEON registers */ + ld1 {v8.4h - v11.4h}, [sp], 32 + ld1 {v12.4h - v15.4h}, [sp], 32 ins v24.2d[1], v25.2d[0] -#ifdef RTSM_SQSHRN_SIM_ISSUE sqrshrn v20.8b, v24.8h, #2 -#else - - sqrshrn v20.4h, v24.4s, #2 -#endif /* Transpose the final 8-bit samples and do signed->unsigned conversion */ /* trn1 v16.8h, v16.8h, v18.8h */ transpose v16, v18, v3, .16b, .8h ins v26.2d[1], v27.2d[0] ins v28.2d[1], v29.2d[0] ins v30.2d[1], v31.2d[0] -#ifdef RTSM_SQSHRN_SIM_ISSUE sqrshrn2 v20.16b, v26.8h, #2 sqrshrn v22.8b, v28.8h, #2 -#else - sqrshrn2 v20.8h, v26.4s, #2 - sqrshrn v22.4h, v28.4s, #2 -#endif movi v0.16b, #(CENTERJSAMPLE) -#ifdef RTSM_SQSHRN_SIM_ISSUE sqrshrn2 v22.16b, v30.8h, #2 -#else - sqrshrn2 v22.8h, v30.4s, #2 -#endif transpose_single v16, v17, v3, .2d, .8b transpose_single v18, v19, v3, .2d, .8b add v16.8b, v16.8b, v0.8b @@ -628,6 +615,15 @@ asm_function jsimd_idct_islow_neon st1 {v21.8b}, [TMP2] st1 {v22.8b}, [TMP3] st1 {v23.8b}, [TMP4] + ldr x15, [sp], 16 + ld1 {v0.8b - v3.8b}, [sp], 32 + ld1 {v4.8b - v7.8b}, [sp], 32 + ld1 {v8.8b - v11.8b}, [sp], 32 + ld1 {v12.8b - v15.8b}, [sp], 32 + ld1 {v16.8b - v19.8b}, [sp], 32 + ld1 {v20.8b - v23.8b}, [sp], 32 + ld1 {v24.8b - v27.8b}, [sp], 32 + ld1 {v28.8b - v31.8b}, [sp], 32 blr x30 3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */ @@ -799,7 +795,8 @@ asm_function jsimd_idct_ifast_neon TMP1 .req x0 TMP2 .req x1 TMP3 .req x2 - TMP4 .req x15 + TMP4 .req x22 + TMP5 .req x23 /* Load and dequantize coefficients into NEON registers * with the following allocation: @@ -814,7 +811,15 @@ asm_function jsimd_idct_ifast_neon * 6 | d28 | d29 ( v14.8h ) * 7 | d30 | d31 ( v15.8h ) */ - adr x15, jsimd_idct_ifast_neon_consts + /* Save NEON registers used in fast IDCT */ + sub sp, sp, #176 + stp x22, x23, [sp], 16 + adr x23, jsimd_idct_ifast_neon_consts + st1 {v0.8b - v3.8b}, [sp], 32 + st1 {v4.8b - v7.8b}, [sp], 32 + st1 {v8.8b - v11.8b}, [sp], 32 + st1 {v12.8b - v15.8b}, [sp], 32 + st1 {v16.8b - v19.8b}, [sp], 32 ld1 {v8.8h, v9.8h}, [COEF_BLOCK], 32 ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32 ld1 {v10.8h, v11.8h}, [COEF_BLOCK], 32 @@ -830,14 +835,9 @@ asm_function jsimd_idct_ifast_neon ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32 mul v14.8h, v14.8h, v2.8h mul v13.8h, v13.8h, v1.8h - ld1 {v0.4h}, [x15] /* load constants */ + ld1 {v0.4h}, [x23] /* load constants */ mul v15.8h, v15.8h, v3.8h - /* vpush {v4.8h-v6.8h} */ /* save NEON registers */ - sub sp, sp, #32 - st1 {v4.8h-v5.8h}, [sp] /* save NEON registers */ - sub sp, sp, #16 - st1 {v6.8h}, [sp] /* 1-D IDCT, pass 1 */ sub v2.8h, v10.8h, v14.8h add v14.8h, v10.8h, v14.8h @@ -912,25 +912,25 @@ asm_function jsimd_idct_ifast_neon trn1 v13.4s, v13.4s, v15.4s trn2 v15.4s, v18.4s, v15.4s /* vswp v14.4h, v10-MSB.4h */ - umov x10, v14.d[0] + umov x22, v14.d[0] ins v14.2d[0], v10.2d[1] - ins v10.2d[1], x10 + ins v10.2d[1], x22 /* vswp v13.4h, v9MSB.4h */ - umov x10, v13.d[0] + umov x22, v13.d[0] ins v13.2d[0], v9.2d[1] - ins v9.2d[1], x10 + ins v9.2d[1], x22 /* 1-D IDCT, pass 2 */ sub v2.8h, v10.8h, v14.8h /* vswp v15.4h, v11MSB.4h */ - umov x10, v15.d[0] + umov x22, v15.d[0] ins v15.2d[0], v11.2d[1] - ins v11.2d[1], x10 + ins v11.2d[1], x22 add v14.8h, v10.8h, v14.8h /* vswp v12.4h, v8-MSB.4h */ - umov x10, v12.d[0] + umov x22, v12.d[0] ins v12.2d[0], v8.2d[1] - ins v8.2d[1], x10 + ins v8.2d[1], x22 sub v1.8h, v11.8h, v13.8h add v13.8h, v11.8h, v13.8h sub v5.8h, v9.8h, v15.8h @@ -966,15 +966,11 @@ asm_function jsimd_idct_ifast_neon add v14.8h, v5.8h, v3.8h sub v9.8h, v5.8h, v3.8h sub v13.8h, v10.8h, v2.8h - /* vpop {v4.8h-v7.4h} */ /* restore NEON registers...not available */ - ld1 {v6.8h}, [sp], 16 - ld1 {v4.8h-v5.8h}, [sp], 32 add v10.8h, v10.8h, v2.8h sub v11.8h, v12.8h, v1.8h add v12.8h, v12.8h, v1.8h /* Descale to 8-bit and range limit */ movi v0.16b, #0x80 -#ifdef RTSM_SQSHRN_SIM_ISSUE sqshrn v8.8b, v8.8h, #5 sqshrn2 v8.16b, v9.8h, #5 sqshrn v9.8b, v10.8h, #5 @@ -983,16 +979,6 @@ asm_function jsimd_idct_ifast_neon sqshrn2 v10.16b, v13.8h, #5 sqshrn v11.8b, v14.8h, #5 sqshrn2 v11.16b, v15.8h, #5 -#else - sqshrn v8.4h, v8.4s, #5 - sqshrn2 v8.8h, v9.4s, #5 - sqshrn v9.4h, v10.4s, #5 - sqshrn2 v9.8h, v11.4s, #5 - sqshrn v10.4h, v12.4s, #5 - sqshrn2 v10.8h, v13.4s, #5 - sqshrn v11.4h, v14.4s, #5 - sqshrn2 v11.8h, v15.4s, #5 -#endif add v8.16b, v8.16b, v0.16b add v9.16b, v9.16b, v0.16b add v10.16b, v10.16b, v0.16b @@ -1036,26 +1022,33 @@ asm_function jsimd_idct_ifast_neon add TMP2, TMP2, OUTPUT_COL st1 {v9.8b}, [TMP1] /* make copy */ - ins v21.2d[0], v10.2d[1] + ins v7.2d[0], v10.2d[1] mov v18.16b, v10.16b - trn1 v10.8b, v10.8b, v21.8b - trn2 v21.8b, v18.8b, v21.8b + trn1 v10.8b, v10.8b, v7.8b + trn2 v7.8b, v18.8b, v7.8b st1 {v19.8b}, [TMP2] ldp TMP1, TMP2, [OUTPUT_BUF], 16 - ldp TMP3, TMP4, [OUTPUT_BUF] + ldp TMP4, TMP5, [OUTPUT_BUF], 16 add TMP1, TMP1, OUTPUT_COL add TMP2, TMP2, OUTPUT_COL - add TMP3, TMP3, OUTPUT_COL add TMP4, TMP4, OUTPUT_COL + add TMP5, TMP5, OUTPUT_COL st1 {v10.8b}, [TMP1] /* make copy */ - ins v23.2d[0], v11.2d[1] + ins v16.2d[0], v11.2d[1] mov v18.16b, v11.16b - trn1 v11.8b, v11.8b, v23.8b - trn2 v23.8b, v18.8b, v23.8b - st1 {v21.8b}, [TMP2] - st1 {v11.8b}, [TMP3] - st1 {v23.8b}, [TMP4] + trn1 v11.8b, v11.8b, v16.8b + trn2 v16.8b, v18.8b, v16.8b + st1 {v7.8b}, [TMP2] + st1 {v11.8b}, [TMP4] + st1 {v16.8b}, [TMP5] + sub sp, sp, #176 + ldp x22, x23, [sp], 16 + ld1 {v0.8b - v3.8b}, [sp], 32 + ld1 {v4.8b - v7.8b}, [sp], 32 + ld1 {v8.8b - v11.8b}, [sp], 32 + ld1 {v12.8b - v15.8b}, [sp], 32 + ld1 {v16.8b - v19.8b}, [sp], 32 blr x30 .unreq DCT_TABLE @@ -1179,14 +1172,19 @@ asm_function jsimd_idct_4x4_neon TMP3 .req x2 TMP4 .req x15 - /* vpush {v8.4h-v15.4h} */ - sub sp, sp, #32 - st1 {v8.4h-v11.4h}, [sp] /* save NEON registers */ - sub sp, sp, #32 - st1 {v12.4h-v15.4h}, [sp] - + /* Save all used NEON registers */ + sub sp, sp, 272 + str x15, [sp], 16 /* Load constants (v3.4h is just used for padding) */ adr TMP4, jsimd_idct_4x4_neon_consts + st1 {v0.8b - v3.8b}, [sp], 32 + st1 {v4.8b - v7.8b}, [sp], 32 + st1 {v8.8b - v11.8b}, [sp], 32 + st1 {v12.8b - v15.8b}, [sp], 32 + st1 {v16.8b - v19.8b}, [sp], 32 + st1 {v20.8b - v23.8b}, [sp], 32 + st1 {v24.8b - v27.8b}, [sp], 32 + st1 {v28.8b - v31.8b}, [sp], 32 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4] /* Load all COEF_BLOCK into NEON registers with the following allocation: @@ -1290,10 +1288,17 @@ asm_function jsimd_idct_4x4_neon st1 {v27.b}[7], [TMP4], 1 #endif - /* vpop {v8.4h-v15.4h} ;not available */ - ld1 {v12.4h-v15.4h}, [sp], 32 - ld1 {v8.4h-v11.4h}, [sp], 32 - + /* vpop {v8.4h - v15.4h} ;not available */ + sub sp, sp, #272 + ldr x15, [sp], 16 + ld1 {v0.8b - v3.8b}, [sp], 32 + ld1 {v4.8b - v7.8b}, [sp], 32 + ld1 {v8.8b - v11.8b}, [sp], 32 + ld1 {v12.8b - v15.8b}, [sp], 32 + ld1 {v16.8b - v19.8b}, [sp], 32 + ld1 {v20.8b - v23.8b}, [sp], 32 + ld1 {v24.8b - v27.8b}, [sp], 32 + ld1 {v28.8b - v31.8b}, [sp], 32 blr x30 .unreq DCT_TABLE @@ -1333,23 +1338,23 @@ jsimd_idct_2x2_neon_consts: .short FIX_3_624509785 /* d0[3] */ .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 - sshll v28.4s, \x4, #15 + sshll v15.4s, \x4, #15 smull v26.4s, \x6, v0.4h[3] smlal v26.4s, \x10, v0.4h[2] smlal v26.4s, \x12, v0.4h[1] smlal v26.4s, \x16, v0.4h[0] - add v20.4s, v28.4s, v26.4s - sub v28.4s, v28.4s, v26.4s + add v20.4s, v15.4s, v26.4s + sub v15.4s, v15.4s, v26.4s .if \shift > 16 srshr v20.4s, v20.4s, #\shift - srshr v28.4s, v28.4s, #\shift + srshr v15.4s, v15.4s, #\shift xtn \y26, v20.4s - xtn \y27, v28.4s + xtn \y27, v15.4s .else rshrn \y26, v20.4s, #\shift - rshrn \y27, v28.4s, #\shift + rshrn \y27, v15.4s, #\shift .endif .endm @@ -1363,15 +1368,20 @@ asm_function jsimd_idct_2x2_neon TMP1 .req x0 TMP2 .req x15 - /* vpush {v8.4h-v15.4h} ; not available */ - sub sp, sp, #32 - st1 {v8.4h-v11.4h}, [sp] /* save NEON registers */ - sub sp, sp, #32 - st1 {v12.4h-v15.4h}, [sp] + /* vpush {v8.4h - v15.4h} ; not available */ + sub sp, sp, 208 + str x15, [sp], 16 /* Load constants */ adr TMP2, jsimd_idct_2x2_neon_consts - ld1 {v0.4h}, [TMP2] + st1 {v4.8b - v7.8b}, [sp], 32 + st1 {v8.8b - v11.8b}, [sp], 32 + st1 {v12.8b - v15.8b}, [sp], 32 + st1 {v16.8b - v19.8b}, [sp], 32 + st1 {v21.8b - v22.8b}, [sp], 16 + st1 {v24.8b - v27.8b}, [sp], 32 + st1 {v30.8b - v31.8b}, [sp], 16 + ld1 {v14.4h}, [TMP2] /* Load all COEF_BLOCK into NEON registers with the following allocation: * 0 1 2 3 | 4 5 6 7 @@ -1423,24 +1433,24 @@ asm_function jsimd_idct_2x2_neon idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h #else - smull v26.4s, v6.4h, v0.4h[3] - smlal v26.4s, v10.4h, v0.4h[2] - smlal v26.4s, v12.4h, v0.4h[1] - smlal v26.4s, v16.4h, v0.4h[0] - smull v24.4s, v7.4h, v0.4h[3] - smlal v24.4s, v11.4h, v0.4h[2] - smlal v24.4s, v13.4h, v0.4h[1] - smlal v24.4s, v17.4h, v0.4h[0] - sshll v28.4s, v4.4h, #15 + smull v26.4s, v6.4h, v14.4h[3] + smlal v26.4s, v10.4h, v14.4h[2] + smlal v26.4s, v12.4h, v14.4h[1] + smlal v26.4s, v16.4h, v14.4h[0] + smull v24.4s, v7.4h, v14.4h[3] + smlal v24.4s, v11.4h, v14.4h[2] + smlal v24.4s, v13.4h, v14.4h[1] + smlal v24.4s, v17.4h, v14.4h[0] + sshll v15.4s, v4.4h, #15 sshll v30.4s, v5.4h, #15 - add v20.4s, v28.4s, v26.4s - sub v28.4s, v28.4s, v26.4s + add v20.4s, v15.4s, v26.4s + sub v15.4s, v15.4s, v26.4s rshrn v4.4h, v20.4s, #13 - rshrn v6.4h, v28.4s, #13 + rshrn v6.4h, v15.4s, #13 add v20.4s, v30.4s, v24.4s - sub v28.4s, v30.4s, v24.4s + sub v15.4s, v30.4s, v24.4s rshrn v5.4h, v20.4s, #13 - rshrn v7.4h, v28.4s, #13 + rshrn v7.4h, v15.4s, #13 transpose v4, v6, v3, .16b, .8h transpose v6, v10, v3, .16b, .4s #endif @@ -1466,11 +1476,15 @@ asm_function jsimd_idct_2x2_neon st1 {v26.b}[1], [TMP2], 1 st1 {v27.b}[5], [TMP2], 1 - /* vpop {v8.4h-v15.4h} ;not available */ - - ld1 {v12.4h-v15.4h}, [sp], 32 - ld1 {v8.4h-v11.4h}, [sp], 32 - + sub sp, sp, #208 + ldr x15, [sp], 16 + ld1 {v4.8b - v7.8b}, [sp], 32 + ld1 {v8.8b - v11.8b}, [sp], 32 + ld1 {v12.8b - v15.8b}, [sp], 32 + ld1 {v16.8b - v19.8b}, [sp], 32 + ld1 {v21.8b - v22.8b}, [sp], 16 + ld1 {v24.8b - v27.8b}, [sp], 32 + ld1 {v30.8b - v31.8b}, [sp], 16 blr x30 .unreq DCT_TABLE @@ -1572,13 +1586,11 @@ asm_function jsimd_idct_2x2_neon .error unsupported bpp .endif .endm -#ifdef RTSM_SQSHRN_SIM_ISSUE + .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize -#else -.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize -#endif + /* - * 2 stage pipelined YCbCr->RGB conversion + * 2-stage pipelined YCbCr->RGB conversion */ .macro do_yuv_to_rgb_stage1 @@ -1604,16 +1616,10 @@ asm_function jsimd_idct_2x2_neon uaddw v20.8h, v20.8h, v0.8b uaddw v24.8h, v24.8h, v0.8b uaddw v28.8h, v28.8h, v0.8b -#ifdef RTSM_SQSHRN_SIM_ISSUE sqxtun v1\g_offs\defsize, v20.8h sqxtun v1\r_offs\defsize, v24.8h sqxtun v1\b_offs\defsize, v28.8h -#else - sqxtun v1\g_offs\gsize, v20.4s - sqxtun v1\r_offs\rsize, v24.4s - sqxtun v1\b_offs\bsize, v28.4s -#endif .endm .macro do_yuv_to_rgb_stage2_store_load_stage1 @@ -1628,25 +1634,13 @@ asm_function jsimd_idct_2x2_neon uaddw v20.8h, v20.8h, v0.8b uaddw v24.8h, v24.8h, v0.8b uaddw v28.8h, v28.8h, v0.8b -#ifdef RTSM_SQSHRN_SIM_ISSUE sqxtun v1\g_offs\defsize, v20.8h -#else - sqxtun v1\g_offs\gsize, v20.4s -#endif ld1 {v0.8b}, [Y], 8 -#ifdef RTSM_SQSHRN_SIM_ISSUE sqxtun v1\r_offs\defsize, v24.8h -#else - sqxtun v1\r_offs\rsize, v24.4s -#endif prfm PLDL1KEEP, [U, #64] prfm PLDL1KEEP, [V, #64] prfm PLDL1KEEP, [Y, #64] -#ifdef RTSM_SQSHRN_SIM_ISSUE sqxtun v1\b_offs\defsize, v28.8h -#else - sqxtun v1\b_offs\gsize, v28.4s -#endif uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ do_store \bpp, 8 @@ -1693,29 +1687,33 @@ asm_function jsimd_ycc_\colorid\()_convert_neon V .req x10 N .req x15 + sub sp, sp, 336 + str x15, [sp], 16 /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */ adr x15, jsimd_ycc_\colorid\()_neon_consts + /* Save NEON registers */ + st1 {v0.8b - v3.8b}, [sp], 32 + st1 {v4.8b - v7.8b}, [sp], 32 + st1 {v8.8b - v11.8b}, [sp], 32 + st1 {v12.8b - v15.8b}, [sp], 32 + st1 {v16.8b - v19.8b}, [sp], 32 + st1 {v20.8b - v23.8b}, [sp], 32 + st1 {v24.8b - v27.8b}, [sp], 32 + st1 {v28.8b - v31.8b}, [sp], 32 ld1 {v0.4h, v1.4h}, [x15], 16 ld1 {v2.8h}, [x15] /* Save ARM registers and handle input arguments */ /* push {x4, x5, x6, x7, x8, x9, x10, x30} */ - stp x4, x5, [sp,-16]! - stp x6, x7, [sp,-16]! - stp x8, x9, [sp,-16]! - stp x10, x30, [sp,-16]! + stp x4, x5, [sp], 16 + stp x6, x7, [sp], 16 + stp x8, x9, [sp], 16 + stp x10, x30, [sp], 16 ldr INPUT_BUF0, [INPUT_BUF] ldr INPUT_BUF1, [INPUT_BUF, 8] ldr INPUT_BUF2, [INPUT_BUF, 16] .unreq INPUT_BUF - /* Save NEON registers */ - /* vpush {v8.4h-v15.4h} */ - sub sp, sp, #32 - st1 {v8.4h-v11.4h}, [sp] - sub sp, sp, #32 - st1 {v12.4h-v15.4h}, [sp] - /* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */ movi v10.16b, #255 movi v12.16b, #255 @@ -1778,14 +1776,21 @@ asm_function jsimd_ycc_\colorid\()_convert_neon bgt 0b 9: /* Restore all registers and return */ - /* vpop {v8.4h-v15.4h} */ - ld1 {v12.4h-v15.4h}, [sp], #32 - ld1 {v8.4h-v11.4h}, [sp], #32 + sub sp, sp, #336 + ldr x15, [sp], 16 + ld1 {v0.8b - v3.8b}, [sp], 32 + ld1 {v4.8b - v7.8b}, [sp], 32 + ld1 {v8.8b - v11.8b}, [sp], 32 + ld1 {v12.8b - v15.8b}, [sp], 32 + ld1 {v16.8b - v19.8b}, [sp], 32 + ld1 {v20.8b - v23.8b}, [sp], 32 + ld1 {v24.8b - v27.8b}, [sp], 32 + ld1 {v28.8b - v31.8b}, [sp], 32 /* pop {r4, r5, r6, r7, r8, r9, r10, pc} */ - ldp x10, x30, [sp], #16 - ldp x8, x9, [sp], #16 - ldp x6, x5, [sp], #16 - ldp x4, x5, [sp], #16 + ldp x4, x5, [sp], 16 + ldp x6, x7, [sp], 16 + ldp x8, x9, [sp], 16 + ldp x10, x30, [sp], 16 br x30 .unreq OUTPUT_WIDTH .unreq INPUT_ROW @@ -1807,10 +1812,6 @@ asm_function jsimd_ycc_\colorid\()_convert_neon .purgem do_yuv_to_rgb_stage2_store_load_stage1 .endm -/* RTSM simulator fix integer saturation works on 8b boundry add a new parameter - * as a workaround for the simulator fix - */ -#ifdef RTSM_SQSHRN_SIM_ISSUE /*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize */ generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b @@ -1818,15 +1819,6 @@ generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, . generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b -#else -/*--------------------------------- id ----- bpp R rsize G gsize B bsize */ -generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h -generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h -generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h -generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h -generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h -generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h -#endif .purgem do_load .purgem do_store -- 2.40.0