From: DRC Date: Tue, 9 Feb 2016 06:38:58 +0000 (-0600) Subject: ARM64: Avoid tbl instruction on Cortex-A53/A57 X-Git-Tag: 1.4.90~20 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=8632f1b2626ccef099f58382d880e9c7325a2d28;p=libjpeg-turbo ARM64: Avoid tbl instruction on Cortex-A53/A57 Full-color compression speedups relative to previous commits: Cortex-A53 (Nexus 5X), Android, 64-bit: 0.91-3.0% (avg. 1.8%) Cortex-A57 (Nexus 5X), Android, 64-bit: -0.35-1.5% (avg. 0.65%) --- diff --git a/simd/jsimd.h b/simd/jsimd.h index a312930..d05a2ec 100644 --- a/simd/jsimd.h +++ b/simd/jsimd.h @@ -865,3 +865,7 @@ EXTERN(JOCTET*) jsimd_huff_encode_one_block_sse2 EXTERN(JOCTET*) jsimd_huff_encode_one_block_neon (void * state, JOCTET *buffer, JCOEFPTR block, int last_dc_val, c_derived_tbl *dctbl, c_derived_tbl *actbl); + +EXTERN(JOCTET*) jsimd_huff_encode_one_block_neon_slowtbl + (void * state, JOCTET *buffer, JCOEFPTR block, int last_dc_val, + c_derived_tbl *dctbl, c_derived_tbl *actbl); diff --git a/simd/jsimd_arm64.c b/simd/jsimd_arm64.c index 8633162..cb48258 100644 --- a/simd/jsimd_arm64.c +++ b/simd/jsimd_arm64.c @@ -28,10 +28,12 @@ #define JSIMD_FASTLD3 1 #define JSIMD_FASTST3 2 +#define JSIMD_FASTTBL 4 static unsigned int simd_support = ~0; static unsigned int simd_huffman = 1; -static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3; +static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 | + JSIMD_FASTTBL; #if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) @@ -83,7 +85,13 @@ parse_proc_cpuinfo (int bufsize) free(buffer); return 0; } - if (check_cpuinfo(buffer, "CPU part", "0x0a1")) + if (check_cpuinfo(buffer, "CPU part", "0xd03") || + check_cpuinfo(buffer, "CPU part", "0xd07")) + /* The Cortex-A53 has a slow tbl implementation. We can gain a few + percent speedup by disabling the use of that instruction. The + speedup on Cortex-A57 is more subtle but still measurable. */ + simd_features &= ~JSIMD_FASTTBL; + else if (check_cpuinfo(buffer, "CPU part", "0x0a1")) /* The SIMD version of Huffman encoding is slower than the C version on Cavium ThunderX. Also, ld3 and st3 are abyssmally slow on that CPU. */ @@ -785,6 +793,10 @@ jsimd_huff_encode_one_block (void * state, JOCTET *buffer, JCOEFPTR block, int last_dc_val, c_derived_tbl *dctbl, c_derived_tbl *actbl) { - return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val, - dctbl, actbl); + if (simd_features & JSIMD_FASTTBL) + return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val, + dctbl, actbl); + else + return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block, + last_dc_val, dctbl, actbl); } diff --git a/simd/jsimd_arm64_neon.S b/simd/jsimd_arm64_neon.S index 0df1c4a..b9bb5de 100644 --- a/simd/jsimd_arm64_neon.S +++ b/simd/jsimd_arm64_neon.S @@ -2986,8 +2986,6 @@ asm_function jsimd_h2v2_downsample_neon /*****************************************************************************/ -#define TBL_IS_FAST - /* * GLOBAL(JOCTET*) * jsimd_huff_encode_one_block (working_state * state, JOCTET *buffer, @@ -3037,11 +3035,17 @@ asm_function jsimd_h2v2_downsample_neon 47: .endm +.macro generate_jsimd_huff_encode_one_block fast_tbl + .balign 16 +.if \fast_tbl == 1 Ljsimd_huff_encode_one_block_neon_consts: +.else +Ljsimd_huff_encode_one_block_neon_slowtbl_consts: +.endif .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 -#if defined(TBL_IS_FAST) +.if \fast_tbl == 1 .byte 0, 1, 2, 3, 16, 17, 32, 33, \ 18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */ .byte 34, 35, 48, 49, 255, 255, 50, 51, \ @@ -3066,19 +3070,27 @@ Ljsimd_huff_encode_one_block_neon_consts: 255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */ .byte 4, 5, 6, 7, 255, 255, 255, 255, \ 255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */ -#endif +.endif +.if \fast_tbl == 1 asm_function jsimd_huff_encode_one_block_neon +.else +asm_function jsimd_huff_encode_one_block_neon_slowtbl +.endif sub sp, sp, 272 sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */ /* Save ARM registers */ stp x19, x20, [sp], 16 +.if \fast_tbl == 1 adr x15, Ljsimd_huff_encode_one_block_neon_consts +.else + adr x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts +.endif ldr PUT_BUFFER, [x0, #0x10] ldr PUT_BITSw, [x0, #0x18] ldrsh w12, [x2] /* load DC coeff in w12 */ /* prepare data */ -#if defined(TBL_IS_FAST) +.if \fast_tbl == 1 ld1 {v23.16b}, [x15], #16 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64 ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64 @@ -3100,7 +3112,7 @@ asm_function jsimd_huff_encode_one_block_neon tbx v2.16b, {v29.16b, v30.16b}, v17.16b tbx v5.16b, {v29.16b, v30.16b}, v18.16b tbx v6.16b, {v31.16b}, v19.16b -#else +.else add x13, x2, #0x22 sub w12, w12, w3 /* last_dc_val, not used afterwards */ ld1 {v23.16b}, [x15] @@ -3230,7 +3242,7 @@ asm_function jsimd_huff_encode_one_block_neon ld1 {v5.h}[7], [x15] ld1 {v6.h}[7], [x19] ld1 {v7.h}[7], [x20] -#endif +.endif cmlt v24.8h, v0.8h, #0 cmlt v25.8h, v1.8h, #0 cmlt v26.8h, v2.8h, #0 @@ -3425,6 +3437,11 @@ asm_function jsimd_huff_encode_one_block_neon add sp, sp, 256 br x30 +.endm + +generate_jsimd_huff_encode_one_block 1 +generate_jsimd_huff_encode_one_block 0 + .unreq BUFFER .unreq PUT_BUFFER .unreq PUT_BITS