#define JSIMD_FASTLD3 1
#define JSIMD_FASTST3 2
+#define JSIMD_FASTTBL 4
static unsigned int simd_support = ~0;
static unsigned int simd_huffman = 1;
-static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3;
+static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 |
+ JSIMD_FASTTBL;
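+/* Assume every fast path is usable until the /proc/cpuinfo checks below
+   identify a core that needs one of these bits cleared. */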
#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
free(buffer);
return 0;
}
- if (check_cpuinfo(buffer, "CPU part", "0x0a1"))
+ if (check_cpuinfo(buffer, "CPU part", "0xd03") ||
+ check_cpuinfo(buffer, "CPU part", "0xd07"))
+ /* The Cortex-A53 has a slow tbl implementation. We can gain a few
+ percent speedup by disabling the use of that instruction. The
+ speedup on Cortex-A57 is more subtle but still measurable. */
+ simd_features &= ~JSIMD_FASTTBL;
+ else if (check_cpuinfo(buffer, "CPU part", "0x0a1"))
/* The SIMD version of Huffman encoding is slower than the C version on
   Cavium ThunderX. Also, ld3 and st3 are abysmally slow on that
CPU. */
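
check_cpuinfo() itself is outside this hunk. Judging from its
(buffer, field, value) call sites above, it behaves roughly like the
sketch below; the body is an assumption for illustration, not the
library's actual implementation:

#include <string.h>

/* Hypothetical stand-in for check_cpuinfo(): returns nonzero if some
   "field : value" line in the /proc/cpuinfo text starts with field and
   carries value.  The buffer is left unmodified so that it can be
   scanned repeatedly, as the call sites above do. */
static int check_cpuinfo_sketch(const char *buffer, const char *field,
                                const char *value)
{
  const char *line = buffer;

  while (*line) {
    if (!strncmp(line, field, strlen(field))) {
      const char *p = strchr(line, ':');
      const char *end = strchr(line, '\n');

      if (p && (!end || p < end)) {
        p++;
        while (*p == ' ' || *p == '\t')
          p++;                          /* skip the ": " separator */
        if (!strncmp(p, value, strlen(value)))
          return 1;
      }
    }
    line = strchr(line, '\n');          /* advance to the next line */
    if (!line)
      break;
    line++;
  }
  return 0;
}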
int last_dc_val, c_derived_tbl *dctbl,
c_derived_tbl *actbl)
{
- return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
- dctbl, actbl);
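+  /* Run-time dispatch: the CPU checks above may have cleared
+     JSIMD_FASTTBL, in which case the slow-tbl kernel is used. */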
+ if (simd_features & JSIMD_FASTTBL)
+ return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
+ dctbl, actbl);
+ else
+ return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block,
+ last_dc_val, dctbl, actbl);
}
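
The wrapper keeps the exported symbol stable and hides the kernel choice
behind a feature bit. A standalone illustration of that pattern (all
names here are invented for the example, not part of the patch):

#include <stdio.h>

#define FASTLD3 1
#define FASTST3 2
#define FASTTBL 4

static unsigned int features = FASTLD3 | FASTST3 | FASTTBL;

static const char *huff_kernel(void)
{
  /* Clearing a bit reroutes every caller to the fallback kernel
     without changing the external entry point. */
  return (features & FASTTBL) ? "neon" : "neon_slowtbl";
}

int main(void)
{
  printf("%s\n", huff_kernel());  /* prints "neon" */
  features &= ~FASTTBL;           /* e.g. cpuinfo reported a Cortex-A53 */
  printf("%s\n", huff_kernel());  /* prints "neon_slowtbl" */
  return 0;
}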
/*****************************************************************************/
-#define TBL_IS_FAST
-
/*
* GLOBAL(JOCTET*)
* jsimd_huff_encode_one_block (working_state * state, JOCTET *buffer,
47:
.endm
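+/* The macro below is expanded twice at the bottom of the file:
+   fast_tbl=1 emits jsimd_huff_encode_one_block_neon and fast_tbl=0
+   emits jsimd_huff_encode_one_block_neon_slowtbl. */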
+.macro generate_jsimd_huff_encode_one_block fast_tbl
+
.balign 16
+.if \fast_tbl == 1
Ljsimd_huff_encode_one_block_neon_consts:
+.else
+Ljsimd_huff_encode_one_block_neon_slowtbl_consts:
+.endif
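+/* Powers-of-two byte mask, used to pack per-lane comparison results
+   into a coefficient bitmap (NEON's stand-in for a movemask). */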
.byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
-#if defined(TBL_IS_FAST)
+.if \fast_tbl == 1
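+/* tbl/tbx index tables that gather the coefficients into zig-zag
+   order.  255 is out of range for tbl, which zeroes those lanes,
+   while tbx leaves them untouched for a later lookup to fill in. */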
.byte 0, 1, 2, 3, 16, 17, 32, 33, \
18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */
.byte 34, 35, 48, 49, 255, 255, 50, 51, \
255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */
.byte 4, 5, 6, 7, 255, 255, 255, 255, \
255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */
-#endif
+.endif
+.if \fast_tbl == 1
asm_function jsimd_huff_encode_one_block_neon
+.else
+asm_function jsimd_huff_encode_one_block_neon_slowtbl
+.endif
sub sp, sp, 272
sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */
/* Save ARM registers */
stp x19, x20, [sp], 16
+.if \fast_tbl == 1
adr x15, Ljsimd_huff_encode_one_block_neon_consts
+.else
+ adr x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts
+.endif
ldr PUT_BUFFER, [x0, #0x10]
ldr PUT_BITSw, [x0, #0x18]
ldrsh w12, [x2] /* load DC coeff in w12 */
/* prepare data */
-#if defined(TBL_IS_FAST)
+.if \fast_tbl == 1
ld1 {v23.16b}, [x15], #16
ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
tbx v2.16b, {v29.16b, v30.16b}, v17.16b
tbx v5.16b, {v29.16b, v30.16b}, v18.16b
tbx v6.16b, {v31.16b}, v19.16b
-#else
+.else
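+    /* fast_tbl == 0: gather the zig-zag order with per-lane ld1 loads
+       from computed offsets instead of tbl/tbx permutes. */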
add x13, x2, #0x22
sub w12, w12, w3 /* last_dc_val, not used afterwards */
ld1 {v23.16b}, [x15]
ld1 {v5.h}[7], [x15]
ld1 {v6.h}[7], [x19]
ld1 {v7.h}[7], [x20]
-#endif
+.endif
cmlt v24.8h, v0.8h, #0
cmlt v25.8h, v1.8h, #0
cmlt v26.8h, v2.8h, #0
add sp, sp, 256
br x30
+.endm
+
+generate_jsimd_huff_encode_one_block 1
+generate_jsimd_huff_encode_one_block 0
+
.unreq BUFFER
.unreq PUT_BUFFER
.unreq PUT_BITS