From 4e8ac132cc2feff5786d12c90fd62cf97979bae1 Mon Sep 17 00:00:00 2001
From: Janne Grunau
Date: Mon, 20 Oct 2014 13:12:14 +0200
Subject: [PATCH] aarch64: x264_coeff_level_run{4,8,15,16}

All functions ~33% faster.
---
 common/aarch64/quant-a.S | 77 ++++++++++++++++++++++++++++++++++++++++
 common/aarch64/quant.h   |  4 +++
 common/quant.c           |  4 +++
 3 files changed, 85 insertions(+)

diff --git a/common/aarch64/quant-a.S b/common/aarch64/quant-a.S
index ed9b3ca8..d3b2933b 100644
--- a/common/aarch64/quant-a.S
+++ b/common/aarch64/quant-a.S
@@ -497,3 +497,80 @@ function x264_coeff_last64_neon, export=1
     sub         w0, w3, w2
     ret
 endfunc
+
+.macro coeff_level_run_start size
+    add         x6, x1, #23                 // x6 = output pointer for the copied coefficients: (x1 + 8) rounded up to 16 with the and below. NOTE(review): previously labelled runlevel->mask, but the mask word is stored at x1 + 4 - confirm field name
+    mov         w7, #0                      // w7 = number of coefficients copied so far (becomes the return value)
+    mov         w8, #0                      // w8 = bit mask with one bit set per copied coefficient index
+    mov         w9, #1                      // w9 = constant 1, shifted to form the mask bits
+    and         x6, x6, #~15                // clear the low 4 bits: x6 is now 16-byte aligned
+    mov         w4, #\size - 1              // w4 = highest coefficient index
+.endm
+
+.macro coeff_level_run shift
+    clz         x3, x2                      // leading zero bits of x2, which holds one field of 2^shift bits per coefficient, highest index in the top bits
+    subs        w4, w4, w3, lsr #\shift     // w4 = index of the highest nonzero coefficient, also sets the flags tested by b.le below
+    str         w4, [x1], #4                // store that index as the first 32-bit word at x1, then advance x1 by 4
+1:
+    ldrh        w5, [x0, x4, lsl #1]        // w5 = 16-bit coefficient at index w4
+    strh        w5, [x6], #2                // append it to the output at x6
+    add         w7, w7, #1                  // one more coefficient copied
+    lsl         w10, w9, w4                 // w10 = 1 << index
+    orr         w8, w8, w10                 // record the index in the mask
+    b.le        2f                          // flags are still those of the most recent subs: stop once the index is <= 0
+    add         w3, w3, #1 << \shift        // skipped zero bits plus one whole field ...
+    sub         w4, w4, #1                  // step the index past the coefficient just copied
+    and         x3, x3, #~((1 << \shift) - 1) // ... rounded down to a field boundary: bits up to and including the current field
+    lsl         x2, x2, x3                  // shift those fields out of x2
+    clz         x3, x2                      // zero bits in front of the next nonzero field
+    subs        w4, w4, w3, lsr #\shift     // w4 = index of the next nonzero coefficient, negative if none is left
+    b.ge        1b                          // keep going while the index is >= 0
+2:
+    str         w8, [x1]                    // store the mask as the second 32-bit word (x1 was post-incremented above)
+    mov         w0, w7                      // return the number of coefficients copied
+.endm
+
+function x264_coeff_level_run4_aarch64, export=1
+    ldr         x2, [x0]                    // load four 16-bit coefficients as one 64-bit word: 16 bits per field, hence shift 4
+
+    coeff_level_run_start 4
+
+    coeff_level_run 4
+
+    ret
+endfunc
+
+.macro X264_COEFF_LEVEL_RUN size
+function x264_coeff_level_run\size\()_neon, export=1
+.if \size == 15
+    sub         x0, x0, #2                  // back up one element so the 16-lane load below places the 15 coefficients in lanes 1..15
+.endif
+.if \size < 15
+    .equ        shiftw, 3                   // 8 mask bits per coefficient
+    ld1         {v0.8h}, [x0]               // load 8 coefficients
+    uqxtn       v0.8b, v0.8h                // narrow to bytes with unsigned saturation, so nonzero lanes stay nonzero
+    cmtst       v0.8b, v0.8b, v0.8b         // each byte becomes 0xff if nonzero, else 0
+.else
+    .equ        shiftw, 2                   // 4 mask bits per coefficient
+    ld1         {v0.8h,v1.8h}, [x0]         // load 16 elements
+    uqxtn       v0.8b, v0.8h                // narrow the first 8 lanes to bytes (unsigned saturation)
+    uqxtn2      v0.16b, v1.8h               // narrow the other 8 lanes into the upper half of v0
+    cmtst       v0.16b, v0.16b, v0.16b      // each byte becomes 0xff if nonzero, else 0
+    shrn        v0.8b, v0.8h, #4            // squeeze every byte pair into 8 bits: one nibble per coefficient, 64 bits total
+.endif
+    fmov        x2, d0                      // move the 64-bit nonzero mask into x2 for the scalar loop
+.if \size == 15
+    add         x0, x0, #2                  // undo the adjustment above so indices address the original array
+.endif
+
+    coeff_level_run_start \size
+
+    coeff_level_run shiftw
+
+    ret
+endfunc
+.endm
+
+X264_COEFF_LEVEL_RUN 8
+X264_COEFF_LEVEL_RUN 15
+X264_COEFF_LEVEL_RUN 16
diff --git a/common/aarch64/quant.h b/common/aarch64/quant.h
index 5a797c1a..360af26f 100644
--- a/common/aarch64/quant.h
+++ b/common/aarch64/quant.h
@@ -49,4 +49,8 @@ int x264_coeff_last15_neon( int16_t * );
 int x264_coeff_last16_neon( int16_t * );
 int x264_coeff_last64_neon( int16_t * );
 
+int x264_coeff_level_run4_aarch64( int16_t *, x264_run_level_t * );
+int x264_coeff_level_run8_neon( int16_t *, x264_run_level_t * );
+int x264_coeff_level_run15_neon( int16_t *, x264_run_level_t * );
+int x264_coeff_level_run16_neon( int16_t *, x264_run_level_t * );
 #endif
diff --git a/common/quant.c b/common/quant.c
index d1b89c08..514e658e 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -754,9 +754,13 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
     {
         pf->coeff_last4 = x264_coeff_last4_aarch64;
         pf->coeff_last8 = x264_coeff_last8_aarch64;
+        pf->coeff_level_run4 = x264_coeff_level_run4_aarch64;
     }
     if( cpu&X264_CPU_NEON )
     {
+        pf->coeff_level_run8 = x264_coeff_level_run8_neon;
+        pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_neon;
+        pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_neon;
         pf->decimate_score15 = x264_decimate_score15_neon;
         pf->decimate_score16 = x264_decimate_score16_neon;
         pf->decimate_score64 = x264_decimate_score64_neon;
-- 
2.40.0