From: Janne Grunau
Date: Tue, 21 Oct 2014 13:18:49 +0000 (+0200)
Subject: aarch64: x264_denoise_dct_neon
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=4d400a6ec67f17ae3b17876b0318b956b6d5c856;p=libx264

aarch64: x264_denoise_dct_neon

3.5 times faster.
---

diff --git a/common/aarch64/quant-a.S b/common/aarch64/quant-a.S
index d3b2933b..f4be81b0 100644
--- a/common/aarch64/quant-a.S
+++ b/common/aarch64/quant-a.S
@@ -574,3 +574,28 @@ endfunc
 X264_COEFF_LEVEL_RUN 8
 X264_COEFF_LEVEL_RUN 15
 X264_COEFF_LEVEL_RUN 16
+
+function x264_denoise_dct_neon, export=1                // x0: 16-bit coefs (rewritten in place), x1: 32-bit accumulators (updated in place), x2: 16-bit thresholds, w3: element count
+1:  subs        w3,  w3,  #16                           // count -= 16; the flags set here are consumed by b.gt at the bottom. NOTE(review): body always runs once and handles 16 elements per pass - presumably count is a positive multiple of 16, confirm against callers
+    ld1         {v0.8h,v1.8h}, [x0]                     // load 16 coefficients (no writeback; x0 is advanced by the final store)
+    ld1         {v4.4s,v5.4s,v6.4s,v7.4s}, [x1]         // load the 16 matching 32-bit accumulators (no writeback)
+    abs         v16.8h, v0.8h                           // |coef| for elements 0-7
+    abs         v17.8h, v1.8h                           // |coef| for elements 8-15
+    ld1         {v2.8h,v3.8h}, [x2], #32                // load 16 thresholds, x2 += 32 bytes
+    cmlt        v18.8h, v0.8h,  #0                      // all-ones lane mask where coef < 0 (elements 0-7)
+    cmlt        v19.8h, v1.8h,  #0                      // same sign mask for elements 8-15
+    uaddw       v4.4s,  v4.4s,  v16.4h                  // acc[0..3]   += zero-extended |coef[0..3]|
+    uaddw2      v5.4s,  v5.4s,  v16.8h                  // acc[4..7]   += zero-extended |coef[4..7]|
+    uqsub       v20.8h, v16.8h, v2.8h                   // |coef| - threshold with unsigned saturation, i.e. clamped at 0 (elements 0-7)
+    uqsub       v21.8h, v17.8h, v3.8h                   // same for elements 8-15
+    uaddw       v6.4s,  v6.4s,  v17.4h                  // acc[8..11]  += zero-extended |coef[8..11]|
+    uaddw2      v7.4s,  v7.4s,  v17.8h                  // acc[12..15] += zero-extended |coef[12..15]|
+    neg         v22.8h, v20.8h                          // negated magnitudes, used for lanes whose input was negative
+    neg         v23.8h, v21.8h                          // same for elements 8-15
+    bsl         v18.16b, v22.16b, v20.16b               // per bit: mask ? negated : positive -> original sign restored (elements 0-7)
+    bsl         v19.16b, v23.16b, v21.16b               // same for elements 8-15
+    st1         {v4.4s,v5.4s,v6.4s,v7.4s}, [x1], #64    // write back updated accumulators, x1 += 64 bytes
+    st1         {v18.8h,v19.8h}, [x0], #32              // overwrite the 16 coefficients with the results, x0 += 32 bytes
+    b.gt        1b                                      // loop while the remaining count (from subs above) is > 0
+    ret                                                 // no return value is set; results are the in-place memory updates
+endfunc
diff --git a/common/aarch64/quant.h b/common/aarch64/quant.h
index 360af26f..a06e78ee 100644
--- a/common/aarch64/quant.h
+++ b/common/aarch64/quant.h
@@ -53,4 +53,7 @@ int x264_coeff_level_run4_aarch64( int16_t *, x264_run_level_t * );
 int x264_coeff_level_run8_neon( int16_t *, x264_run_level_t * );
 int x264_coeff_level_run15_neon( int16_t *, x264_run_level_t * );
 int x264_coeff_level_run16_neon( int16_t *, x264_run_level_t * );
+
+void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int );
+
 #endif
diff --git a/common/quant.c b/common/quant.c
index 514e658e..c3392bcf 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -764,6 +764,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->decimate_score15 = x264_decimate_score15_neon;
         pf->decimate_score16 = x264_decimate_score16_neon;
         pf->decimate_score64 = x264_decimate_score64_neon;
+        pf->denoise_dct = x264_denoise_dct_neon;
     }
 #endif
 #endif // HIGH_BIT_DEPTH