From b08403b5593307b919bfe5bfbd743da825326a4c Mon Sep 17 00:00:00 2001
From: =?utf8?q?Martin=20Storsj=C3=B6?=
Date: Tue, 25 Aug 2015 14:38:13 +0300
Subject: [PATCH] arm: Implement x264_denoise_dct_neon

checkasm timing       Cortex-A7      A8     A9
denoise_dct_c         6604           5510   5858
denoise_dct_neon      1774           1139   1614
---
 common/arm/quant-a.S | 29 +++++++++++++++++++++++++++++
 common/arm/quant.h   |  2 ++
 common/quant.c       |  2 +-
 3 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/common/arm/quant-a.S b/common/arm/quant-a.S
index ad8d8f84..e63170e3 100644
--- a/common/arm/quant-a.S
+++ b/common/arm/quant-a.S
@@ -4,6 +4,7 @@
  * Copyright (C) 2009-2015 x264 project
  *
  * Authors: David Conrad
+ *          Janne Grunau
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -404,3 +405,31 @@ function x264_coeff_last64_neon
     movlt           r0, #0
     bx              lr
 endfunc
+
+function x264_denoise_dct_neon               // args per the quant.h prototype: r0 = dctcoef * (rewritten in place), r1 = uint32_t * accumulators, r2 = udctcoef * values to subtract, r3 = int count; NOTE(review): presumably count is a positive multiple of 16 - confirm with callers
+1:  subs            r3, r3, #16              // 16 coefs per pass; nothing below touches the flags until bgt
+    vld1.16         {q0, q1}, [r0]           // load 16 signed 16-bit coefs; r0 is not advanced until the store
+    vld1.32         {q12, q13}, [r1]!        // 32-bit accumulators 0-7, r1 += 32
+    vld1.32         {q14, q15}, [r1]         // 32-bit accumulators 8-15
+    sub             r1, #32                  // rewind r1 to accumulator 0 for the write-back
+    vabs.s16        q8, q0                   // |coef| 0-7
+    vabs.s16        q9, q1                   // |coef| 8-15
+    vld1.16         {q2, q3}, [r2]!          // 16 unsigned 16-bit values from r2, r2 += 32
+    vclt.s16        q10, q0, #0              // per-lane mask: all ones where coef < 0
+    vclt.s16        q11, q1, #0
+    vaddw.u16       q12, q12, d16            // accumulators += |coef|, widened to 32 bit (d16/d17 = q8)
+    vaddw.u16       q13, q13, d17
+    vqsub.u16       q0, q8, q2               // |coef| minus the r2 value, saturating at 0
+    vqsub.u16       q1, q9, q3
+    vaddw.u16       q14, q14, d18            // same accumulation for coefs 8-15 (d18/d19 = q9)
+    vaddw.u16       q15, q15, d19
+    vneg.s16        q8, q0                   // negated copy of the reduced magnitude
+    vneg.s16        q9, q1
+    vbsl            q10, q8, q0              // mask set -> negated value, else positive: original sign restored
+    vbsl            q11, q9, q1
+    vst1.32         {q12, q13}, [r1]!        // write accumulators back; r1 ends 64 bytes past its start
+    vst1.32         {q14, q15}, [r1]!
+    vst1.16         {q10, q11}, [r0]!        // overwrite the 16 input coefs with the result, r0 += 32
+    bgt             1b                       // repeat while the remaining count is > 0 (body always runs at least once)
+    bx              lr
+endfunc
diff --git a/common/arm/quant.h b/common/arm/quant.h
index 8ea179a1..78178e8d 100644
--- a/common/arm/quant.h
+++ b/common/arm/quant.h
@@ -44,4 +44,6 @@ int x264_coeff_last15_neon( int16_t * );
 int x264_coeff_last16_neon( int16_t * );
 int x264_coeff_last64_neon( int16_t * );
 
+void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int );
+
 #endif
diff --git a/common/quant.c b/common/quant.c
index bc9e8d73..f8279a77 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -750,6 +750,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_neon;
         pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_neon;
         pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
+        pf->denoise_dct = x264_denoise_dct_neon;
     }
 #endif
 #if ARCH_AARCH64
@@ -767,7 +768,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->decimate_score15 = x264_decimate_score15_neon;
         pf->decimate_score16 = x264_decimate_score16_neon;
         pf->decimate_score64 = x264_decimate_score64_neon;
-        pf->denoise_dct = x264_denoise_dct_neon;
     }
 #endif
 
-- 
2.40.0