From: Frank Galligan Date: Fri, 16 Jan 2015 03:29:46 +0000 (-0800) Subject: Add Neon intrinsic vp9_fdct8x8_quant_neon X-Git-Tag: v1.4.0~222 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9f6eba419a876598aa7edaee9e99610fcbf6362b;p=libvpx Add Neon intrinsic vp9_fdct8x8_quant_neon On Nexus 7 speed -5 got ~2%, -6 got ~15%, -7 and -8 got ~30% increase in perf. Tested on Nexus 7, built with ndk r10d, gcc 4.9. Change-Id: I83246d63b96674d170098a572fa4fe28a05aaf51 --- diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 947d8c7bb..12f076fed 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -1166,7 +1166,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64"; add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vp9_fdct8x8_quant sse2 ssse3/; + specialize qw/vp9_fdct8x8_quant sse2 ssse3 neon/; } # diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c index 6c66f5d5b..a6d4797ad 100644 --- a/vp9/encoder/arm/neon/vp9_dct_neon.c +++ b/vp9/encoder/arm/neon/vp9_dct_neon.c @@ -32,6 +32,24 @@ void vp9_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) { } } +void vp9_fdct8x8_quant_neon(const int16_t *input, int stride, + int16_t* coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t* zbin_ptr, + const int16_t* round_ptr, const int16_t* quant_ptr, + const int16_t* quant_shift_ptr, + int16_t* qcoeff_ptr, int16_t* dqcoeff_ptr, + const int16_t* dequant_ptr, uint16_t* eob_ptr, + const int16_t* scan_ptr, + const int16_t* iscan_ptr) { + int16_t temp_buffer[64]; + (void)coeff_ptr; + + vp9_fdct8x8_neon(input, temp_buffer, stride); + vp9_quantize_fp_neon(temp_buffer, n_coeffs, skip_block, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan_ptr, iscan_ptr); +} + void vp9_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { int i; // stage 1