From: Johann
Date: Mon, 21 Aug 2017 18:23:49 +0000 (-0700)
Subject: quantize neon: round dqcoeff towards zero
X-Git-Tag: v1.7.0~219^2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=2a5aa98a35d295bbfd17d107630be84a5ccc1077;p=libvpx

quantize neon: round dqcoeff towards zero

Add 1 if negative to get dqcoeff to round towards zero.

10-15% faster than converting to positive before shifting.

Change-Id: I01a62fd0c9bca786b6885b318bd447bb9229903d
---

diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c
index e62933aaa..d2d094a62 100644
--- a/vpx_dsp/arm/quantize_neon.c
+++ b/vpx_dsp/arm/quantize_neon.c
@@ -10,6 +10,7 @@
 
 #include <arm_neon.h>
 
+#include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/arm/mem_neon.h"
 
@@ -154,6 +155,10 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   }
 }
 
+static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
+  return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
+}
+
 // Main difference is that zbin values are halved before comparison and dqcoeff
 // values are divided by 2. zbin is rounded but dqcoeff is not.
 void vpx_quantize_b_32x32_neon(
@@ -205,7 +210,7 @@ void vpx_quantize_b_32x32_neon(
 
     // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
     int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
     int16x8_t dqcoeff;
-    int32x4_t dqcoeff_0, dqcoeff_1, dqcoeff_0_sign, dqcoeff_1_sign;
+    int32x4_t dqcoeff_0, dqcoeff_1;
 
     qcoeff = vaddq_s16(qcoeff, rounded);
@@ -230,21 +235,12 @@ void vpx_quantize_b_32x32_neon(
     dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
     dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
 
-    // The way the C shifts the values requires us to convert to positive before
-    // shifting or even narrowing, then put the sign back.
-    dqcoeff_0_sign = vshrq_n_s32(dqcoeff_0, 31);
-    dqcoeff_1_sign = vshrq_n_s32(dqcoeff_1, 31);
-    dqcoeff_0 = vabsq_s32(dqcoeff_0);
-    dqcoeff_1 = vabsq_s32(dqcoeff_1);
-    dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1);
-    dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1);
-    dqcoeff_0 = veorq_s32(dqcoeff_0, dqcoeff_0_sign);
-    dqcoeff_1 = veorq_s32(dqcoeff_1, dqcoeff_1_sign);
-    dqcoeff_0 = vsubq_s32(dqcoeff_0, dqcoeff_0_sign);
-    dqcoeff_1 = vsubq_s32(dqcoeff_1, dqcoeff_1_sign);
-
-    // Narrow *without saturation* because that's what the C does.
-    dqcoeff = vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1));
+    // Add 1 if negative to round towards zero because the C uses division.
+    dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
+    dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
+
+    dqcoeff =
+        vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1));
 
     store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff);
     dqcoeff_ptr += 8;
@@ -274,7 +270,7 @@ void vpx_quantize_b_32x32_neon(
 
       // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
       int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
       int16x8_t dqcoeff;
-      int32x4_t dqcoeff_0, dqcoeff_1, dqcoeff_0_sign, dqcoeff_1_sign;
+      int32x4_t dqcoeff_0, dqcoeff_1;
 
       qcoeff = vaddq_s16(qcoeff, rounded);
@@ -300,18 +296,11 @@ void vpx_quantize_b_32x32_neon(
       dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
       dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
 
-      dqcoeff_0_sign = vshrq_n_s32(dqcoeff_0, 31);
-      dqcoeff_1_sign = vshrq_n_s32(dqcoeff_1, 31);
-      dqcoeff_0 = vabsq_s32(dqcoeff_0);
-      dqcoeff_1 = vabsq_s32(dqcoeff_1);
-      dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1);
-      dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1);
-      dqcoeff_0 = veorq_s32(dqcoeff_0, dqcoeff_0_sign);
-      dqcoeff_1 = veorq_s32(dqcoeff_1, dqcoeff_1_sign);
-      dqcoeff_0 = vsubq_s32(dqcoeff_0, dqcoeff_0_sign);
-      dqcoeff_1 = vsubq_s32(dqcoeff_1, dqcoeff_1_sign);
-
-      dqcoeff = vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1));
+      dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
+      dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
+
+      dqcoeff =
+          vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1));
 
       store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff);
       dqcoeff_ptr += 8;
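
The rounding identity the patch relies on: for a two's-complement value, an arithmetic shift right by 1 rounds towards negative infinity, while the C reference divides by 2, which truncates towards zero. Adding the sign bit (1 for negative values, 0 otherwise) before the shift makes the shift agree with the division; extract_sign_bit() produces that bit per lane, and vshrn_n_s32() then shifts and narrows in one step. A minimal scalar C sketch of the same identity, with the illustrative (not from the patch) helper name div2_toward_zero:

#include <assert.h>
#include <stdint.h>

/* Scalar model of the NEON sequence: the sign bit is 1 for negative values
 * and 0 otherwise, so adding it before an arithmetic >> 1 matches x / 2,
 * which truncates towards zero. */
static int32_t div2_toward_zero(int32_t x) {
  const int32_t sign_bit = (int32_t)((uint32_t)x >> 31); /* like extract_sign_bit() */
  return (x + sign_bit) >> 1; /* arithmetic shift on the targets libvpx builds for */
}

int main(void) {
  assert(div2_toward_zero(7) == 3);   /* 7 / 2 */
  assert(div2_toward_zero(-7) == -3); /* -7 >> 1 alone would give -4 */
  assert(div2_toward_zero(-1) == 0);  /* -1 >> 1 alone would give -1 */
  return 0;
}

The instruction-count saving is visible in the diff: the sign/abs/shift/xor/sub sequence per vector becomes an unsigned shift plus an add, with the final shift folded into the narrowing vshrn_n_s32, consistent with the quoted 10-15%.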