From 8894c766c6d9460d12e52d350e7084b143d6109b Mon Sep 17 00:00:00 2001
From: Jerome Jiang
Date: Thu, 1 Aug 2019 10:48:35 -0700
Subject: [PATCH] Fix saturation issue in vp9_quantize_fp_neon

Change-Id: I7850a5c5aea3633e50e9a2efc8116b9e16383a8f
---
 test/vp9_quantize_test.cc                |  5 +++++
 vp9/encoder/arm/neon/vp9_quantize_neon.c | 10 ++++++----
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc
index cce6b6f19..d094904f1 100644
--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc
@@ -77,7 +77,12 @@ class VP9QuantizeBase : public AbstractBench {
         coeff_(Buffer(max_size_, max_size_, 0, 16)),
         qcoeff_(Buffer(max_size_, max_size_, 0, 32)),
         dqcoeff_(Buffer(max_size_, max_size_, 0, 32)) {
+    // TODO(jianj): SSSE3 and AVX2 tests fail on extreme values.
+#if HAVE_NEON
+    max_value_ = (1 << (7 + bit_depth_)) - 1;
+#else
     max_value_ = (1 << bit_depth_) - 1;
+#endif
     zbin_ptr_ =
         reinterpret_cast(vpx_memalign(16, 8 * sizeof(*zbin_ptr_)));
     round_fp_ptr_ = reinterpret_cast(
diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c
index 8b62b450c..3fd9dff21 100644
--- a/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -55,7 +55,8 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
     const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
     const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
     const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
-    const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
+    const int16x8_t v_abs = vabsq_s16(v_coeff);
+    const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
     const int32x4_t v_tmp_lo =
         vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
     const int32x4_t v_tmp_hi =
@@ -80,7 +81,8 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
       const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
       const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr + i);
       const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
-      const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
+      const int16x8_t v_abs = vabsq_s16(v_coeff);
+      const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
       const int32x4_t v_tmp_lo =
           vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
       const int32x4_t v_tmp_hi =
@@ -146,7 +148,7 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
   const int16x8_t dequant_mask =
       vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh));
 
-  int16x8_t qcoeff = vaddq_s16(coeff_abs, round);
+  int16x8_t qcoeff = vqaddq_s16(coeff_abs, round);
   int32x4_t dqcoeff_0, dqcoeff_1;
   int16x8_t dqcoeff;
   uint16x8_t eob_max;
@@ -200,7 +202,7 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
       const int16x8_t dequant_mask =
           vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh));
 
-      int16x8_t qcoeff = vaddq_s16(coeff_abs, round);
+      int16x8_t qcoeff = vqaddq_s16(coeff_abs, round);
       int32x4_t dqcoeff_0, dqcoeff_1;
       int16x8_t dqcoeff;
 
-- 
2.40.0