From: Scott LaVarnway Date: Fri, 8 Aug 2014 20:57:25 +0000 (-0700) Subject: Improved vp9_quantize_fp_neon() X-Git-Tag: v1.4.0~987^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=7035527c9a1acac4bcd8e10acd84f8ffb9285b35;p=libvpx Improved vp9_quantize_fp_neon() Eliminated instructions by using better neon instructions and rearranging the loop. On a Nexus 7, vpxenc (in realtime mode, speed -12) reported a performance improvement of ~1.0%. Change-Id: I6b1700e79318f647ea67ef25e954c308932950ec --- diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c index 2d5ec79b3..8c13d0da6 100644 --- a/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -28,7 +28,6 @@ void vp9_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - int i; // TODO(jingning) Decide the need of these arguments after the // quantization process is completed. (void)zbin_ptr; @@ -39,7 +38,7 @@ void vp9_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count, if (!skip_block) { // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. - + int i; const int16x8_t v_zero = vdupq_n_s16(0); const int16x8_t v_one = vdupq_n_s16(1); int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); @@ -50,13 +49,37 @@ void vp9_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count, v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0); v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0); - - for (i = 0; i < count; i += 8) { + // process dc and the first seven ac coeffs + { + const int16x8_t v_iscan = vld1q_s16(&iscan[0]); + const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[0]); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); + const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp), + vget_low_s16(v_quant)); + const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp), + vget_high_s16(v_quant)); + const int16x8_t v_tmp2 = vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), + vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); + v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); + vst1q_s16(&qcoeff_ptr[0], v_qcoeff); + vst1q_s16(&dqcoeff_ptr[0], v_dqcoeff); + v_round = vmovq_n_s16(round_ptr[1]); + v_quant = vmovq_n_s16(quant_ptr[1]); + v_dequant = vmovq_n_s16(dequant_ptr[1]); + } + // now process the rest of the ac coeffs + for (i = 8; i < count; i += 8) { const int16x8_t v_iscan = vld1q_s16(&iscan[i]); const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]); const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); - const int16x8_t v_abs_coeff = vabsq_s16(v_coeff); - const int16x8_t v_tmp = vqaddq_s16(v_abs_coeff, v_round); + const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp), @@ -65,19 +88,13 @@ void vp9_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count, vshrn_n_s32(v_tmp_hi, 16)); const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); - const int16x8_t v_nz_iscan = - vandq_s16(vmvnq_s16(vreinterpretq_s16_u16(v_nz_mask)), v_iscan_plus1); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); - v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); - vst1q_s16(&qcoeff_ptr[i], v_qcoeff); vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff); - v_round = vmovq_n_s16(round_ptr[1]); - v_quant = vmovq_n_s16(quant_ptr[1]); - v_dequant = vmovq_n_s16(dequant_ptr[1]); } { const int16x4_t v_eobmax_3210 =