From 81a98509dcd44e939656d8c281e5a5ae5b9926be Mon Sep 17 00:00:00 2001
From: Luc Trudeau
Date: Thu, 10 May 2018 16:54:13 -0400
Subject: [PATCH] Faster VSX vpx_quantize_b

Process 16 coefficients on the first iteration (a full 4x4) and 24
coefficients on subsequent iterations.

VSX/VP9QuantizeTest.DISABLED_Speed
Before:
  4x4   176 ms
  8x8    91 ms
  16x16  72 ms
After:
  4x4   152 ms
  8x8    82 ms
  16x16  64 ms

Change-Id: I07cb130833504206ccdc5bc12ae5af369364999a
---
 vpx_dsp/ppc/quantize_vsx.c | 117 ++++++++++++++++++++++++-------------
 1 file changed, 76 insertions(+), 41 deletions(-)

diff --git a/vpx_dsp/ppc/quantize_vsx.c b/vpx_dsp/ppc/quantize_vsx.c
index e037f89e3..361f9e3d3 100644
--- a/vpx_dsp/ppc/quantize_vsx.c
+++ b/vpx_dsp/ppc/quantize_vsx.c
@@ -41,10 +41,9 @@ static INLINE int16x8_t quantize_coeff(int16x8_t coeff, int16x8_t coeff_abs,
 }
 
 static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, bool16x8_t mask,
-                                          const int16_t *iscan_ptr) {
-  bool16x8_t zero_coeff;
-  int16x8_t scan = vec_vsx_ld(0, iscan_ptr);
-  zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16);
+                                          const int16_t *iscan_ptr, int index) {
+  int16x8_t scan = vec_vsx_ld(index, iscan_ptr);
+  bool16x8_t zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16);
   scan = vec_sub(scan, mask);
   return vec_andc(scan, zero_coeff);
 }
@@ -64,7 +63,8 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                         uint16_t *eob_ptr, const int16_t *scan_ptr,
                         const int16_t *iscan_ptr) {
-  int16x8_t qcoeff, dqcoeff, eob;
+  int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob;
+  bool16x8_t zero_mask0, zero_mask1;
 
   // First set of 8 coeff starts with DC + 7 AC
   int16x8_t zbin = vec_vsx_ld(0, zbin_ptr);
@@ -73,51 +73,86 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   int16x8_t dequant = vec_vsx_ld(0, dequant_ptr);
   int16x8_t quant_shift = vec_vsx_ld(0, quant_shift_ptr);
 
-  int16x8_t coeff = vec_vsx_ld(0, coeff_ptr);
-  int16x8_t coeff_abs = vec_abs(coeff);
-  bool16x8_t zero_mask = vec_cmpge(coeff_abs, zbin);
+  int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr);
+  int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr);
+
+  int16x8_t coeff0_abs = vec_abs(coeff0);
+  int16x8_t coeff1_abs = vec_abs(coeff1);
+
+  zero_mask0 = vec_cmpge(coeff0_abs, zbin);
+  zbin = vec_splat(zbin, 1);
+  zero_mask1 = vec_cmpge(coeff1_abs, zbin);
 
   (void)scan_ptr;
   (void)skip_block;
   assert(!skip_block);
 
-  qcoeff =
-      quantize_coeff(coeff, coeff_abs, round, quant, quant_shift, zero_mask);
-  vec_vsx_st(qcoeff, 0, qcoeff_ptr);
-
-  dqcoeff = vec_mladd(qcoeff, dequant, vec_zeros_s16);
-  vec_vsx_st(dqcoeff, 0, dqcoeff_ptr);
-
-  eob = nonzero_scanindex(qcoeff, zero_mask, iscan_ptr);
-
-  // All other sets of 8 coeffs will only contain AC
-  zbin = vec_splat(zbin, 1);
+  qcoeff0 =
+      quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift, zero_mask0);
+  vec_vsx_st(qcoeff0, 0, qcoeff_ptr);
   round = vec_splat(round, 1);
   quant = vec_splat(quant, 1);
-  dequant = vec_splat(dequant, 1);
   quant_shift = vec_splat(quant_shift, 1);
+  qcoeff1 =
+      quantize_coeff(coeff1, coeff1_abs, round, quant, quant_shift, zero_mask1);
+  vec_vsx_st(qcoeff1, 16, qcoeff_ptr);
 
-  n_coeffs -= 8;
-  do {
-    coeff_ptr += 8;
-    qcoeff_ptr += 8;
-    dqcoeff_ptr += 8;
-    iscan_ptr += 8;
-
-    coeff = vec_vsx_ld(0, coeff_ptr);
-    coeff_abs = vec_abs(coeff);
-    zero_mask = vec_cmpge(coeff_abs, zbin);
-    qcoeff =
-        quantize_coeff(coeff, coeff_abs, round, quant, quant_shift, zero_mask);
-    vec_vsx_st(qcoeff, 0, qcoeff_ptr);
-
-    dqcoeff = vec_mladd(qcoeff, dequant, vec_zeros_s16);
-    vec_vsx_st(dqcoeff, 0, dqcoeff_ptr);
-
-    eob = vec_max(eob, nonzero_scanindex(qcoeff, zero_mask, iscan_ptr));
-
-    n_coeffs -= 8;
-  } while (n_coeffs > 0);
+  dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16);
+  vec_vsx_st(dqcoeff0, 0, dqcoeff_ptr);
+  dequant = vec_splat(dequant, 1);
+  dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16);
+  vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr);
+
+  eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0),
+                nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16));
+
+  if (n_coeffs > 16) {
+    int index = 16;
+    int off0 = 32;
+    int off1 = 48;
+    int off2 = 64;
+    do {
+      int16x8_t coeff2, coeff2_abs, qcoeff2, dqcoeff2, eob2;
+      bool16x8_t zero_mask2;
+      coeff0 = vec_vsx_ld(off0, coeff_ptr);
+      coeff1 = vec_vsx_ld(off1, coeff_ptr);
+      coeff2 = vec_vsx_ld(off2, coeff_ptr);
+      coeff0_abs = vec_abs(coeff0);
+      coeff1_abs = vec_abs(coeff1);
+      coeff2_abs = vec_abs(coeff2);
+      zero_mask0 = vec_cmpge(coeff0_abs, zbin);
+      zero_mask1 = vec_cmpge(coeff1_abs, zbin);
+      zero_mask2 = vec_cmpge(coeff2_abs, zbin);
+      qcoeff0 = quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift,
+                               zero_mask0);
+      qcoeff1 = quantize_coeff(coeff1, coeff1_abs, round, quant, quant_shift,
+                               zero_mask1);
+      qcoeff2 = quantize_coeff(coeff2, coeff2_abs, round, quant, quant_shift,
+                               zero_mask2);
+      vec_vsx_st(qcoeff0, off0, qcoeff_ptr);
+      vec_vsx_st(qcoeff1, off1, qcoeff_ptr);
+      vec_vsx_st(qcoeff2, off2, qcoeff_ptr);
+
+      dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16);
+      dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16);
+      dqcoeff2 = vec_mladd(qcoeff2, dequant, vec_zeros_s16);
+
+      vec_vsx_st(dqcoeff0, off0, dqcoeff_ptr);
+      vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr);
+      vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr);
+
+      eob =
+          vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0));
+      eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1),
+                     nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2));
+      eob = vec_max(eob, eob2);
+
+      index += 24;
+      off0 += 48;
+      off1 += 48;
+      off2 += 48;
+    } while (index < n_coeffs);
+  }
 
   eob = vec_max_across(eob);
   *eob_ptr = eob[0];
--
2.40.0
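For reference, a minimal scalar sketch (plain C, not from the patch; names and the printed layout are illustrative) of the traversal the revised code performs: two 8-lane vectors (coefficients 0..15 at byte offsets 0 and 16) before the loop, then three vectors (24 coefficients, 48 bytes) per iteration, which divides the remaining coefficients evenly for 8x8 (64) and 16x16 (256) blocks.

  #include <stdio.h>

  /* Illustrative only: the coefficient indices and byte offsets visited by
   * the new traversal. Each int16 coefficient is 2 bytes, so one 8-lane
   * vector spans 16 bytes. */
  int main(void) {
    const int n_coeffs = 256; /* e.g. a 16x16 block */
    int index = 16;           /* coeffs 0..15 handled before the loop */
    int off0 = 32, off1 = 48, off2 = 64;

    printf("prologue: coeffs 0..15 at byte offsets 0 and 16\n");
    while (index < n_coeffs) {
      printf("loop: coeffs %3d..%3d at byte offsets %d, %d, %d\n", index,
             index + 23, off0, off1, off2);
      index += 24;
      off0 += 48;
      off1 += 48;
      off2 += 48;
    }
    return 0;
  }

The 16-coefficient prologue also lets the DC-specific zbin/round/quant/dequant lanes be splatted to their AC values exactly once, so the steady-state loop works on three independent vectors with no per-iteration splats.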