From b0adbb6c22f514ac66ef32ee1265983a52eb2934 Mon Sep 17 00:00:00 2001
From: Luc Trudeau
Date: Tue, 26 Jun 2018 17:06:52 -0400
Subject: [PATCH] [VSX] Replace vec_pack and vec_perm with single vec_perm

vpx_quantize_b:
VP9QuantizeTest Speed Test (POWER8 Model 2.1)
32x32 Old VSX time = 8.1 ms, new VSX time = 7.9 ms

vp9_quantize_fp:
VP9QuantizeTest Speed Test (POWER8 Model 2.1)
32x32 Old VSX time = 6.5 ms, new VSX time = 6.2 ms

Change-Id: Ic2183e8bd721bb69eaeb4865b542b656255a0870
---
 vp9/encoder/ppc/vp9_quantize_vsx.c |  4 +---
 vpx_dsp/ppc/fdct32x32_vsx.c        | 12 ++++--------
 vpx_dsp/ppc/quantize_vsx.c         |  4 +---
 vpx_dsp/ppc/types_vsx.h            |  7 ++++---
 4 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/vp9/encoder/ppc/vp9_quantize_vsx.c b/vp9/encoder/ppc/vp9_quantize_vsx.c
index f6fdb55a6..3720b0876 100644
--- a/vp9/encoder/ppc/vp9_quantize_vsx.c
+++ b/vp9/encoder/ppc/vp9_quantize_vsx.c
@@ -154,7 +154,6 @@ static INLINE int32x4_t vec_is_neg(int32x4_t a) {
 // vec_mladd results in overflow.
 static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff,
                                             int16x8_t dequant) {
-  int16x8_t dqcoeff;
   int32x4_t dqcoeffe = vec_mule(qcoeff, dequant);
   int32x4_t dqcoeffo = vec_mulo(qcoeff, dequant);
   // Add 1 if negative to round towards zero because the C uses division.
@@ -162,8 +161,7 @@ static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff,
   dqcoeffo = vec_add(dqcoeffo, vec_is_neg(dqcoeffo));
   dqcoeffe = vec_sra(dqcoeffe, vec_ones_u32);
   dqcoeffo = vec_sra(dqcoeffo, vec_ones_u32);
-  dqcoeff = vec_pack(dqcoeffe, dqcoeffo);
-  return vec_perm(dqcoeff, dqcoeff, vec_perm_merge);
+  return (int16x8_t)vec_perm(dqcoeffe, dqcoeffo, vec_perm_odd_even_pack);
 }
 
 void vp9_quantize_fp_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
diff --git a/vpx_dsp/ppc/fdct32x32_vsx.c b/vpx_dsp/ppc/fdct32x32_vsx.c
index 65ff00006..0156683c2 100644
--- a/vpx_dsp/ppc/fdct32x32_vsx.c
+++ b/vpx_dsp/ppc/fdct32x32_vsx.c
@@ -15,10 +15,6 @@
 #include "vpx_dsp/ppc/txfm_common_vsx.h"
 #include "vpx_dsp/ppc/transpose_vsx.h"
 
-static const uint8x16_t vec_perm_pack = { 0x00, 0x01, 0x10, 0x11, 0x04, 0x05,
-                                          0x14, 0x15, 0x08, 0x09, 0x18, 0x19,
-                                          0x0C, 0x0D, 0x1C, 0x1D };
-
 // Returns ((a +/- b) * cospi16 + (2 << 13)) >> 14.
 static INLINE void single_butterfly(int16x8_t a, int16x8_t b, int16x8_t *add,
                                     int16x8_t *sub) {
@@ -47,8 +43,8 @@ static INLINE void single_butterfly(int16x8_t a, int16x8_t b, int16x8_t *add,
   const int32x4_t sdiff_e = vec_sra(rdiff_e, vec_dct_const_bits);
 
   // There's no pack operation for even and odd, so we need to permute.
-  *add = (int16x8_t)vec_perm(ssum_e, ssum_o, vec_perm_pack);
-  *sub = (int16x8_t)vec_perm(sdiff_e, sdiff_o, vec_perm_pack);
+  *add = (int16x8_t)vec_perm(ssum_e, ssum_o, vec_perm_odd_even_pack);
+  *sub = (int16x8_t)vec_perm(sdiff_e, sdiff_o, vec_perm_odd_even_pack);
 }
 
 // Returns (a * c1 +/- b * c2 + (2 << 13)) >> 14
@@ -82,8 +78,8 @@ static INLINE void double_butterfly(int16x8_t a, int16x8_t c1, int16x8_t b,
   const int32x4_t sdiff_e = vec_sra(rdiff_e, vec_dct_const_bits);
 
   // There's no pack operation for even and odd, so we need to permute.
-  *add = (int16x8_t)vec_perm(ssum_e, ssum_o, vec_perm_pack);
-  *sub = (int16x8_t)vec_perm(sdiff_e, sdiff_o, vec_perm_pack);
+  *add = (int16x8_t)vec_perm(ssum_e, ssum_o, vec_perm_odd_even_pack);
+  *sub = (int16x8_t)vec_perm(sdiff_e, sdiff_o, vec_perm_odd_even_pack);
 }
 
 // While other architecture combine the load and the stage 1 operations, Power9
diff --git a/vpx_dsp/ppc/quantize_vsx.c b/vpx_dsp/ppc/quantize_vsx.c
index 3a9092f64..d85e63bd1 100644
--- a/vpx_dsp/ppc/quantize_vsx.c
+++ b/vpx_dsp/ppc/quantize_vsx.c
@@ -68,7 +68,6 @@ static INLINE int16x8_t quantize_coeff_32(int16x8_t coeff, int16x8_t coeff_abs,
 // vec_mladd results in overflow.
 static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff,
                                             int16x8_t dequant) {
-  int16x8_t dqcoeff;
   int32x4_t dqcoeffe = vec_mule(qcoeff, dequant);
   int32x4_t dqcoeffo = vec_mulo(qcoeff, dequant);
   // Add 1 if negative to round towards zero because the C uses division.
@@ -76,8 +75,7 @@ static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff,
   dqcoeffo = vec_add(dqcoeffo, vec_is_neg(dqcoeffo));
   dqcoeffe = vec_sra(dqcoeffe, vec_ones_u32);
   dqcoeffo = vec_sra(dqcoeffo, vec_ones_u32);
-  dqcoeff = vec_pack(dqcoeffe, dqcoeffo);
-  return vec_perm(dqcoeff, dqcoeff, vec_perm_merge);
+  return (int16x8_t)vec_perm(dqcoeffe, dqcoeffo, vec_perm_odd_even_pack);
 }
 
 static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, bool16x8_t mask,
diff --git a/vpx_dsp/ppc/types_vsx.h b/vpx_dsp/ppc/types_vsx.h
index 7a9f9c092..56ecdce59 100644
--- a/vpx_dsp/ppc/types_vsx.h
+++ b/vpx_dsp/ppc/types_vsx.h
@@ -96,8 +96,9 @@ static const uint8x16_t vec_perm16 = { 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                                        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
                                        0x0E, 0x0F, 0x00, 0x01 };
 
-static const uint8x16_t vec_perm_merge = { 0x00, 0x01, 0x08, 0x09, 0x02, 0x03,
-                                           0x0A, 0x0B, 0x04, 0x05, 0x0C, 0x0D,
-                                           0x06, 0x07, 0x0E, 0x0F };
+static const uint8x16_t vec_perm_odd_even_pack = { 0x00, 0x01, 0x10, 0x11,
+                                                   0x04, 0x05, 0x14, 0x15,
+                                                   0x08, 0x09, 0x18, 0x19,
+                                                   0x0C, 0x0D, 0x1C, 0x1D };
 
 #endif  // VPX_DSP_PPC_TYPES_VSX_H_
-- 
2.40.0
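
Illustration (not part of the patch): the standalone sketch below shows why
the single vec_perm can replace the old vec_pack + vec_perm pair. It is a
minimal sketch, assuming a little-endian POWER8-or-later toolchain
(gcc -maltivec), which is what this VSX code targets; the typedefs mirror
those in vpx_dsp/ppc/types_vsx.h, and the values in e/o are made-up
stand-ins for vec_mule/vec_mulo output, not libvpx data.

#include <altivec.h>
#include <stdio.h>

typedef vector signed short int16x8_t;
typedef vector signed int int32x4_t;
typedef vector unsigned char uint8x16_t;

// Old mask: restores element order after vec_pack has narrowed the
// even-product vector into the low half and the odd-product vector into
// the high half of the result.
static const uint8x16_t vec_perm_merge = { 0x00, 0x01, 0x08, 0x09, 0x02, 0x03,
                                           0x0A, 0x0B, 0x04, 0x05, 0x0C, 0x0D,
                                           0x06, 0x07, 0x0E, 0x0F };

// New mask: indices 0x00-0x0F select bytes of the first operand and
// 0x10-0x1F bytes of the second, so each index pair grabs the low 16 bits
// of one 32-bit lane, alternating between the even and odd vectors.
static const uint8x16_t vec_perm_odd_even_pack = { 0x00, 0x01, 0x10, 0x11,
                                                   0x04, 0x05, 0x14, 0x15,
                                                   0x08, 0x09, 0x18, 0x19,
                                                   0x0C, 0x0D, 0x1C, 0x1D };

int main(void) {
  // Stand-ins for vec_mule/vec_mulo output: e holds results for input
  // elements 0,2,4,6 and o holds results for elements 1,3,5,7.
  const int32x4_t e = { 10, 12, 14, 16 };
  const int32x4_t o = { 11, 13, 15, 17 };

  // Old sequence: narrow to 16 bits first, then reorder.
  const int16x8_t packed = vec_pack(e, o);
  const int16x8_t old_way = vec_perm(packed, packed, vec_perm_merge);

  // New sequence: one permute narrows and interleaves in a single step.
  const int16x8_t new_way = (int16x8_t)vec_perm(e, o, vec_perm_odd_even_pack);

  for (int i = 0; i < 8; i++)  // both columns print 10 11 12 ... 17
    printf("%d %d\n", old_way[i], new_way[i]);
  return 0;
}

The reason one permute suffices: on little endian the low 16 bits of each
32-bit lane sit at byte offsets 4k and 4k+1, so a permute whose control
vector reads those byte pairs alternately from the even and odd vectors
performs the narrowing and the interleave at once, saving the vec_pack.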