From: Kaustubh Raste
Date: Wed, 22 Mar 2017 08:31:03 +0000 (+0530)
Subject: Fix mips msa fwd xform mismatch
X-Git-Tag: v1.7.0~600^2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e45c1f55b4e8d10a3fe66a986749c849c72fae58;p=libvpx

Fix mips msa fwd xform mismatch

Change-Id: I32a6df11463144aa1a562256ee7d57a41fd678d6
---

diff --git a/vpx_dsp/mips/fwd_dct32x32_msa.c b/vpx_dsp/mips/fwd_dct32x32_msa.c
index e41a90480..06fdc951e 100644
--- a/vpx_dsp/mips/fwd_dct32x32_msa.c
+++ b/vpx_dsp/mips/fwd_dct32x32_msa.c
@@ -927,21 +927,21 @@ void vpx_fdct32x32_rd_msa(const int16_t *input, int16_t *out,
 }
 
 void vpx_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
-  int sum = LD_HADD(input, stride);
-  sum += LD_HADD(input + 8, stride);
-  sum += LD_HADD(input + 16, stride);
-  sum += LD_HADD(input + 24, stride);
-  sum += LD_HADD(input + 32 * 8, stride);
-  sum += LD_HADD(input + 32 * 8 + 8, stride);
-  sum += LD_HADD(input + 32 * 8 + 16, stride);
-  sum += LD_HADD(input + 32 * 8 + 24, stride);
-  sum += LD_HADD(input + 32 * 16, stride);
-  sum += LD_HADD(input + 32 * 16 + 8, stride);
-  sum += LD_HADD(input + 32 * 16 + 16, stride);
-  sum += LD_HADD(input + 32 * 16 + 24, stride);
-  sum += LD_HADD(input + 32 * 24, stride);
-  sum += LD_HADD(input + 32 * 24 + 8, stride);
-  sum += LD_HADD(input + 32 * 24 + 16, stride);
-  sum += LD_HADD(input + 32 * 24 + 24, stride);
+  int sum, i;
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v4i32 vec_w = { 0 };
+
+  for (i = 0; i < 16; ++i) {
+    LD_SH4(input, 8, in0, in1, in2, in3);
+    input += stride;
+    LD_SH4(input, 8, in4, in5, in6, in7);
+    input += stride;
+    ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6);
+    ADD2(in0, in2, in4, in6, in0, in4);
+    vec_w += __msa_hadd_s_w(in0, in0);
+    vec_w += __msa_hadd_s_w(in4, in4);
+  }
+
+  sum = HADD_SW_S32(vec_w);
   out[0] = (int16_t)(sum >> 3);
 }
diff --git a/vpx_dsp/mips/fwd_txfm_msa.c b/vpx_dsp/mips/fwd_txfm_msa.c
index fdead5050..f786664bb 100644
--- a/vpx_dsp/mips/fwd_txfm_msa.c
+++ b/vpx_dsp/mips/fwd_txfm_msa.c
@@ -216,7 +216,15 @@ void vpx_fdct8x8_msa(const int16_t *input, int16_t *output,
 }
 
 void vpx_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
-  out[0] = LD_HADD(input, stride);
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v4i32 vec_w;
+
+  LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7);
+  ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6);
+  ADD2(in0, in2, in4, in6, in0, in4);
+  vec_w = __msa_hadd_s_w(in0, in0);
+  vec_w += __msa_hadd_s_w(in4, in4);
+  out[0] = HADD_SW_S32(vec_w);
   out[1] = 0;
 }
 
@@ -237,9 +245,25 @@ void vpx_fdct16x16_msa(const int16_t *input, int16_t *output,
 }
 
 void vpx_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
-  int sum = LD_HADD(input, stride);
-  sum += LD_HADD(input + 8, stride);
-  sum += LD_HADD(input + 16 * 8, stride);
-  sum += LD_HADD(input + 16 * 8 + 8, stride);
+  int sum, i;
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v4i32 vec_w = { 0 };
+
+  for (i = 0; i < 4; ++i) {
+    LD_SH2(input, 8, in0, in1);
+    input += stride;
+    LD_SH2(input, 8, in2, in3);
+    input += stride;
+    LD_SH2(input, 8, in4, in5);
+    input += stride;
+    LD_SH2(input, 8, in6, in7);
+    input += stride;
+    ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6);
+    ADD2(in0, in2, in4, in6, in0, in4);
+    vec_w += __msa_hadd_s_w(in0, in0);
+    vec_w += __msa_hadd_s_w(in4, in4);
+  }
+
+  sum = HADD_SW_S32(vec_w);
   out[0] = (int16_t)(sum >> 1);
 }
diff --git a/vpx_dsp/mips/fwd_txfm_msa.h b/vpx_dsp/mips/fwd_txfm_msa.h
index db5e90e7b..fd589224d 100644
--- a/vpx_dsp/mips/fwd_txfm_msa.h
+++ b/vpx_dsp/mips/fwd_txfm_msa.h
@@ -14,22 +14,6 @@
 #include "vpx_dsp/mips/txfm_macros_msa.h"
 #include "vpx_dsp/txfm_common.h"
 
-#define LD_HADD(psrc, stride)                                                  \
-  ({                                                                           \
-    v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m;              \
-    v4i32 vec_w_m;                                                             \
-                                                                               \
-    LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m);                        \
-    ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m);                            \
-    LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m);         \
-    ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, in4_m, in6_m, \
-         in0_m, in4_m);                                                        \
-    in0_m += in4_m;                                                            \
-                                                                               \
-    vec_w_m = __msa_hadd_s_w(in0_m, in0_m);                                    \
-    HADD_SW_S32(vec_w_m);                                                      \
-  })
-
 #define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3)                  \
   {                                                                            \
     v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m;                                  \