From fdb60962f4f98ec538d4c6ee161c391a4be9322d Mon Sep 17 00:00:00 2001 From: Urvang Joshi Date: Fri, 14 Oct 2016 15:30:27 -0700 Subject: [PATCH] Fix warnings reported by -Wshadow: Part1: aom_dsp directory While we are at it: - Rename some variables to more meaningful names - Reuse some common consts from a header instead of redefining them. Cherry-picked from aomedia/master: 09eea2193 Change-Id: I61030e773137ae107d3bd43556c0d5bb26f9dbf8 --- aom_dsp/arm/fwd_txfm_neon.c | 8 +- aom_dsp/arm/idct16x16_add_neon.c | 58 ++++++------- aom_dsp/arm/idct4x4_add_neon.c | 11 ++- aom_dsp/arm/idct8x8_1_add_neon.c | 2 +- aom_dsp/arm/idct8x8_add_neon.c | 32 +++---- aom_dsp/fwd_txfm.c | 138 +++++++++++++++---------------- aom_dsp/mips/aom_convolve_msa.h | 27 +++--- aom_dsp/mips/inv_txfm_msa.h | 14 ++-- aom_dsp/mips/loopfilter_msa.h | 94 ++++++++++----------- aom_dsp/mips/macros_msa.h | 31 ++++--- aom_dsp/x86/inv_txfm_sse2.c | 6 +- 11 files changed, 207 insertions(+), 214 deletions(-) diff --git a/aom_dsp/arm/fwd_txfm_neon.c b/aom_dsp/arm/fwd_txfm_neon.c index 17ce29e51..1cf8a3a6e 100644 --- a/aom_dsp/arm/fwd_txfm_neon.c +++ b/aom_dsp/arm/fwd_txfm_neon.c @@ -53,10 +53,10 @@ void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64); v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64); v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64); - v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64); - v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64); - v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64); - v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64); + v_t0_lo = vmulq_n_s32(v_t0_lo, (int32_t)cospi_16_64); + v_t0_hi = vmulq_n_s32(v_t0_hi, (int32_t)cospi_16_64); + v_t1_lo = vmulq_n_s32(v_t1_lo, (int32_t)cospi_16_64); + v_t1_hi = vmulq_n_s32(v_t1_hi, (int32_t)cospi_16_64); { const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); diff --git a/aom_dsp/arm/idct16x16_add_neon.c b/aom_dsp/arm/idct16x16_add_neon.c index 3d545f878..b4cb7a0cd 100644 --- a/aom_dsp/arm/idct16x16_add_neon.c +++ b/aom_dsp/arm/idct16x16_add_neon.c @@ -137,8 +137,8 @@ void aom_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out, d31s16 = vget_high_s16(q15s16); // stage 3 - d0s16 = vdup_n_s16(cospi_28_64); - d1s16 = vdup_n_s16(cospi_4_64); + d0s16 = vdup_n_s16((int16_t)cospi_28_64); + d1s16 = vdup_n_s16((int16_t)cospi_4_64); q2s32 = vmull_s16(d18s16, d0s16); q3s32 = vmull_s16(d19s16, d0s16); @@ -150,8 +150,8 @@ void aom_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out, q5s32 = vmlal_s16(q5s32, d30s16, d0s16); q6s32 = vmlal_s16(q6s32, d31s16, d0s16); - d2s16 = vdup_n_s16(cospi_12_64); - d3s16 = vdup_n_s16(cospi_20_64); + d2s16 = vdup_n_s16((int16_t)cospi_12_64); + d3s16 = vdup_n_s16((int16_t)cospi_20_64); d8s16 = vqrshrn_n_s32(q2s32, 14); d9s16 = vqrshrn_n_s32(q3s32, 14); @@ -178,15 +178,15 @@ void aom_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out, q6s16 = vcombine_s16(d12s16, d13s16); // stage 4 - d30s16 = vdup_n_s16(cospi_16_64); + d30s16 = vdup_n_s16((int16_t)cospi_16_64); q2s32 = vmull_s16(d16s16, d30s16); q11s32 = vmull_s16(d17s16, d30s16); q0s32 = vmull_s16(d24s16, d30s16); q1s32 = vmull_s16(d25s16, d30s16); - d30s16 = vdup_n_s16(cospi_24_64); - d31s16 = vdup_n_s16(cospi_8_64); + d30s16 = vdup_n_s16((int16_t)cospi_24_64); + d31s16 = vdup_n_s16((int16_t)cospi_8_64); q3s32 = vaddq_s32(q2s32, q0s32); q12s32 = vaddq_s32(q11s32, q1s32); @@ -232,7 +232,7 @@ 
void aom_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out, q2s16 = vsubq_s16(q9s16, q10s16); q3s16 = vsubq_s16(q8s16, q11s16); - d16s16 = vdup_n_s16(cospi_16_64); + d16s16 = vdup_n_s16((int16_t)cospi_16_64); q11s32 = vmull_s16(d26s16, d16s16); q12s32 = vmull_s16(d27s16, d16s16); @@ -378,8 +378,8 @@ void aom_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, d31s16 = vget_high_s16(q15s16); // stage 3 - d12s16 = vdup_n_s16(cospi_30_64); - d13s16 = vdup_n_s16(cospi_2_64); + d12s16 = vdup_n_s16((int16_t)cospi_30_64); + d13s16 = vdup_n_s16((int16_t)cospi_2_64); q2s32 = vmull_s16(d16s16, d12s16); q3s32 = vmull_s16(d17s16, d12s16); @@ -398,8 +398,8 @@ void aom_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, q0s16 = vcombine_s16(d0s16, d1s16); q7s16 = vcombine_s16(d14s16, d15s16); - d30s16 = vdup_n_s16(cospi_14_64); - d31s16 = vdup_n_s16(cospi_18_64); + d30s16 = vdup_n_s16((int16_t)cospi_14_64); + d31s16 = vdup_n_s16((int16_t)cospi_18_64); q2s32 = vmull_s16(d24s16, d30s16); q3s32 = vmull_s16(d25s16, d30s16); @@ -418,8 +418,8 @@ void aom_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, q1s16 = vcombine_s16(d2s16, d3s16); q6s16 = vcombine_s16(d12s16, d13s16); - d30s16 = vdup_n_s16(cospi_22_64); - d31s16 = vdup_n_s16(cospi_10_64); + d30s16 = vdup_n_s16((int16_t)cospi_22_64); + d31s16 = vdup_n_s16((int16_t)cospi_10_64); q11s32 = vmull_s16(d20s16, d30s16); q12s32 = vmull_s16(d21s16, d30s16); @@ -438,8 +438,8 @@ void aom_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, q2s16 = vcombine_s16(d4s16, d5s16); q5s16 = vcombine_s16(d10s16, d11s16); - d30s16 = vdup_n_s16(cospi_6_64); - d31s16 = vdup_n_s16(cospi_26_64); + d30s16 = vdup_n_s16((int16_t)cospi_6_64); + d31s16 = vdup_n_s16((int16_t)cospi_26_64); q10s32 = vmull_s16(d28s16, d30s16); q11s32 = vmull_s16(d29s16, d30s16); @@ -478,8 +478,8 @@ void aom_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, d28s16 = vget_low_s16(q14s16); d29s16 = vget_high_s16(q14s16); - d30s16 = vdup_n_s16(cospi_8_64); - d31s16 = vdup_n_s16(cospi_24_64); + d30s16 = vdup_n_s16((int16_t)cospi_8_64); + d31s16 = vdup_n_s16((int16_t)cospi_24_64); q2s32 = vmull_s16(d18s16, d31s16); q3s32 = vmull_s16(d19s16, d31s16); @@ -539,7 +539,7 @@ void aom_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, d26s16 = vget_low_s16(q13s16); d27s16 = vget_high_s16(q13s16); - d14s16 = vdup_n_s16(cospi_16_64); + d14s16 = vdup_n_s16((int16_t)cospi_16_64); q3s32 = vmull_s16(d26s16, d14s16); q4s32 = vmull_s16(d27s16, d14s16); @@ -903,15 +903,15 @@ void aom_idct16x16_10_add_neon_pass1(int16_t *in, int16_t *out, &q15s16); // stage 3 - q0s16 = vdupq_n_s16(cospi_28_64 * 2); - q1s16 = vdupq_n_s16(cospi_4_64 * 2); + q0s16 = vdupq_n_s16((int16_t)(cospi_28_64 * 2)); + q1s16 = vdupq_n_s16((int16_t)(cospi_4_64 * 2)); q4s16 = vqrdmulhq_s16(q9s16, q0s16); q7s16 = vqrdmulhq_s16(q9s16, q1s16); // stage 4 - q1s16 = vdupq_n_s16(cospi_16_64 * 2); - d4s16 = vdup_n_s16(cospi_16_64); + q1s16 = vdupq_n_s16((int16_t)(cospi_16_64 * 2)); + d4s16 = vdup_n_s16((int16_t)cospi_16_64); q8s16 = vqrdmulhq_s16(q8s16, q1s16); @@ -1046,13 +1046,13 @@ void aom_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out, &q15s16); // stage 3 - q6s16 = vdupq_n_s16(cospi_30_64 * 2); + q6s16 = vdupq_n_s16((int16_t)(cospi_30_64 * 2)); q0s16 = vqrdmulhq_s16(q8s16, q6s16); - q6s16 = vdupq_n_s16(cospi_2_64 * 2); + q6s16 = vdupq_n_s16((int16_t)(cospi_2_64 * 2)); q7s16 = vqrdmulhq_s16(q8s16, q6s16); q15s16 = vdupq_n_s16(-cospi_26_64 * 2); - q14s16 = vdupq_n_s16(cospi_6_64 * 2); + q14s16 = 
vdupq_n_s16((int16_t)(cospi_6_64 * 2));
   q3s16 = vqrdmulhq_s16(q9s16, q15s16);
   q4s16 = vqrdmulhq_s16(q9s16, q14s16);
 
@@ -1066,8 +1066,8 @@ void aom_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out,
   d14s16 = vget_low_s16(q7s16);
   d15s16 = vget_high_s16(q7s16);
 
-  d30s16 = vdup_n_s16(cospi_8_64);
-  d31s16 = vdup_n_s16(cospi_24_64);
+  d30s16 = vdup_n_s16((int16_t)cospi_8_64);
+  d31s16 = vdup_n_s16((int16_t)cospi_24_64);
 
   q12s32 = vmull_s16(d14s16, d31s16);
   q5s32 = vmull_s16(d15s16, d31s16);
@@ -1124,7 +1124,7 @@ void aom_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out,
   d26s16 = vget_low_s16(q13s16);
   d27s16 = vget_high_s16(q13s16);
 
-  d14s16 = vdup_n_s16(cospi_16_64);
+  d14s16 = vdup_n_s16((int16_t)cospi_16_64);
   q3s32 = vmull_s16(d26s16, d14s16);
   q4s32 = vmull_s16(d27s16, d14s16);
   q0s32 = vmull_s16(d20s16, d14s16);
diff --git a/aom_dsp/arm/idct4x4_add_neon.c b/aom_dsp/arm/idct4x4_add_neon.c
index 397c61709..763be1ab0 100644
--- a/aom_dsp/arm/idct4x4_add_neon.c
+++ b/aom_dsp/arm/idct4x4_add_neon.c
@@ -11,6 +11,8 @@
 
 #include <arm_neon.h>
 
+#include "aom_dsp/txfm_common.h"
+
 void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
   uint8x8_t d26u8, d27u8;
   uint32x2_t d26u32, d27u32;
@@ -22,9 +24,6 @@ void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
   int16x4x2_t d0x2s16, d1x2s16;
   int32x4x2_t q0x2s32;
   uint8_t *d;
-  int16_t cospi_8_64 = 15137;
-  int16_t cospi_16_64 = 11585;
-  int16_t cospi_24_64 = 6270;
 
   d26u32 = d27u32 = vdup_n_u32(0);
 
@@ -41,8 +40,8 @@ void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
   q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
   q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
 
-  d20s16 = vdup_n_s16(cospi_8_64);
-  d21s16 = vdup_n_s16(cospi_16_64);
+  d20s16 = vdup_n_s16((int16_t)cospi_8_64);
+  d21s16 = vdup_n_s16((int16_t)cospi_16_64);
 
   q0x2s32 =
       vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
@@ -51,7 +50,7 @@ void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
   d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
   d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
 
-  d22s16 = vdup_n_s16(cospi_24_64);
+  d22s16 = vdup_n_s16((int16_t)cospi_24_64);
 
   // stage 1
   d23s16 = vadd_s16(d16s16, d18s16);
diff --git a/aom_dsp/arm/idct8x8_1_add_neon.c b/aom_dsp/arm/idct8x8_1_add_neon.c
index fcc2a2fcd..c7926f9e4 100644
--- a/aom_dsp/arm/idct8x8_1_add_neon.c
+++ b/aom_dsp/arm/idct8x8_1_add_neon.c
@@ -20,7 +20,7 @@ void aom_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
   uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
   int16x8_t q0s16;
   uint8_t *d1, *d2;
-  int16_t i, a1, cospi_16_64 = 11585;
+  int16_t i, a1;
   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
   out = dct_const_round_shift(out * cospi_16_64);
   a1 = ROUND_POWER_OF_TWO(out, 5);
diff --git a/aom_dsp/arm/idct8x8_add_neon.c b/aom_dsp/arm/idct8x8_add_neon.c
index 8e752105b..8ad70862d 100644
--- a/aom_dsp/arm/idct8x8_add_neon.c
+++ b/aom_dsp/arm/idct8x8_add_neon.c
@@ -90,10 +90,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
   int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
   int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
 
-  d0s16 = vdup_n_s16(cospi_28_64);
-  d1s16 = vdup_n_s16(cospi_4_64);
-  d2s16 = vdup_n_s16(cospi_12_64);
-  d3s16 = vdup_n_s16(cospi_20_64);
+  d0s16 = vdup_n_s16((int16_t)cospi_28_64);
+  d1s16 = vdup_n_s16((int16_t)cospi_4_64);
+  d2s16 = vdup_n_s16((int16_t)cospi_12_64);
+  d3s16 = vdup_n_s16((int16_t)cospi_20_64);
 
   d16s16 = vget_low_s16(*q8s16);
   d17s16 = vget_high_s16(*q8s16);
@@ -146,7 +146,7 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
   q6s16 = vcombine_s16(d12s16, d13s16);
   q7s16 = vcombine_s16(d14s16, d15s16);
 
-  d0s16 = vdup_n_s16(cospi_16_64);
+  d0s16 = vdup_n_s16((int16_t)cospi_16_64);
 
   q2s32 = vmull_s16(d16s16, d0s16);
   q3s32 = vmull_s16(d17s16, d0s16);
@@ -158,8 +158,8 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
   q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
   q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
 
-  d0s16 = vdup_n_s16(cospi_24_64);
-  d1s16 = vdup_n_s16(cospi_8_64);
+  d0s16 = vdup_n_s16((int16_t)cospi_24_64);
+  d1s16 = vdup_n_s16((int16_t)cospi_8_64);
 
   d18s16 = vqrshrn_n_s32(q2s32, 14);
   d19s16 = vqrshrn_n_s32(q3s32, 14);
@@ -199,7 +199,7 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
   d28s16 = vget_low_s16(*q14s16);
   d29s16 = vget_high_s16(*q14s16);
 
-  d16s16 = vdup_n_s16(cospi_16_64);
+  d16s16 = vdup_n_s16((int16_t)cospi_16_64);
 
   q9s32 = vmull_s16(d28s16, d16s16);
   q10s32 = vmull_s16(d29s16, d16s16);
@@ -356,29 +356,29 @@ void aom_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
 
   // First transform rows
   // stage 1
-  q0s16 = vdupq_n_s16(cospi_28_64 * 2);
-  q1s16 = vdupq_n_s16(cospi_4_64 * 2);
+  q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2);
+  q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2);
 
   q4s16 = vqrdmulhq_s16(q9s16, q0s16);
 
-  q0s16 = vdupq_n_s16(-cospi_20_64 * 2);
+  q0s16 = vdupq_n_s16(-(int16_t)cospi_20_64 * 2);
 
   q7s16 = vqrdmulhq_s16(q9s16, q1s16);
 
-  q1s16 = vdupq_n_s16(cospi_12_64 * 2);
+  q1s16 = vdupq_n_s16((int16_t)cospi_12_64 * 2);
 
   q5s16 = vqrdmulhq_s16(q11s16, q0s16);
 
-  q0s16 = vdupq_n_s16(cospi_16_64 * 2);
+  q0s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2);
 
   q6s16 = vqrdmulhq_s16(q11s16, q1s16);
 
   // stage 2 & stage 3 - even half
-  q1s16 = vdupq_n_s16(cospi_24_64 * 2);
+  q1s16 = vdupq_n_s16((int16_t)cospi_24_64 * 2);
 
   q9s16 = vqrdmulhq_s16(q8s16, q0s16);
 
-  q0s16 = vdupq_n_s16(cospi_8_64 * 2);
+  q0s16 = vdupq_n_s16((int16_t)cospi_8_64 * 2);
 
   q13s16 = vqrdmulhq_s16(q10s16, q1s16);
 
@@ -400,7 +400,7 @@ void aom_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
   d28s16 = vget_low_s16(q14s16);
   d29s16 = vget_high_s16(q14s16);
 
-  d16s16 = vdup_n_s16(cospi_16_64);
+  d16s16 = vdup_n_s16((int16_t)cospi_16_64);
   q9s32 = vmull_s16(d28s16, d16s16);
   q10s32 = vmull_s16(d29s16, d16s16);
   q11s32 = vmull_s16(d28s16, d16s16);
diff --git a/aom_dsp/fwd_txfm.c b/aom_dsp/fwd_txfm.c
index fadae2b8e..547919f88 100644
--- a/aom_dsp/fwd_txfm.c
+++ b/aom_dsp/fwd_txfm.c
@@ -9,8 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./aom_dsp_rtcd.h"
 #include "aom_dsp/fwd_txfm.h"
+#include <assert.h>
+#include "./aom_dsp_rtcd.h"
 
 void aom_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
   // The 2D transform is done with two passes which are actually pretty
@@ -22,36 +23,37 @@ void aom_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
   int pass;
   // We need an intermediate buffer between passes.
   tran_low_t intermediate[4 * 4];
-  const int16_t *in_pass0 = input;
-  const tran_low_t *in = NULL;
+  const tran_low_t *in_low = NULL;
   tran_low_t *out = intermediate;
   // Do the two transform/transpose passes
   for (pass = 0; pass < 2; ++pass) {
-    tran_high_t input[4];      // canbe16
+    tran_high_t in_high[4];    // canbe16
     tran_high_t step[4];       // canbe16
     tran_high_t temp1, temp2;  // needs32
     int i;
     for (i = 0; i < 4; ++i) {
      // Load inputs.
- if (0 == pass) { - input[0] = in_pass0[0 * stride] * 16; - input[1] = in_pass0[1 * stride] * 16; - input[2] = in_pass0[2 * stride] * 16; - input[3] = in_pass0[3 * stride] * 16; - if (i == 0 && input[0]) { - input[0] += 1; + if (pass == 0) { + in_high[0] = input[0 * stride] * 16; + in_high[1] = input[1 * stride] * 16; + in_high[2] = input[2 * stride] * 16; + in_high[3] = input[3 * stride] * 16; + if (i == 0 && in_high[0]) { + ++in_high[0]; } } else { - input[0] = in[0 * 4]; - input[1] = in[1 * 4]; - input[2] = in[2 * 4]; - input[3] = in[3 * 4]; + assert(in_low != NULL); + in_high[0] = in_low[0 * 4]; + in_high[1] = in_low[1 * 4]; + in_high[2] = in_low[2 * 4]; + in_high[3] = in_low[3 * 4]; + ++in_low; } // Transform. - step[0] = input[0] + input[3]; - step[1] = input[1] + input[2]; - step[2] = input[1] - input[2]; - step[3] = input[0] - input[3]; + step[0] = in_high[0] + in_high[3]; + step[1] = in_high[1] + in_high[2]; + step[2] = in_high[1] - in_high[2]; + step[3] = in_high[0] - in_high[3]; temp1 = (step[0] + step[1]) * cospi_16_64; temp2 = (step[0] - step[1]) * cospi_16_64; out[0] = (tran_low_t)fdct_round_shift(temp1); @@ -61,12 +63,11 @@ void aom_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { out[1] = (tran_low_t)fdct_round_shift(temp1); out[3] = (tran_low_t)fdct_round_shift(temp2); // Do next column (which is a transposed row in second/horizontal pass) - in_pass0++; - in++; + ++input; out += 4; } // Setup in/out for next pass. - in = intermediate; + in_low = intermediate; out = output; } @@ -100,7 +101,6 @@ void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { tran_high_t t0, t1, t2, t3; // needs32 tran_high_t x0, x1, x2, x3; // canbe16 - int i; for (i = 0; i < 8; i++) { // stage 1 if (pass == 0) { @@ -191,56 +191,57 @@ void aom_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { int pass; // We need an intermediate buffer between passes. tran_low_t intermediate[256]; - const int16_t *in_pass0 = input; - const tran_low_t *in = NULL; + const tran_low_t *in_low = NULL; tran_low_t *out = intermediate; // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { tran_high_t step1[8]; // canbe16 tran_high_t step2[8]; // canbe16 tran_high_t step3[8]; // canbe16 - tran_high_t input[8]; // canbe16 + tran_high_t in_high[8]; // canbe16 tran_high_t temp1, temp2; // needs32 int i; for (i = 0; i < 16; i++) { if (0 == pass) { // Calculate input for the first 8 results. - input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4; - input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4; - input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4; - input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4; - input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4; - input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4; - input[6] = (in_pass0[6 * stride] + in_pass0[9 * stride]) * 4; - input[7] = (in_pass0[7 * stride] + in_pass0[8 * stride]) * 4; + in_high[0] = (input[0 * stride] + input[15 * stride]) * 4; + in_high[1] = (input[1 * stride] + input[14 * stride]) * 4; + in_high[2] = (input[2 * stride] + input[13 * stride]) * 4; + in_high[3] = (input[3 * stride] + input[12 * stride]) * 4; + in_high[4] = (input[4 * stride] + input[11 * stride]) * 4; + in_high[5] = (input[5 * stride] + input[10 * stride]) * 4; + in_high[6] = (input[6 * stride] + input[9 * stride]) * 4; + in_high[7] = (input[7 * stride] + input[8 * stride]) * 4; // Calculate input for the next 8 results. 
- step1[0] = (in_pass0[7 * stride] - in_pass0[8 * stride]) * 4; - step1[1] = (in_pass0[6 * stride] - in_pass0[9 * stride]) * 4; - step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4; - step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4; - step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4; - step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4; - step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4; - step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4; + step1[0] = (input[7 * stride] - input[8 * stride]) * 4; + step1[1] = (input[6 * stride] - input[9 * stride]) * 4; + step1[2] = (input[5 * stride] - input[10 * stride]) * 4; + step1[3] = (input[4 * stride] - input[11 * stride]) * 4; + step1[4] = (input[3 * stride] - input[12 * stride]) * 4; + step1[5] = (input[2 * stride] - input[13 * stride]) * 4; + step1[6] = (input[1 * stride] - input[14 * stride]) * 4; + step1[7] = (input[0 * stride] - input[15 * stride]) * 4; } else { // Calculate input for the first 8 results. - input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2); - input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2); - input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2); - input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2); - input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2); - input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2); - input[6] = ((in[6 * 16] + 1) >> 2) + ((in[9 * 16] + 1) >> 2); - input[7] = ((in[7 * 16] + 1) >> 2) + ((in[8 * 16] + 1) >> 2); + assert(in_low != NULL); + in_high[0] = ((in_low[0 * 16] + 1) >> 2) + ((in_low[15 * 16] + 1) >> 2); + in_high[1] = ((in_low[1 * 16] + 1) >> 2) + ((in_low[14 * 16] + 1) >> 2); + in_high[2] = ((in_low[2 * 16] + 1) >> 2) + ((in_low[13 * 16] + 1) >> 2); + in_high[3] = ((in_low[3 * 16] + 1) >> 2) + ((in_low[12 * 16] + 1) >> 2); + in_high[4] = ((in_low[4 * 16] + 1) >> 2) + ((in_low[11 * 16] + 1) >> 2); + in_high[5] = ((in_low[5 * 16] + 1) >> 2) + ((in_low[10 * 16] + 1) >> 2); + in_high[6] = ((in_low[6 * 16] + 1) >> 2) + ((in_low[9 * 16] + 1) >> 2); + in_high[7] = ((in_low[7 * 16] + 1) >> 2) + ((in_low[8 * 16] + 1) >> 2); // Calculate input for the next 8 results. 
- step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[8 * 16] + 1) >> 2); - step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[9 * 16] + 1) >> 2); - step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2); - step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2); - step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2); - step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2); - step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2); - step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2); + step1[0] = ((in_low[7 * 16] + 1) >> 2) - ((in_low[8 * 16] + 1) >> 2); + step1[1] = ((in_low[6 * 16] + 1) >> 2) - ((in_low[9 * 16] + 1) >> 2); + step1[2] = ((in_low[5 * 16] + 1) >> 2) - ((in_low[10 * 16] + 1) >> 2); + step1[3] = ((in_low[4 * 16] + 1) >> 2) - ((in_low[11 * 16] + 1) >> 2); + step1[4] = ((in_low[3 * 16] + 1) >> 2) - ((in_low[12 * 16] + 1) >> 2); + step1[5] = ((in_low[2 * 16] + 1) >> 2) - ((in_low[13 * 16] + 1) >> 2); + step1[6] = ((in_low[1 * 16] + 1) >> 2) - ((in_low[14 * 16] + 1) >> 2); + step1[7] = ((in_low[0 * 16] + 1) >> 2) - ((in_low[15 * 16] + 1) >> 2); + in_low++; } // Work on the first eight values; fdct8(input, even_results); { @@ -249,14 +250,14 @@ void aom_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { tran_high_t x0, x1, x2, x3; // canbe16 // stage 1 - s0 = input[0] + input[7]; - s1 = input[1] + input[6]; - s2 = input[2] + input[5]; - s3 = input[3] + input[4]; - s4 = input[3] - input[4]; - s5 = input[2] - input[5]; - s6 = input[1] - input[6]; - s7 = input[0] - input[7]; + s0 = in_high[0] + in_high[7]; + s1 = in_high[1] + in_high[6]; + s2 = in_high[2] + in_high[5]; + s3 = in_high[3] + in_high[4]; + s4 = in_high[3] - in_high[4]; + s5 = in_high[2] - in_high[5]; + s6 = in_high[1] - in_high[6]; + s7 = in_high[0] - in_high[7]; // fdct4(step, step); x0 = s0 + s3; @@ -351,12 +352,11 @@ void aom_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { out[15] = (tran_low_t)fdct_round_shift(temp2); } // Do next column (which is a transposed row in second/horizontal pass) - in++; - in_pass0++; + input++; out += 16; } // Setup in/out for next pass. 
- in = intermediate; + in_low = intermediate; out = output; } } diff --git a/aom_dsp/mips/aom_convolve_msa.h b/aom_dsp/mips/aom_convolve_msa.h index 4efbcbcad..1a0ae4d8d 100644 --- a/aom_dsp/mips/aom_convolve_msa.h +++ b/aom_dsp/mips/aom_convolve_msa.h @@ -17,18 +17,18 @@ extern const uint8_t mc_filt_mask_arr[16 * 3]; -#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \ - filt3) \ - ({ \ - v8i16 tmp0, tmp1; \ - \ - tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ - tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1); \ - tmp1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \ - tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)vec3, (v16i8)filt3); \ - tmp0 = __msa_adds_s_h(tmp0, tmp1); \ - \ - tmp0; \ +#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \ + filt3) \ + ({ \ + v8i16 tmp_dpadd_0, tmp_dpadd_1; \ + \ + tmp_dpadd_0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ + tmp_dpadd_0 = __msa_dpadd_s_h(tmp_dpadd_0, (v16i8)vec1, (v16i8)filt1); \ + tmp_dpadd_1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \ + tmp_dpadd_1 = __msa_dpadd_s_h(tmp_dpadd_1, (v16i8)vec3, (v16i8)filt3); \ + tmp_dpadd_0 = __msa_adds_s_h(tmp_dpadd_0, tmp_dpadd_1); \ + \ + tmp_dpadd_0; \ }) #define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_h0, \ @@ -115,11 +115,10 @@ extern const uint8_t mc_filt_mask_arr[16 * 3]; stride) \ { \ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ \ PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \ PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \ } #endif /* AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ */ diff --git a/aom_dsp/mips/inv_txfm_msa.h b/aom_dsp/mips/inv_txfm_msa.h index ce2065bb2..122667aa8 100644 --- a/aom_dsp/mips/inv_txfm_msa.h +++ b/aom_dsp/mips/inv_txfm_msa.h @@ -197,18 +197,18 @@ out2, out3) \ { \ v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ - v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v4i32 tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd; \ \ ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \ ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \ DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, cst0, cst0, cst1, \ - cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \ + cst1, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \ + SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \ + PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out0, out1); \ DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, cst2, cst2, cst3, \ - cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3); \ + cst3, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \ + SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \ + PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out2, out3); \ } /* idct 8x8 macro */ diff --git a/aom_dsp/mips/loopfilter_msa.h b/aom_dsp/mips/loopfilter_msa.h index c1cabc2f2..450594262 100644 --- a/aom_dsp/mips/loopfilter_msa.h +++ b/aom_dsp/mips/loopfilter_msa.h @@ -123,35 +123,35 @@ p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ } -#define AOM_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ - { \ - v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \ - v16u8 
zero_in = { 0 }; \ - \ - tmp = __msa_ori_b(zero_in, 1); \ - p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \ - q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \ - p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \ - q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \ - \ - p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \ - flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \ - p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \ - flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \ - \ - flat_out = (tmp < (v16u8)flat_out); \ - flat_out = __msa_xori_b(flat_out, 0xff); \ - flat_out = flat_out & (mask); \ +#define AOM_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ + { \ + v16u8 tmp_flat4, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \ + v16u8 zero_in = { 0 }; \ + \ + tmp_flat4 = __msa_ori_b(zero_in, 1); \ + p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \ + q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \ + p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \ + q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \ + \ + p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \ + flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \ + p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \ + flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \ + \ + flat_out = (tmp_flat4 < (v16u8)flat_out); \ + flat_out = __msa_xori_b(flat_out, 0xff); \ + flat_out = flat_out & (mask); \ } #define AOM_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \ q6_in, q7_in, flat_in, flat2_out) \ { \ - v16u8 tmp, zero_in = { 0 }; \ + v16u8 tmp_flat5, zero_in = { 0 }; \ v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \ v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \ \ - tmp = __msa_ori_b(zero_in, 1); \ + tmp_flat5 = __msa_ori_b(zero_in, 1); \ p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \ q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \ p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \ @@ -169,7 +169,7 @@ p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \ flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \ \ - flat2_out = (tmp < (v16u8)flat2_out); \ + flat2_out = (tmp_flat5 < (v16u8)flat2_out); \ flat2_out = __msa_xori_b(flat2_out, 0xff); \ flat2_out = flat2_out & flat_in; \ } @@ -178,38 +178,38 @@ p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \ q1_filt8_out, q2_filt8_out) \ { \ - v8u16 tmp0, tmp1, tmp2; \ + v8u16 tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \ \ - tmp2 = p2_in + p1_in + p0_in; \ - tmp0 = p3_in << 1; \ + tmp_filt8_2 = p2_in + p1_in + p0_in; \ + tmp_filt8_0 = p3_in << 1; \ \ - tmp0 = tmp0 + tmp2 + q0_in; \ - tmp1 = tmp0 + p3_in + p2_in; \ - p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + tmp_filt8_0 = tmp_filt8_0 + tmp_filt8_2 + q0_in; \ + tmp_filt8_1 = tmp_filt8_0 + p3_in + p2_in; \ + p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ \ - tmp1 = tmp0 + p1_in + q1_in; \ - p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + tmp_filt8_1 = tmp_filt8_0 + p1_in + q1_in; \ + p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ \ - tmp1 = q2_in + q1_in + q0_in; \ - tmp2 = tmp2 + tmp1; \ - tmp0 = tmp2 + (p0_in); \ - tmp0 = tmp0 + (p3_in); \ - p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp0, 3); \ + tmp_filt8_1 = q2_in + q1_in + q0_in; \ + tmp_filt8_2 = tmp_filt8_2 + tmp_filt8_1; \ + tmp_filt8_0 = tmp_filt8_2 + (p0_in); \ + tmp_filt8_0 = tmp_filt8_0 + (p3_in); \ + p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_0, 3); \ \ - tmp0 = q2_in + q3_in; \ - tmp0 = p0_in + tmp1 + tmp0; \ - tmp1 = q3_in + q3_in; \ - tmp1 = tmp1 + tmp0; \ - q2_filt8_out = 
(v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + tmp_filt8_0 = q2_in + q3_in; \ + tmp_filt8_0 = p0_in + tmp_filt8_1 + tmp_filt8_0; \ + tmp_filt8_1 = q3_in + q3_in; \ + tmp_filt8_1 = tmp_filt8_1 + tmp_filt8_0; \ + q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ \ - tmp0 = tmp2 + q3_in; \ - tmp1 = tmp0 + q0_in; \ - q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + tmp_filt8_0 = tmp_filt8_2 + q3_in; \ + tmp_filt8_1 = tmp_filt8_0 + q0_in; \ + q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ \ - tmp1 = tmp0 - p2_in; \ - tmp0 = q1_in + q3_in; \ - tmp1 = tmp0 + tmp1; \ - q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + tmp_filt8_1 = tmp_filt8_0 - p2_in; \ + tmp_filt8_0 = q1_in + q3_in; \ + tmp_filt8_1 = tmp_filt8_0 + tmp_filt8_1; \ + q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ } #define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ diff --git a/aom_dsp/mips/macros_msa.h b/aom_dsp/mips/macros_msa.h index 7d0ba4bfc..48fbcfd47 100644 --- a/aom_dsp/mips/macros_msa.h +++ b/aom_dsp/mips/macros_msa.h @@ -169,20 +169,20 @@ val_m; \ }) #else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ - \ - val0_m = LW(psrc_m1); \ - val1_m = LW(psrc_m1 + 4); \ - \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t val_m_combined = 0; \ + \ + val0_m = LW(psrc_m1); \ + val1_m = LW(psrc_m1 + 4); \ + \ + val_m_combined = (uint64_t)(val1_m); \ + val_m_combined = (uint64_t)((val_m_combined << 32) & 0xFFFFFFFF00000000); \ + val_m_combined = (uint64_t)(val_m_combined | (uint64_t)val0_m); \ + \ + val_m_combined; \ }) #endif // (__mips == 64) @@ -2020,13 +2020,12 @@ pdst, stride) \ { \ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ \ tmp0_m = PCKEV_XORI128_UB(in0, in1); \ tmp1_m = PCKEV_XORI128_UB(in2, in3); \ ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \ } /* Description : Pack even byte elements and store byte vector in destination diff --git a/aom_dsp/x86/inv_txfm_sse2.c b/aom_dsp/x86/inv_txfm_sse2.c index 61f548a51..4735d973e 100644 --- a/aom_dsp/x86/inv_txfm_sse2.c +++ b/aom_dsp/x86/inv_txfm_sse2.c @@ -2372,7 +2372,6 @@ void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, #define IDCT32_34 \ /* Stage1 */ \ { \ - const __m128i zero = _mm_setzero_si128(); \ const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ \ @@ -2397,7 +2396,6 @@ void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, \ /* Stage2 */ \ { \ - const __m128i zero = _mm_setzero_si128(); \ const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ \ @@ -2424,7 +2422,6 @@ void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, \ /* Stage3 */ \ { \ - const __m128i zero = _mm_setzero_si128(); \ const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \ const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ \ @@ -2465,7 +2462,6 @@ void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t 
*dest, \ /* Stage4 */ \ { \ - const __m128i zero = _mm_setzero_si128(); \ const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ \ @@ -3002,6 +2998,7 @@ void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, // Only upper-left 8x8 has non-zero coeff void aom_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { + const __m128i zero = _mm_setzero_si128(); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1 << 5); @@ -3107,7 +3104,6 @@ void aom_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, col[31] = _mm_sub_epi16(stp1_0, stp1_31); for (i = 0; i < 4; i++) { int j; - const __m128i zero = _mm_setzero_si128(); // Transpose 32x8 block to 8x32 block array_transpose_8x8(col + i * 8, in); IDCT32_34 -- 2.50.0
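
A note for readers on the two warning classes this patch addresses. First, `-Wshadow` itself: the renames (`input` -> `in_high`, `in` -> `in_low`, `tmp` -> `tmp_flat4`, and so on) remove declarations that hide an identifier from an enclosing scope. Below is a minimal standalone sketch of the pattern, not aom code; the file and function names (`shadow_demo.c`, `column_sum`) are illustrative stand-ins.

```c
/* shadow_demo.c -- illustrative only; compile with:
 *   cc -Wshadow -Wall -c shadow_demo.c */
typedef long tran_high_t; /* stand-in for the real aom typedef */

long column_sum(const short *input, int stride) {
  long total = 0;
  int pass, i;
  for (pass = 0; pass < 2; ++pass) {
    /* Before this patch, aom_fdct4x4_c declared a local array named
     * 'input' at a point like this, hiding the 'input' parameter:
     *
     *   tran_high_t input[4];  // -Wshadow: declaration shadows a parameter
     *
     * Renaming the local ('in_high', as in the patch) keeps the parameter
     * visible and silences the warning. */
    tran_high_t in_high[4];
    for (i = 0; i < 4; ++i) in_high[i] = input[i * stride] * 16;
    for (i = 0; i < 4; ++i) total += in_high[i];
  }
  return total;
}
```

Second, the `(int16_t)`/`(int32_t)` casts sprinkled through the NEON files follow from the constant cleanup: the per-file `int16_t cospi_* = ...;` definitions (one of which, in `idct8x8_1_add_neon.c`, shadowed the shared constant) are dropped in favor of the wider `tran_high_t` constants from `aom_dsp/txfm_common.h`, so passing them to 16-bit intrinsics such as `vdup_n_s16()` now needs an explicit narrowing cast, e.g. `vdup_n_s16((int16_t)cospi_16_64)`.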