From fdb60962f4f98ec538d4c6ee161c391a4be9322d Mon Sep 17 00:00:00 2001 From: Urvang Joshi Date: Fri, 14 Oct 2016 15:30:27 -0700 Subject: [PATCH] Fix warnings reported by -Wshadow: Part1: aom_dsp directory While we are at it: - Rename some variables to more meaningful names - Reuse some common consts from a header instead of redefining them. Cherry-picked from aomedia/master: 09eea2193 Change-Id: I61030e773137ae107d3bd43556c0d5bb26f9dbf8 --- aom_dsp/arm/fwd_txfm_neon.c | 8 +- aom_dsp/arm/idct16x16_add_neon.c | 58 ++++++------- aom_dsp/arm/idct4x4_add_neon.c | 11 ++- aom_dsp/arm/idct8x8_1_add_neon.c | 2 +- aom_dsp/arm/idct8x8_add_neon.c | 32 +++---- aom_dsp/fwd_txfm.c | 138 +++++++++++++++---------------- aom_dsp/mips/aom_convolve_msa.h | 27 +++--- aom_dsp/mips/inv_txfm_msa.h | 14 ++-- aom_dsp/mips/loopfilter_msa.h | 94 ++++++++++----------- aom_dsp/mips/macros_msa.h | 31 ++++--- aom_dsp/x86/inv_txfm_sse2.c | 6 +- 11 files changed, 207 insertions(+), 214 deletions(-) diff --git a/aom_dsp/arm/fwd_txfm_neon.c b/aom_dsp/arm/fwd_txfm_neon.c index 17ce29e51..1cf8a3a6e 100644 --- a/aom_dsp/arm/fwd_txfm_neon.c +++ b/aom_dsp/arm/fwd_txfm_neon.c @@ -53,10 +53,10 @@ void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64); v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64); v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64); - v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64); - v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64); - v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64); - v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64); + v_t0_lo = vmulq_n_s32(v_t0_lo, (int32_t)cospi_16_64); + v_t0_hi = vmulq_n_s32(v_t0_hi, (int32_t)cospi_16_64); + v_t1_lo = vmulq_n_s32(v_t1_lo, (int32_t)cospi_16_64); + v_t1_hi = vmulq_n_s32(v_t1_hi, (int32_t)cospi_16_64); { const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); diff --git a/aom_dsp/arm/idct16x16_add_neon.c b/aom_dsp/arm/idct16x16_add_neon.c index 3d545f878..b4cb7a0cd 100644 --- a/aom_dsp/arm/idct16x16_add_neon.c +++ b/aom_dsp/arm/idct16x16_add_neon.c @@ -137,8 +137,8 @@ void aom_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out, d31s16 = vget_high_s16(q15s16); // stage 3 - d0s16 = vdup_n_s16(cospi_28_64); - d1s16 = vdup_n_s16(cospi_4_64); + d0s16 = vdup_n_s16((int16_t)cospi_28_64); + d1s16 = vdup_n_s16((int16_t)cospi_4_64); q2s32 = vmull_s16(d18s16, d0s16); q3s32 = vmull_s16(d19s16, d0s16); @@ -150,8 +150,8 @@ void aom_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out, q5s32 = vmlal_s16(q5s32, d30s16, d0s16); q6s32 = vmlal_s16(q6s32, d31s16, d0s16); - d2s16 = vdup_n_s16(cospi_12_64); - d3s16 = vdup_n_s16(cospi_20_64); + d2s16 = vdup_n_s16((int16_t)cospi_12_64); + d3s16 = vdup_n_s16((int16_t)cospi_20_64); d8s16 = vqrshrn_n_s32(q2s32, 14); d9s16 = vqrshrn_n_s32(q3s32, 14); @@ -178,15 +178,15 @@ void aom_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out, q6s16 = vcombine_s16(d12s16, d13s16); // stage 4 - d30s16 = vdup_n_s16(cospi_16_64); + d30s16 = vdup_n_s16((int16_t)cospi_16_64); q2s32 = vmull_s16(d16s16, d30s16); q11s32 = vmull_s16(d17s16, d30s16); q0s32 = vmull_s16(d24s16, d30s16); q1s32 = vmull_s16(d25s16, d30s16); - d30s16 = vdup_n_s16(cospi_24_64); - d31s16 = vdup_n_s16(cospi_8_64); + d30s16 = vdup_n_s16((int16_t)cospi_24_64); + d31s16 = vdup_n_s16((int16_t)cospi_8_64); q3s32 = vaddq_s32(q2s32, q0s32); q12s32 = vaddq_s32(q11s32, q1s32); @@ -232,7 +232,7 @@ 
void aom_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out, q2s16 = vsubq_s16(q9s16, q10s16); q3s16 = vsubq_s16(q8s16, q11s16); - d16s16 = vdup_n_s16(cospi_16_64); + d16s16 = vdup_n_s16((int16_t)cospi_16_64); q11s32 = vmull_s16(d26s16, d16s16); q12s32 = vmull_s16(d27s16, d16s16); @@ -378,8 +378,8 @@ void aom_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, d31s16 = vget_high_s16(q15s16); // stage 3 - d12s16 = vdup_n_s16(cospi_30_64); - d13s16 = vdup_n_s16(cospi_2_64); + d12s16 = vdup_n_s16((int16_t)cospi_30_64); + d13s16 = vdup_n_s16((int16_t)cospi_2_64); q2s32 = vmull_s16(d16s16, d12s16); q3s32 = vmull_s16(d17s16, d12s16); @@ -398,8 +398,8 @@ void aom_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, q0s16 = vcombine_s16(d0s16, d1s16); q7s16 = vcombine_s16(d14s16, d15s16); - d30s16 = vdup_n_s16(cospi_14_64); - d31s16 = vdup_n_s16(cospi_18_64); + d30s16 = vdup_n_s16((int16_t)cospi_14_64); + d31s16 = vdup_n_s16((int16_t)cospi_18_64); q2s32 = vmull_s16(d24s16, d30s16); q3s32 = vmull_s16(d25s16, d30s16); @@ -418,8 +418,8 @@ void aom_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, q1s16 = vcombine_s16(d2s16, d3s16); q6s16 = vcombine_s16(d12s16, d13s16); - d30s16 = vdup_n_s16(cospi_22_64); - d31s16 = vdup_n_s16(cospi_10_64); + d30s16 = vdup_n_s16((int16_t)cospi_22_64); + d31s16 = vdup_n_s16((int16_t)cospi_10_64); q11s32 = vmull_s16(d20s16, d30s16); q12s32 = vmull_s16(d21s16, d30s16); @@ -438,8 +438,8 @@ void aom_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, q2s16 = vcombine_s16(d4s16, d5s16); q5s16 = vcombine_s16(d10s16, d11s16); - d30s16 = vdup_n_s16(cospi_6_64); - d31s16 = vdup_n_s16(cospi_26_64); + d30s16 = vdup_n_s16((int16_t)cospi_6_64); + d31s16 = vdup_n_s16((int16_t)cospi_26_64); q10s32 = vmull_s16(d28s16, d30s16); q11s32 = vmull_s16(d29s16, d30s16); @@ -478,8 +478,8 @@ void aom_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, d28s16 = vget_low_s16(q14s16); d29s16 = vget_high_s16(q14s16); - d30s16 = vdup_n_s16(cospi_8_64); - d31s16 = vdup_n_s16(cospi_24_64); + d30s16 = vdup_n_s16((int16_t)cospi_8_64); + d31s16 = vdup_n_s16((int16_t)cospi_24_64); q2s32 = vmull_s16(d18s16, d31s16); q3s32 = vmull_s16(d19s16, d31s16); @@ -539,7 +539,7 @@ void aom_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, d26s16 = vget_low_s16(q13s16); d27s16 = vget_high_s16(q13s16); - d14s16 = vdup_n_s16(cospi_16_64); + d14s16 = vdup_n_s16((int16_t)cospi_16_64); q3s32 = vmull_s16(d26s16, d14s16); q4s32 = vmull_s16(d27s16, d14s16); @@ -903,15 +903,15 @@ void aom_idct16x16_10_add_neon_pass1(int16_t *in, int16_t *out, &q15s16); // stage 3 - q0s16 = vdupq_n_s16(cospi_28_64 * 2); - q1s16 = vdupq_n_s16(cospi_4_64 * 2); + q0s16 = vdupq_n_s16((int16_t)(cospi_28_64 * 2)); + q1s16 = vdupq_n_s16((int16_t)(cospi_4_64 * 2)); q4s16 = vqrdmulhq_s16(q9s16, q0s16); q7s16 = vqrdmulhq_s16(q9s16, q1s16); // stage 4 - q1s16 = vdupq_n_s16(cospi_16_64 * 2); - d4s16 = vdup_n_s16(cospi_16_64); + q1s16 = vdupq_n_s16((int16_t)(cospi_16_64 * 2)); + d4s16 = vdup_n_s16((int16_t)cospi_16_64); q8s16 = vqrdmulhq_s16(q8s16, q1s16); @@ -1046,13 +1046,13 @@ void aom_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out, &q15s16); // stage 3 - q6s16 = vdupq_n_s16(cospi_30_64 * 2); + q6s16 = vdupq_n_s16((int16_t)(cospi_30_64 * 2)); q0s16 = vqrdmulhq_s16(q8s16, q6s16); - q6s16 = vdupq_n_s16(cospi_2_64 * 2); + q6s16 = vdupq_n_s16((int16_t)(cospi_2_64 * 2)); q7s16 = vqrdmulhq_s16(q8s16, q6s16); q15s16 = vdupq_n_s16(-cospi_26_64 * 2); - q14s16 = vdupq_n_s16(cospi_6_64 * 2); + q14s16 = 
vdupq_n_s16((int16_t)(cospi_6_64 * 2));
   q3s16 = vqrdmulhq_s16(q9s16, q15s16);
   q4s16 = vqrdmulhq_s16(q9s16, q14s16);
 
@@ -1066,8 +1066,8 @@ void aom_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out,
   d14s16 = vget_low_s16(q7s16);
   d15s16 = vget_high_s16(q7s16);
 
-  d30s16 = vdup_n_s16(cospi_8_64);
-  d31s16 = vdup_n_s16(cospi_24_64);
+  d30s16 = vdup_n_s16((int16_t)cospi_8_64);
+  d31s16 = vdup_n_s16((int16_t)cospi_24_64);
 
   q12s32 = vmull_s16(d14s16, d31s16);
   q5s32 = vmull_s16(d15s16, d31s16);
@@ -1124,7 +1124,7 @@ void aom_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out,
   d26s16 = vget_low_s16(q13s16);
   d27s16 = vget_high_s16(q13s16);
 
-  d14s16 = vdup_n_s16(cospi_16_64);
+  d14s16 = vdup_n_s16((int16_t)cospi_16_64);
   q3s32 = vmull_s16(d26s16, d14s16);
   q4s32 = vmull_s16(d27s16, d14s16);
   q0s32 = vmull_s16(d20s16, d14s16);
diff --git a/aom_dsp/arm/idct4x4_add_neon.c b/aom_dsp/arm/idct4x4_add_neon.c
index 397c61709..763be1ab0 100644
--- a/aom_dsp/arm/idct4x4_add_neon.c
+++ b/aom_dsp/arm/idct4x4_add_neon.c
@@ -11,6 +11,8 @@
 
 #include <arm_neon.h>
 
+#include "aom_dsp/txfm_common.h"
+
 void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
   uint8x8_t d26u8, d27u8;
   uint32x2_t d26u32, d27u32;
@@ -22,9 +24,6 @@ void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
   int16x4x2_t d0x2s16, d1x2s16;
   int32x4x2_t q0x2s32;
   uint8_t *d;
-  int16_t cospi_8_64 = 15137;
-  int16_t cospi_16_64 = 11585;
-  int16_t cospi_24_64 = 6270;
 
   d26u32 = d27u32 = vdup_n_u32(0);
 
@@ -41,8 +40,8 @@ void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
   q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
   q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
 
-  d20s16 = vdup_n_s16(cospi_8_64);
-  d21s16 = vdup_n_s16(cospi_16_64);
+  d20s16 = vdup_n_s16((int16_t)cospi_8_64);
+  d21s16 = vdup_n_s16((int16_t)cospi_16_64);
 
   q0x2s32 =
       vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
@@ -51,7 +50,7 @@ void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
   d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
   d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
 
-  d22s16 = vdup_n_s16(cospi_24_64);
+  d22s16 = vdup_n_s16((int16_t)cospi_24_64);
 
   // stage 1
   d23s16 = vadd_s16(d16s16, d18s16);
diff --git a/aom_dsp/arm/idct8x8_1_add_neon.c b/aom_dsp/arm/idct8x8_1_add_neon.c
index fcc2a2fcd..c7926f9e4 100644
--- a/aom_dsp/arm/idct8x8_1_add_neon.c
+++ b/aom_dsp/arm/idct8x8_1_add_neon.c
@@ -20,7 +20,7 @@ void aom_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
   uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
   int16x8_t q0s16;
   uint8_t *d1, *d2;
-  int16_t i, a1, cospi_16_64 = 11585;
+  int16_t i, a1;
   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
   out = dct_const_round_shift(out * cospi_16_64);
   a1 = ROUND_POWER_OF_TWO(out, 5);
diff --git a/aom_dsp/arm/idct8x8_add_neon.c b/aom_dsp/arm/idct8x8_add_neon.c
index 8e752105b..8ad70862d 100644
--- a/aom_dsp/arm/idct8x8_add_neon.c
+++ b/aom_dsp/arm/idct8x8_add_neon.c
@@ -90,10 +90,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
   int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
   int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
 
-  d0s16 = vdup_n_s16(cospi_28_64);
-  d1s16 = vdup_n_s16(cospi_4_64);
-  d2s16 = vdup_n_s16(cospi_12_64);
-  d3s16 = vdup_n_s16(cospi_20_64);
+  d0s16 = vdup_n_s16((int16_t)cospi_28_64);
+  d1s16 = vdup_n_s16((int16_t)cospi_4_64);
+  d2s16 = vdup_n_s16((int16_t)cospi_12_64);
+  d3s16 = vdup_n_s16((int16_t)cospi_20_64);
 
   d16s16 = vget_low_s16(*q8s16);
   d17s16 = vget_high_s16(*q8s16);
@@ -146,7 +146,7 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
   q6s16 = vcombine_s16(d12s16, d13s16);
   q7s16 = vcombine_s16(d14s16, d15s16);
 
-  d0s16 = vdup_n_s16(cospi_16_64);
+  d0s16 = vdup_n_s16((int16_t)cospi_16_64);
 
   q2s32 = vmull_s16(d16s16, d0s16);
   q3s32 = vmull_s16(d17s16, d0s16);
@@ -158,8 +158,8 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
   q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
   q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
 
-  d0s16 = vdup_n_s16(cospi_24_64);
-  d1s16 = vdup_n_s16(cospi_8_64);
+  d0s16 = vdup_n_s16((int16_t)cospi_24_64);
+  d1s16 = vdup_n_s16((int16_t)cospi_8_64);
 
   d18s16 = vqrshrn_n_s32(q2s32, 14);
   d19s16 = vqrshrn_n_s32(q3s32, 14);
@@ -199,7 +199,7 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
   d28s16 = vget_low_s16(*q14s16);
   d29s16 = vget_high_s16(*q14s16);
 
-  d16s16 = vdup_n_s16(cospi_16_64);
+  d16s16 = vdup_n_s16((int16_t)cospi_16_64);
 
   q9s32 = vmull_s16(d28s16, d16s16);
   q10s32 = vmull_s16(d29s16, d16s16);
@@ -356,29 +356,29 @@ void aom_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
 
   // First transform rows
   // stage 1
-  q0s16 = vdupq_n_s16(cospi_28_64 * 2);
-  q1s16 = vdupq_n_s16(cospi_4_64 * 2);
+  q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2);
+  q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2);
 
   q4s16 = vqrdmulhq_s16(q9s16, q0s16);
 
-  q0s16 = vdupq_n_s16(-cospi_20_64 * 2);
+  q0s16 = vdupq_n_s16(-(int16_t)cospi_20_64 * 2);
 
   q7s16 = vqrdmulhq_s16(q9s16, q1s16);
 
-  q1s16 = vdupq_n_s16(cospi_12_64 * 2);
+  q1s16 = vdupq_n_s16((int16_t)cospi_12_64 * 2);
 
   q5s16 = vqrdmulhq_s16(q11s16, q0s16);
 
-  q0s16 = vdupq_n_s16(cospi_16_64 * 2);
+  q0s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2);
 
   q6s16 = vqrdmulhq_s16(q11s16, q1s16);
 
   // stage 2 & stage 3 - even half
-  q1s16 = vdupq_n_s16(cospi_24_64 * 2);
+  q1s16 = vdupq_n_s16((int16_t)cospi_24_64 * 2);
 
   q9s16 = vqrdmulhq_s16(q8s16, q0s16);
 
-  q0s16 = vdupq_n_s16(cospi_8_64 * 2);
+  q0s16 = vdupq_n_s16((int16_t)cospi_8_64 * 2);
 
   q13s16 = vqrdmulhq_s16(q10s16, q1s16);
 
@@ -400,7 +400,7 @@ void aom_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
   d28s16 = vget_low_s16(q14s16);
   d29s16 = vget_high_s16(q14s16);
 
-  d16s16 = vdup_n_s16(cospi_16_64);
+  d16s16 = vdup_n_s16((int16_t)cospi_16_64);
   q9s32 = vmull_s16(d28s16, d16s16);
   q10s32 = vmull_s16(d29s16, d16s16);
   q11s32 = vmull_s16(d28s16, d16s16);
diff --git a/aom_dsp/fwd_txfm.c b/aom_dsp/fwd_txfm.c
index fadae2b8e..547919f88 100644
--- a/aom_dsp/fwd_txfm.c
+++ b/aom_dsp/fwd_txfm.c
@@ -9,8 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./aom_dsp_rtcd.h"
 #include "aom_dsp/fwd_txfm.h"
+#include <assert.h>
+#include "./aom_dsp_rtcd.h"
 
 void aom_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
   // The 2D transform is done with two passes which are actually pretty
@@ -22,36 +23,37 @@ void aom_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
   int pass;
   // We need an intermediate buffer between passes.
   tran_low_t intermediate[4 * 4];
-  const int16_t *in_pass0 = input;
-  const tran_low_t *in = NULL;
+  const tran_low_t *in_low = NULL;
   tran_low_t *out = intermediate;
   // Do the two transform/transpose passes
   for (pass = 0; pass < 2; ++pass) {
-    tran_high_t input[4];      // canbe16
+    tran_high_t in_high[4];    // canbe16
     tran_high_t step[4];       // canbe16
     tran_high_t temp1, temp2;  // needs32
     int i;
     for (i = 0; i < 4; ++i) {
      // Load inputs.
- if (0 == pass) { - input[0] = in_pass0[0 * stride] * 16; - input[1] = in_pass0[1 * stride] * 16; - input[2] = in_pass0[2 * stride] * 16; - input[3] = in_pass0[3 * stride] * 16; - if (i == 0 && input[0]) { - input[0] += 1; + if (pass == 0) { + in_high[0] = input[0 * stride] * 16; + in_high[1] = input[1 * stride] * 16; + in_high[2] = input[2 * stride] * 16; + in_high[3] = input[3 * stride] * 16; + if (i == 0 && in_high[0]) { + ++in_high[0]; } } else { - input[0] = in[0 * 4]; - input[1] = in[1 * 4]; - input[2] = in[2 * 4]; - input[3] = in[3 * 4]; + assert(in_low != NULL); + in_high[0] = in_low[0 * 4]; + in_high[1] = in_low[1 * 4]; + in_high[2] = in_low[2 * 4]; + in_high[3] = in_low[3 * 4]; + ++in_low; } // Transform. - step[0] = input[0] + input[3]; - step[1] = input[1] + input[2]; - step[2] = input[1] - input[2]; - step[3] = input[0] - input[3]; + step[0] = in_high[0] + in_high[3]; + step[1] = in_high[1] + in_high[2]; + step[2] = in_high[1] - in_high[2]; + step[3] = in_high[0] - in_high[3]; temp1 = (step[0] + step[1]) * cospi_16_64; temp2 = (step[0] - step[1]) * cospi_16_64; out[0] = (tran_low_t)fdct_round_shift(temp1); @@ -61,12 +63,11 @@ void aom_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { out[1] = (tran_low_t)fdct_round_shift(temp1); out[3] = (tran_low_t)fdct_round_shift(temp2); // Do next column (which is a transposed row in second/horizontal pass) - in_pass0++; - in++; + ++input; out += 4; } // Setup in/out for next pass. - in = intermediate; + in_low = intermediate; out = output; } @@ -100,7 +101,6 @@ void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { tran_high_t t0, t1, t2, t3; // needs32 tran_high_t x0, x1, x2, x3; // canbe16 - int i; for (i = 0; i < 8; i++) { // stage 1 if (pass == 0) { @@ -191,56 +191,57 @@ void aom_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { int pass; // We need an intermediate buffer between passes. tran_low_t intermediate[256]; - const int16_t *in_pass0 = input; - const tran_low_t *in = NULL; + const tran_low_t *in_low = NULL; tran_low_t *out = intermediate; // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { tran_high_t step1[8]; // canbe16 tran_high_t step2[8]; // canbe16 tran_high_t step3[8]; // canbe16 - tran_high_t input[8]; // canbe16 + tran_high_t in_high[8]; // canbe16 tran_high_t temp1, temp2; // needs32 int i; for (i = 0; i < 16; i++) { if (0 == pass) { // Calculate input for the first 8 results. - input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4; - input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4; - input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4; - input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4; - input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4; - input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4; - input[6] = (in_pass0[6 * stride] + in_pass0[9 * stride]) * 4; - input[7] = (in_pass0[7 * stride] + in_pass0[8 * stride]) * 4; + in_high[0] = (input[0 * stride] + input[15 * stride]) * 4; + in_high[1] = (input[1 * stride] + input[14 * stride]) * 4; + in_high[2] = (input[2 * stride] + input[13 * stride]) * 4; + in_high[3] = (input[3 * stride] + input[12 * stride]) * 4; + in_high[4] = (input[4 * stride] + input[11 * stride]) * 4; + in_high[5] = (input[5 * stride] + input[10 * stride]) * 4; + in_high[6] = (input[6 * stride] + input[9 * stride]) * 4; + in_high[7] = (input[7 * stride] + input[8 * stride]) * 4; // Calculate input for the next 8 results. 
- step1[0] = (in_pass0[7 * stride] - in_pass0[8 * stride]) * 4; - step1[1] = (in_pass0[6 * stride] - in_pass0[9 * stride]) * 4; - step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4; - step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4; - step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4; - step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4; - step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4; - step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4; + step1[0] = (input[7 * stride] - input[8 * stride]) * 4; + step1[1] = (input[6 * stride] - input[9 * stride]) * 4; + step1[2] = (input[5 * stride] - input[10 * stride]) * 4; + step1[3] = (input[4 * stride] - input[11 * stride]) * 4; + step1[4] = (input[3 * stride] - input[12 * stride]) * 4; + step1[5] = (input[2 * stride] - input[13 * stride]) * 4; + step1[6] = (input[1 * stride] - input[14 * stride]) * 4; + step1[7] = (input[0 * stride] - input[15 * stride]) * 4; } else { // Calculate input for the first 8 results. - input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2); - input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2); - input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2); - input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2); - input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2); - input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2); - input[6] = ((in[6 * 16] + 1) >> 2) + ((in[9 * 16] + 1) >> 2); - input[7] = ((in[7 * 16] + 1) >> 2) + ((in[8 * 16] + 1) >> 2); + assert(in_low != NULL); + in_high[0] = ((in_low[0 * 16] + 1) >> 2) + ((in_low[15 * 16] + 1) >> 2); + in_high[1] = ((in_low[1 * 16] + 1) >> 2) + ((in_low[14 * 16] + 1) >> 2); + in_high[2] = ((in_low[2 * 16] + 1) >> 2) + ((in_low[13 * 16] + 1) >> 2); + in_high[3] = ((in_low[3 * 16] + 1) >> 2) + ((in_low[12 * 16] + 1) >> 2); + in_high[4] = ((in_low[4 * 16] + 1) >> 2) + ((in_low[11 * 16] + 1) >> 2); + in_high[5] = ((in_low[5 * 16] + 1) >> 2) + ((in_low[10 * 16] + 1) >> 2); + in_high[6] = ((in_low[6 * 16] + 1) >> 2) + ((in_low[9 * 16] + 1) >> 2); + in_high[7] = ((in_low[7 * 16] + 1) >> 2) + ((in_low[8 * 16] + 1) >> 2); // Calculate input for the next 8 results. 
- step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[8 * 16] + 1) >> 2); - step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[9 * 16] + 1) >> 2); - step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2); - step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2); - step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2); - step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2); - step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2); - step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2); + step1[0] = ((in_low[7 * 16] + 1) >> 2) - ((in_low[8 * 16] + 1) >> 2); + step1[1] = ((in_low[6 * 16] + 1) >> 2) - ((in_low[9 * 16] + 1) >> 2); + step1[2] = ((in_low[5 * 16] + 1) >> 2) - ((in_low[10 * 16] + 1) >> 2); + step1[3] = ((in_low[4 * 16] + 1) >> 2) - ((in_low[11 * 16] + 1) >> 2); + step1[4] = ((in_low[3 * 16] + 1) >> 2) - ((in_low[12 * 16] + 1) >> 2); + step1[5] = ((in_low[2 * 16] + 1) >> 2) - ((in_low[13 * 16] + 1) >> 2); + step1[6] = ((in_low[1 * 16] + 1) >> 2) - ((in_low[14 * 16] + 1) >> 2); + step1[7] = ((in_low[0 * 16] + 1) >> 2) - ((in_low[15 * 16] + 1) >> 2); + in_low++; } // Work on the first eight values; fdct8(input, even_results); { @@ -249,14 +250,14 @@ void aom_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { tran_high_t x0, x1, x2, x3; // canbe16 // stage 1 - s0 = input[0] + input[7]; - s1 = input[1] + input[6]; - s2 = input[2] + input[5]; - s3 = input[3] + input[4]; - s4 = input[3] - input[4]; - s5 = input[2] - input[5]; - s6 = input[1] - input[6]; - s7 = input[0] - input[7]; + s0 = in_high[0] + in_high[7]; + s1 = in_high[1] + in_high[6]; + s2 = in_high[2] + in_high[5]; + s3 = in_high[3] + in_high[4]; + s4 = in_high[3] - in_high[4]; + s5 = in_high[2] - in_high[5]; + s6 = in_high[1] - in_high[6]; + s7 = in_high[0] - in_high[7]; // fdct4(step, step); x0 = s0 + s3; @@ -351,12 +352,11 @@ void aom_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { out[15] = (tran_low_t)fdct_round_shift(temp2); } // Do next column (which is a transposed row in second/horizontal pass) - in++; - in_pass0++; + input++; out += 16; } // Setup in/out for next pass. 
- in = intermediate; + in_low = intermediate; out = output; } } diff --git a/aom_dsp/mips/aom_convolve_msa.h b/aom_dsp/mips/aom_convolve_msa.h index 4efbcbcad..1a0ae4d8d 100644 --- a/aom_dsp/mips/aom_convolve_msa.h +++ b/aom_dsp/mips/aom_convolve_msa.h @@ -17,18 +17,18 @@ extern const uint8_t mc_filt_mask_arr[16 * 3]; -#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \ - filt3) \ - ({ \ - v8i16 tmp0, tmp1; \ - \ - tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ - tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1); \ - tmp1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \ - tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)vec3, (v16i8)filt3); \ - tmp0 = __msa_adds_s_h(tmp0, tmp1); \ - \ - tmp0; \ +#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \ + filt3) \ + ({ \ + v8i16 tmp_dpadd_0, tmp_dpadd_1; \ + \ + tmp_dpadd_0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ + tmp_dpadd_0 = __msa_dpadd_s_h(tmp_dpadd_0, (v16i8)vec1, (v16i8)filt1); \ + tmp_dpadd_1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \ + tmp_dpadd_1 = __msa_dpadd_s_h(tmp_dpadd_1, (v16i8)vec3, (v16i8)filt3); \ + tmp_dpadd_0 = __msa_adds_s_h(tmp_dpadd_0, tmp_dpadd_1); \ + \ + tmp_dpadd_0; \ }) #define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_h0, \ @@ -115,11 +115,10 @@ extern const uint8_t mc_filt_mask_arr[16 * 3]; stride) \ { \ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ \ PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \ PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \ } #endif /* AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ */ diff --git a/aom_dsp/mips/inv_txfm_msa.h b/aom_dsp/mips/inv_txfm_msa.h index ce2065bb2..122667aa8 100644 --- a/aom_dsp/mips/inv_txfm_msa.h +++ b/aom_dsp/mips/inv_txfm_msa.h @@ -197,18 +197,18 @@ out2, out3) \ { \ v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ - v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v4i32 tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd; \ \ ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \ ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \ DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, cst0, cst0, cst1, \ - cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \ + cst1, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \ + SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \ + PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out0, out1); \ DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, cst2, cst2, cst3, \ - cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3); \ + cst3, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \ + SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \ + PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out2, out3); \ } /* idct 8x8 macro */ diff --git a/aom_dsp/mips/loopfilter_msa.h b/aom_dsp/mips/loopfilter_msa.h index c1cabc2f2..450594262 100644 --- a/aom_dsp/mips/loopfilter_msa.h +++ b/aom_dsp/mips/loopfilter_msa.h @@ -123,35 +123,35 @@ p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ } -#define AOM_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ - { \ - v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \ - v16u8 
zero_in = { 0 }; \ - \ - tmp = __msa_ori_b(zero_in, 1); \ - p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \ - q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \ - p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \ - q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \ - \ - p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \ - flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \ - p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \ - flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \ - \ - flat_out = (tmp < (v16u8)flat_out); \ - flat_out = __msa_xori_b(flat_out, 0xff); \ - flat_out = flat_out & (mask); \ +#define AOM_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ + { \ + v16u8 tmp_flat4, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \ + v16u8 zero_in = { 0 }; \ + \ + tmp_flat4 = __msa_ori_b(zero_in, 1); \ + p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \ + q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \ + p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \ + q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \ + \ + p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \ + flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \ + p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \ + flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \ + \ + flat_out = (tmp_flat4 < (v16u8)flat_out); \ + flat_out = __msa_xori_b(flat_out, 0xff); \ + flat_out = flat_out & (mask); \ } #define AOM_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \ q6_in, q7_in, flat_in, flat2_out) \ { \ - v16u8 tmp, zero_in = { 0 }; \ + v16u8 tmp_flat5, zero_in = { 0 }; \ v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \ v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \ \ - tmp = __msa_ori_b(zero_in, 1); \ + tmp_flat5 = __msa_ori_b(zero_in, 1); \ p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \ q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \ p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \ @@ -169,7 +169,7 @@ p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \ flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \ \ - flat2_out = (tmp < (v16u8)flat2_out); \ + flat2_out = (tmp_flat5 < (v16u8)flat2_out); \ flat2_out = __msa_xori_b(flat2_out, 0xff); \ flat2_out = flat2_out & flat_in; \ } @@ -178,38 +178,38 @@ p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \ q1_filt8_out, q2_filt8_out) \ { \ - v8u16 tmp0, tmp1, tmp2; \ + v8u16 tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \ \ - tmp2 = p2_in + p1_in + p0_in; \ - tmp0 = p3_in << 1; \ + tmp_filt8_2 = p2_in + p1_in + p0_in; \ + tmp_filt8_0 = p3_in << 1; \ \ - tmp0 = tmp0 + tmp2 + q0_in; \ - tmp1 = tmp0 + p3_in + p2_in; \ - p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + tmp_filt8_0 = tmp_filt8_0 + tmp_filt8_2 + q0_in; \ + tmp_filt8_1 = tmp_filt8_0 + p3_in + p2_in; \ + p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ \ - tmp1 = tmp0 + p1_in + q1_in; \ - p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + tmp_filt8_1 = tmp_filt8_0 + p1_in + q1_in; \ + p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ \ - tmp1 = q2_in + q1_in + q0_in; \ - tmp2 = tmp2 + tmp1; \ - tmp0 = tmp2 + (p0_in); \ - tmp0 = tmp0 + (p3_in); \ - p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp0, 3); \ + tmp_filt8_1 = q2_in + q1_in + q0_in; \ + tmp_filt8_2 = tmp_filt8_2 + tmp_filt8_1; \ + tmp_filt8_0 = tmp_filt8_2 + (p0_in); \ + tmp_filt8_0 = tmp_filt8_0 + (p3_in); \ + p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_0, 3); \ \ - tmp0 = q2_in + q3_in; \ - tmp0 = p0_in + tmp1 + tmp0; \ - tmp1 = q3_in + q3_in; \ - tmp1 = tmp1 + tmp0; \ - q2_filt8_out = 
(v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + tmp_filt8_0 = q2_in + q3_in; \ + tmp_filt8_0 = p0_in + tmp_filt8_1 + tmp_filt8_0; \ + tmp_filt8_1 = q3_in + q3_in; \ + tmp_filt8_1 = tmp_filt8_1 + tmp_filt8_0; \ + q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ \ - tmp0 = tmp2 + q3_in; \ - tmp1 = tmp0 + q0_in; \ - q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + tmp_filt8_0 = tmp_filt8_2 + q3_in; \ + tmp_filt8_1 = tmp_filt8_0 + q0_in; \ + q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ \ - tmp1 = tmp0 - p2_in; \ - tmp0 = q1_in + q3_in; \ - tmp1 = tmp0 + tmp1; \ - q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + tmp_filt8_1 = tmp_filt8_0 - p2_in; \ + tmp_filt8_0 = q1_in + q3_in; \ + tmp_filt8_1 = tmp_filt8_0 + tmp_filt8_1; \ + q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ } #define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ diff --git a/aom_dsp/mips/macros_msa.h b/aom_dsp/mips/macros_msa.h index 7d0ba4bfc..48fbcfd47 100644 --- a/aom_dsp/mips/macros_msa.h +++ b/aom_dsp/mips/macros_msa.h @@ -169,20 +169,20 @@ val_m; \ }) #else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ - \ - val0_m = LW(psrc_m1); \ - val1_m = LW(psrc_m1 + 4); \ - \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t val_m_combined = 0; \ + \ + val0_m = LW(psrc_m1); \ + val1_m = LW(psrc_m1 + 4); \ + \ + val_m_combined = (uint64_t)(val1_m); \ + val_m_combined = (uint64_t)((val_m_combined << 32) & 0xFFFFFFFF00000000); \ + val_m_combined = (uint64_t)(val_m_combined | (uint64_t)val0_m); \ + \ + val_m_combined; \ }) #endif // (__mips == 64) @@ -2020,13 +2020,12 @@ pdst, stride) \ { \ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ \ tmp0_m = PCKEV_XORI128_UB(in0, in1); \ tmp1_m = PCKEV_XORI128_UB(in2, in3); \ ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \ } /* Description : Pack even byte elements and store byte vector in destination diff --git a/aom_dsp/x86/inv_txfm_sse2.c b/aom_dsp/x86/inv_txfm_sse2.c index 61f548a51..4735d973e 100644 --- a/aom_dsp/x86/inv_txfm_sse2.c +++ b/aom_dsp/x86/inv_txfm_sse2.c @@ -2372,7 +2372,6 @@ void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, #define IDCT32_34 \ /* Stage1 */ \ { \ - const __m128i zero = _mm_setzero_si128(); \ const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ \ @@ -2397,7 +2396,6 @@ void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, \ /* Stage2 */ \ { \ - const __m128i zero = _mm_setzero_si128(); \ const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ \ @@ -2424,7 +2422,6 @@ void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, \ /* Stage3 */ \ { \ - const __m128i zero = _mm_setzero_si128(); \ const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \ const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ \ @@ -2465,7 +2462,6 @@ void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t 
*dest, \ /* Stage4 */ \ { \ - const __m128i zero = _mm_setzero_si128(); \ const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ \ @@ -3002,6 +2998,7 @@ void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, // Only upper-left 8x8 has non-zero coeff void aom_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { + const __m128i zero = _mm_setzero_si128(); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1 << 5); @@ -3107,7 +3104,6 @@ void aom_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, col[31] = _mm_sub_epi16(stp1_0, stp1_31); for (i = 0; i < 4; i++) { int j; - const __m128i zero = _mm_setzero_si128(); // Transpose 32x8 block to 8x32 block array_transpose_8x8(col + i * 8, in); IDCT32_34 -- 2.50.0
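
A note for readers on the two warning classes this patch addresses. First, `-Wshadow` itself: the renames (`input` -> `in_high`, `in` -> `in_low`, `tmp` -> `tmp_flat4`, and so on) remove declarations that hide an identifier from an enclosing scope. Below is a minimal standalone sketch of the pattern, not aom code; the file and function names (`shadow_demo.c`, `column_sum`) are illustrative stand-ins.

```c
/* shadow_demo.c -- illustrative only; compile with:
 *   cc -Wshadow -Wall -c shadow_demo.c */
typedef long tran_high_t; /* stand-in for the real aom typedef */

long column_sum(const short *input, int stride) {
  long total = 0;
  int pass, i;
  for (pass = 0; pass < 2; ++pass) {
    /* Before this patch, aom_fdct4x4_c declared a local array named
     * 'input' at a point like this, hiding the 'input' parameter:
     *
     *   tran_high_t input[4];  // -Wshadow: declaration shadows a parameter
     *
     * Renaming the local ('in_high', as in the patch) keeps the parameter
     * visible and silences the warning. */
    tran_high_t in_high[4];
    for (i = 0; i < 4; ++i) in_high[i] = input[i * stride] * 16;
    for (i = 0; i < 4; ++i) total += in_high[i];
  }
  return total;
}
```

Second, the `(int16_t)`/`(int32_t)` casts sprinkled through the NEON files follow from the constant cleanup: the per-file `int16_t cospi_* = ...;` definitions (one of which, in `idct8x8_1_add_neon.c`, shadowed the shared constant) are dropped in favor of the wider `tran_high_t` constants from `aom_dsp/txfm_common.h`, so passing them to 16-bit intrinsics such as `vdup_n_s16()` now needs an explicit narrowing cast, e.g. `vdup_n_s16((int16_t)cospi_16_64)`.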