]> granicus.if.org Git - libvpx/commitdiff
vp8,vpx_dsp:[loongson] fix bugs reported by clang
authorjinbo <jinbo-hf@loongson.cn>
Wed, 1 Jul 2020 00:56:25 +0000 (08:56 +0800)
committerjinbo <jinbo-hf@loongson.cn>
Tue, 7 Jul 2020 01:25:58 +0000 (09:25 +0800)
1. Adjust variable type to match clang compiler.
Clang is more strict on the type of asm operands, float or double
type variable should use constraint 'f', integer variable should
use constraint 'r'.

2. Fix the problem of using an r-value in output operands.
clang reports the error: 'invalid use of a cast in a inline asm context
requiring an l-value: remove the cast or build with -fheinous-gnu-extensions'.

Change-Id: Iae9e08f55f249059066c391534013e320812463e

vp8/common/mips/mmi/idctllm_mmi.c
vp8/common/mips/mmi/loopfilter_filters_mmi.c
vp8/common/mips/mmi/sixtap_filter_mmi.c
vp8/encoder/mips/mmi/dct_mmi.c
vp8/encoder/mips/mmi/vp8_quantize_mmi.c
vpx_dsp/mips/sad_mmi.c
vpx_dsp/mips/variance_mmi.c

index 4fad1d347f1e28bca7d4c6d3fbe570bd765d6b16..a35689dd30a3552234c25ea092fb4f07be6df8f1 100644 (file)
@@ -41,14 +41,18 @@ void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr,
                               int pred_stride, unsigned char *dst_ptr,
                               int dst_stride) {
   double ftmp[12];
-  uint32_t tmp[0];
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_04) = { 0x0004000400040004ULL };
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_4e7b) = { 0x4e7b4e7b4e7b4e7bULL };
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_22a3) = { 0x22a322a322a322a3ULL };
+  uint64_t tmp[1];
+  double ff_ph_04, ff_ph_4e7b, ff_ph_22a3;
 
   __asm__ volatile (
+    "dli        %[tmp0],    0x0004000400040004                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_04]                         \n\t"
+    "dli        %[tmp0],    0x4e7b4e7b4e7b4e7b                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_4e7b]                       \n\t"
+    "dli        %[tmp0],    0x22a322a322a322a3                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_22a3]                       \n\t"
     MMI_LI(%[tmp0], 0x02)
-    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                           \n\t"
     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
 
     "gsldlc1    %[ftmp1],   0x07(%[ip])                         \n\t"
@@ -186,9 +190,10 @@ void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr,
       [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]),
       [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]),
       [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]),
-      [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr)
-    : [ip]"r"(input), [ff_ph_22a3]"f"(ff_ph_22a3),
-      [ff_ph_4e7b]"f"(ff_ph_4e7b), [ff_ph_04]"f"(ff_ph_04),
+      [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr),
+      [ff_ph_4e7b]"=&f"(ff_ph_4e7b), [ff_ph_04]"=&f"(ff_ph_04),
+      [ff_ph_22a3]"=&f"(ff_ph_22a3)
+    : [ip]"r"(input),
       [pred_stride]"r"((mips_reg)pred_stride),
       [dst_stride]"r"((mips_reg)dst_stride)
     : "memory"
@@ -198,12 +203,13 @@ void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr,
 void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr,
                               int pred_stride, unsigned char *dst_ptr,
                               int dst_stride) {
-  int a1 = ((input_dc + 4) >> 3);
-  double ftmp[5];
+  int a0 = ((input_dc + 4) >> 3);
+  double a1, ftmp[5];
   int low32;
 
   __asm__ volatile (
     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
+    "dmtc1      %[a0],      %[a1]                           \n\t"
     "pshufh     %[a1],      %[a1],          %[ftmp0]        \n\t"
     "ulw        %[low32],   0x00(%[pred_ptr])               \n\t"
     "mtc1       %[low32],   %[ftmp1]                        \n\t"
@@ -244,9 +250,9 @@ void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr,
     "gsswrc1    %[ftmp1],   0x00(%[dst_ptr])                \n\t"
     : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]),
       [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [low32]"=&r"(low32),
-      [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr)
+      [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr), [a1]"=&f"(a1)
     : [dst_stride]"r"((mips_reg)dst_stride),
-      [pred_stride]"r"((mips_reg)pred_stride), [a1]"f"(a1)
+      [pred_stride]"r"((mips_reg)pred_stride), [a0]"r"(a0)
     : "memory"
   );
 }
@@ -254,14 +260,15 @@ void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr,
 void vp8_short_inv_walsh4x4_mmi(int16_t *input, int16_t *mb_dqcoeff) {
   int i;
   int16_t output[16];
-  double ftmp[12];
-  uint32_t tmp[1];
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_03) = { 0x0003000300030003ULL };
+  double ff_ph_03, ftmp[12];
+  uint64_t tmp[1];
 
   __asm__ volatile (
+    "dli        %[tmp0],    0x0003000300030003                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_03]                         \n\t"
     MMI_LI(%[tmp0], 0x03)
     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                           \n\t"
     "gsldlc1    %[ftmp1],   0x07(%[ip])                         \n\t"
     "gsldrc1    %[ftmp1],   0x00(%[ip])                         \n\t"
     "gsldlc1    %[ftmp2],   0x0f(%[ip])                         \n\t"
@@ -317,8 +324,8 @@ void vp8_short_inv_walsh4x4_mmi(int16_t *input, int16_t *mb_dqcoeff) {
       [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
       [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]),
       [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]),
-      [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0])
-    : [ip]"r"(input), [op]"r"(output), [ff_ph_03]"f"(ff_ph_03)
+      [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), [ff_ph_03]"=&f"(ff_ph_03)
+    : [ip]"r"(input), [op]"r"(output)
     : "memory"
   );
 
index fc1240cc27e72a02d6eafee9113e6e66a4f2a5d1..a07a7e3b41d3362fbd4f4d06087f3aaeee4d3fb0 100644 (file)
 #include "vp8/common/onyxc_int.h"
 #include "vpx_ports/asmdefs_mmi.h"
 
-DECLARE_ALIGNED(8, static const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
-DECLARE_ALIGNED(8, static const uint64_t,
-                ff_ph_003f) = { 0x003f003f003f003fULL };
-DECLARE_ALIGNED(8, static const uint64_t,
-                ff_ph_0900) = { 0x0900090009000900ULL };
-DECLARE_ALIGNED(8, static const uint64_t,
-                ff_ph_1200) = { 0x1200120012001200ULL };
-DECLARE_ALIGNED(8, static const uint64_t,
-                ff_ph_1b00) = { 0x1b001b001b001b00ULL };
-DECLARE_ALIGNED(8, static const uint64_t, ff_pb_fe) = { 0xfefefefefefefefeULL };
-DECLARE_ALIGNED(8, static const uint64_t, ff_pb_80) = { 0x8080808080808080ULL };
-DECLARE_ALIGNED(8, static const uint64_t, ff_pb_04) = { 0x0404040404040404ULL };
-DECLARE_ALIGNED(8, static const uint64_t, ff_pb_03) = { 0x0303030303030303ULL };
-DECLARE_ALIGNED(8, static const uint64_t, ff_pb_01) = { 0x0101010101010101ULL };
-
 void vp8_loop_filter_horizontal_edge_mmi(
     unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
     const unsigned char *limit, const unsigned char *thresh, int count) {
-  uint32_t tmp[1];
+  uint64_t tmp[1];
   mips_reg addr[2];
   double ftmp[12];
+  double ff_ph_01, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03;
+  /* clang-format off */
   __asm__ volatile (
+    "dli        %[tmp0],    0x0001000100010001                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_01]                             \n\t"
+    "dli        %[tmp0],    0xfefefefefefefefe                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_fe]                             \n\t"
+    "dli        %[tmp0],    0x8080808080808080                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_80]                             \n\t"
+    "dli        %[tmp0],    0x0404040404040404                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_04]                             \n\t"
+    "dli        %[tmp0],    0x0303030303030303                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_03]                             \n\t"
     "1:                                                             \n\t"
     "gsldlc1    %[ftmp10],  0x07(%[limit])                          \n\t"
     "gsldrc1    %[ftmp10],  0x00(%[limit])                          \n\t"
@@ -91,9 +88,9 @@ void vp8_loop_filter_horizontal_edge_mmi(
     "pasubub    %[ftmp1],   %[ftmp5],           %[ftmp6]            \n\t"
     "paddusb    %[ftmp1],   %[ftmp1],           %[ftmp1]            \n\t"
     "pasubub    %[ftmp2],   %[ftmp4],           %[ftmp7]            \n\t"
-    "and        %[ftmp2],   %[ftmp2],           %[ff_pb_fe]         \n\t"
-    "li         %[tmp0],    0x01                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp10]                               \n\t"
+    "pand       %[ftmp2],   %[ftmp2],           %[ff_pb_fe]         \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
     "psrlh      %[ftmp2],   %[ftmp2],           %[ftmp10]           \n\t"
     "paddusb    %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
     "gsldlc1    %[ftmp10],  0x07(%[blimit])                         \n\t"
@@ -134,8 +131,8 @@ void vp8_loop_filter_horizontal_edge_mmi(
     "punpcklbh  %[ftmp0],   %[ftmp0],           %[ftmp8]            \n\t"
     "punpckhbh  %[ftmp11],  %[ftmp11],          %[ftmp8]            \n\t"
 
-    "li         %[tmp0],    0x0b                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp10]                               \n\t"
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
     "psrah      %[ftmp0],   %[ftmp0],           %[ftmp10]           \n\t"
     "psrah      %[ftmp11],  %[ftmp11],          %[ftmp10]           \n\t"
     "packsshb   %[ftmp8],   %[ftmp0],           %[ftmp11]           \n\t"
@@ -149,8 +146,8 @@ void vp8_loop_filter_horizontal_edge_mmi(
     "packsshb   %[ftmp0],   %[ftmp0],           %[ftmp9]            \n\t"
     "paddsh     %[ftmp9],   %[ftmp9],           %[ff_ph_01]         \n\t"
 
-    "li         %[tmp0],    0x01                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp10]                               \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
     "psrah      %[ftmp11],  %[ftmp11],          %[ftmp10]           \n\t"
     "psrah      %[ftmp9],   %[ftmp9],           %[ftmp10]           \n\t"
     "packsshb   %[ftmp11],  %[ftmp11],          %[ftmp9]            \n\t"
@@ -188,17 +185,18 @@ void vp8_loop_filter_horizontal_edge_mmi(
       [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
       [tmp0]"=&r"(tmp[0]),
       [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
-      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count)
+      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count),
+      [ff_ph_01]"=&f"(ff_ph_01),        [ff_pb_fe]"=&f"(ff_pb_fe),
+      [ff_pb_80]"=&f"(ff_pb_80),        [ff_pb_04]"=&f"(ff_pb_04),
+      [ff_pb_03]"=&f"(ff_pb_03)
     : [limit]"r"(limit),                [blimit]"r"(blimit),
       [thresh]"r"(thresh),
       [src_pixel_step]"r"((mips_reg)src_pixel_step),
       [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
-      [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)),
-      [ff_ph_01]"f"(ff_ph_01),          [ff_pb_fe]"f"(ff_pb_fe),
-      [ff_pb_80]"f"(ff_pb_80),          [ff_pb_04]"f"(ff_pb_04),
-      [ff_pb_03]"f"(ff_pb_03)
+      [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2))
     : "memory"
   );
+  /* clang-format on */
 }
 
 void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
@@ -206,11 +204,23 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
                                        const unsigned char *blimit,
                                        const unsigned char *limit,
                                        const unsigned char *thresh, int count) {
-  uint32_t tmp[1];
+  uint64_t tmp[1];
   mips_reg addr[2];
   double ftmp[13];
+  double ff_pb_fe, ff_ph_01, ff_pb_03, ff_pb_04, ff_pb_80;
 
+  /* clang-format off */
   __asm__ volatile (
+    "dli        %[tmp0],    0xfefefefefefefefe                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_fe]                             \n\t"
+    "dli        %[tmp0],    0x0001000100010001                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_01]                             \n\t"
+    "dli        %[tmp0],    0x0303030303030303                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_03]                             \n\t"
+    "dli        %[tmp0],    0x0404040404040404                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_04]                             \n\t"
+    "dli        %[tmp0],    0x8080808080808080                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_80]                             \n\t"
     MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
     MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
     MMI_SUBU(%[src_ptr], %[src_ptr], 0x04)
@@ -315,8 +325,8 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
     /* abs (p1-q1) */
     "pasubub    %[ftmp12],  %[ftmp10],          %[ftmp5]            \n\t"
     "pand       %[ftmp12],  %[ftmp12],          %[ff_pb_fe]         \n\t"
-    "li         %[tmp0],    0x01                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp1]                                \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp1]                                \n\t"
     "psrlh      %[ftmp12],  %[ftmp12],          %[ftmp1]            \n\t"
     "paddusb    %[ftmp1],   %[ftmp11],          %[ftmp12]           \n\t"
     "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp8]            \n\t"
@@ -354,8 +364,8 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
     "paddsb     %[ftmp11],  %[ftmp2],           %[ff_pb_04]         \n\t"
     "paddsb     %[ftmp12],  %[ftmp2],           %[ff_pb_03]         \n\t"
 
-    "li         %[tmp0],    0x0b                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp7]                                \n\t"
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp7]                                \n\t"
     "pxor      %[ftmp0],    %[ftmp0],           %[ftmp0]            \n\t"
     "pxor      %[ftmp8],    %[ftmp8],           %[ftmp8]            \n\t"
     "punpcklbh %[ftmp0],    %[ftmp0],           %[ftmp12]           \n\t"
@@ -379,8 +389,8 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
     "paddsh     %[ftmp0],   %[ftmp0],           %[ff_ph_01]         \n\t"
     "paddsh     %[ftmp8],   %[ftmp8],           %[ff_ph_01]         \n\t"
 
-    "li         %[tmp0],    0x01                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp7]                                \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp7]                                \n\t"
     "psrah      %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
     "psrah      %[ftmp8],   %[ftmp8],           %[ftmp7]            \n\t"
     "packsshb   %[ftmp2],   %[ftmp0],           %[ftmp8]            \n\t"
@@ -450,15 +460,16 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
       [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
       [ftmp12]"=&f"(ftmp[12]),            [tmp0]"=&r"(tmp[0]),
       [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
-      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count)
+      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count),
+      [ff_ph_01]"=&f"(ff_ph_01),        [ff_pb_03]"=&f"(ff_pb_03),
+      [ff_pb_04]"=&f"(ff_pb_04),        [ff_pb_80]"=&f"(ff_pb_80),
+      [ff_pb_fe]"=&f"(ff_pb_fe)
     : [limit]"r"(limit),                [blimit]"r"(blimit),
       [thresh]"r"(thresh),
-      [src_pixel_step]"r"((mips_reg)src_pixel_step),
-      [ff_ph_01]"f"(ff_ph_01),          [ff_pb_03]"f"(ff_pb_03),
-      [ff_pb_04]"f"(ff_pb_04),          [ff_pb_80]"f"(ff_pb_80),
-      [ff_pb_fe]"f"(ff_pb_fe)
+      [src_pixel_step]"r"((mips_reg)src_pixel_step)
     : "memory"
   );
+  /* clang-format on */
 }
 
 /* clang-format off */
@@ -484,10 +495,29 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
 void vp8_mbloop_filter_horizontal_edge_mmi(
     unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
     const unsigned char *limit, const unsigned char *thresh, int count) {
-  uint32_t tmp[1];
+  uint64_t tmp[1];
   double ftmp[13];
+  double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03, ff_ph_003f, ff_ph_0900,
+      ff_ph_1200, ff_ph_1b00;
 
+  /* clang-format off */
   __asm__ volatile (
+    "dli        %[tmp0],    0xfefefefefefefefe                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_fe]                             \n\t"
+    "dli        %[tmp0],    0x8080808080808080                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_80]                             \n\t"
+    "dli        %[tmp0],    0x0404040404040404                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_04]                             \n\t"
+    "dli        %[tmp0],    0x0303030303030303                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_03]                             \n\t"
+    "dli        %[tmp0],    0x003f003f003f003f                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_003f]                           \n\t"
+    "dli        %[tmp0],    0x0900090009000900                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_0900]                           \n\t"
+    "dli        %[tmp0],    0x1200120012001200                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_1200]                           \n\t"
+    "dli        %[tmp0],    0x1b001b001b001b00                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_1b00]                           \n\t"
     MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
     MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
     "1:                                                             \n\t"
@@ -550,8 +580,8 @@ void vp8_mbloop_filter_horizontal_edge_mmi(
     "paddusb    %[ftmp1],   %[ftmp1],           %[ftmp1]            \n\t"
     "pasubub    %[ftmp2],   %[ftmp4],           %[ftmp7]            \n\t"
     "pand       %[ftmp2],   %[ftmp2],           %[ff_pb_fe]         \n\t"
-    "li         %[tmp0],    0x01                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
     "psrlh      %[ftmp2],   %[ftmp2],           %[ftmp9]            \n\t"
     "paddusb    %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
     "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp12]           \n\t"
@@ -584,8 +614,8 @@ void vp8_mbloop_filter_horizontal_edge_mmi(
     "pandn      %[ftmp12],  %[ftmp1],           %[ftmp2]            \n\t"
     "pand       %[ftmp2],   %[ftmp2],           %[ftmp1]            \n\t"
 
-    "li         %[tmp0],    0x0b                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
     "paddsb     %[ftmp0],   %[ftmp2],           %[ff_pb_03]         \n\t"
     VP8_MBLOOP_HPSRAB
     "paddsb     %[ftmp5],   %[ftmp5],           %[ftmp0]            \n\t"
@@ -593,8 +623,8 @@ void vp8_mbloop_filter_horizontal_edge_mmi(
     VP8_MBLOOP_HPSRAB
     "psubsb     %[ftmp6],   %[ftmp6],           %[ftmp0]            \n\t"
 
-    "li         %[tmp0],    0x07                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+    "dli        %[tmp0],    0x07                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
     "pxor       %[ftmp0],   %[ftmp0],           %[ftmp0]            \n\t"
 
     VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1b00])
@@ -649,18 +679,20 @@ void vp8_mbloop_filter_horizontal_edge_mmi(
       [ftmp8]"=&f"(ftmp[8]),              [ftmp9]"=&f"(ftmp[9]),
       [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
       [ftmp12]"=&f"(ftmp[12]),            [tmp0]"=&r"(tmp[0]),
-      [src_ptr]"+&r"(src_ptr),            [count]"+&r"(count)
+      [src_ptr]"+&r"(src_ptr),            [count]"+&r"(count),
+      [ff_pb_fe]"=&f"(ff_pb_fe),          [ff_pb_80]"=&f"(ff_pb_80),
+      [ff_pb_04]"=&f"(ff_pb_04),          [ff_pb_03]"=&f"(ff_pb_03),
+      [ff_ph_0900]"=&f"(ff_ph_0900),      [ff_ph_1b00]"=&f"(ff_ph_1b00),
+      [ff_ph_1200]"=&f"(ff_ph_1200),      [ff_ph_003f]"=&f"(ff_ph_003f)
     : [limit]"r"(limit),                  [blimit]"r"(blimit),
       [thresh]"r"(thresh),
-      [src_pixel_step]"r"((mips_reg)src_pixel_step),
-      [ff_pb_fe]"f"(ff_pb_fe),            [ff_pb_80]"f"(ff_pb_80),
-      [ff_pb_04]"f"(ff_pb_04),            [ff_pb_03]"f"(ff_pb_03),
-      [ff_ph_0900]"f"(ff_ph_0900),        [ff_ph_1b00]"f"(ff_ph_1b00),
-      [ff_ph_1200]"f"(ff_ph_1200),        [ff_ph_003f]"f"(ff_ph_003f)
+      [src_pixel_step]"r"((mips_reg)src_pixel_step)
     : "memory"
   );
+  /* clang-format on */
 }
 
+/* clang-format off */
 #define VP8_MBLOOP_VPSRAB_ADDH                                          \
   "pxor       %[ftmp7],   %[ftmp7],           %[ftmp7]            \n\t" \
   "pxor       %[ftmp8],   %[ftmp8],           %[ftmp8]            \n\t" \
@@ -673,15 +705,30 @@ void vp8_mbloop_filter_horizontal_edge_mmi(
   "psrah      %[ftmp7],   %[ftmp7],           %[ftmp12]           \n\t" \
   "psrah      %[ftmp8],   %[ftmp8],           %[ftmp12]           \n\t" \
   "packsshb   %[ftmp3],   %[ftmp7],           %[ftmp8]            \n\t"
+/* clang-format on */
 
 void vp8_mbloop_filter_vertical_edge_mmi(
     unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
     const unsigned char *limit, const unsigned char *thresh, int count) {
   mips_reg tmp[1];
-  DECLARE_ALIGNED(8, const uint64_t, srct[1]);
+  DECLARE_ALIGNED(8, const uint64_t, srct[2]);
   double ftmp[14];
+  double ff_ph_003f, ff_ph_0900, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03;
 
+  /* clang-format off */
   __asm__ volatile (
+    "dli        %[tmp0],    0x003f003f003f003f                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_003f]                           \n\t"
+    "dli        %[tmp0],    0x0900090009000900                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_0900]                           \n\t"
+    "dli        %[tmp0],    0xfefefefefefefefe                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_fe]                             \n\t"
+    "dli        %[tmp0],    0x8080808080808080                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_80]                             \n\t"
+    "dli        %[tmp0],    0x0404040404040404                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_04]                             \n\t"
+    "dli        %[tmp0],    0x0303030303030303                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_03]                             \n\t"
     MMI_SUBU(%[src_ptr], %[src_ptr], 0x04)
 
     "1:                                                             \n\t"
@@ -783,8 +830,8 @@ void vp8_mbloop_filter_vertical_edge_mmi(
     /* abs (p1-q1) / 2 */
     "pasubub    %[ftmp12],  %[ftmp10],          %[ftmp5]            \n\t"
     "pand       %[ftmp12],  %[ftmp12],          %[ff_pb_fe]         \n\t"
-    "li         %[tmp0],    0x01                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp8]                                \n\t"
     "psrlh      %[ftmp12],  %[ftmp12],          %[ftmp8]            \n\t"
     "paddusb    %[ftmp12],  %[ftmp1],           %[ftmp12]           \n\t"
     "psubusb    %[ftmp12],  %[ftmp12],          %[ftmp13]           \n\t"
@@ -824,8 +871,8 @@ void vp8_mbloop_filter_vertical_edge_mmi(
     "pandn      %[ftmp0],   %[ftmp1],           %[ftmp0]            \n\t"
 
     "paddsb     %[ftmp4],   %[ftmp3],           %[ff_pb_04]         \n\t"
-    "li         %[tmp0],    0x0b                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp12]                               \n\t"
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp12]                               \n\t"
     "punpcklbh  %[ftmp7],   %[ftmp7],           %[ftmp4]            \n\t"
     "punpckhbh  %[ftmp8],   %[ftmp8],           %[ftmp4]            \n\t"
     "psrah      %[ftmp7],   %[ftmp7],           %[ftmp12]           \n\t"
@@ -842,8 +889,8 @@ void vp8_mbloop_filter_vertical_edge_mmi(
     /* ftmp6: ps0 */
     "paddsb     %[ftmp6],   %[ftmp6],           %[ftmp3]            \n\t"
 
-    "li         %[tmp0],    0x07                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp12]                               \n\t"
+    "dli        %[tmp0],    0x07                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp12]                               \n\t"
     VP8_MBLOOP_VPSRAB_ADDH
     "paddh      %[ftmp1],   %[ff_ph_0900],      %[ff_ph_0900]       \n\t"
     "paddh      %[ftmp1],   %[ftmp1],           %[ff_ph_0900]       \n\t"
@@ -948,17 +995,19 @@ void vp8_mbloop_filter_vertical_edge_mmi(
       [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
       [ftmp12]"=&f"(ftmp[12]),            [ftmp13]"=&f"(ftmp[13]),
       [tmp0]"=&r"(tmp[0]),                [src_ptr]"+&r"(src_ptr),
-      [count]"+&r"(count)
+      [count]"+&r"(count),
+      [ff_ph_003f]"=&f"(ff_ph_003f),    [ff_ph_0900]"=&f"(ff_ph_0900),
+      [ff_pb_03]"=&f"(ff_pb_03),        [ff_pb_04]"=&f"(ff_pb_04),
+      [ff_pb_80]"=&f"(ff_pb_80),        [ff_pb_fe]"=&f"(ff_pb_fe)
     : [limit]"r"(limit),                [blimit]"r"(blimit),
       [srct]"r"(srct),                  [thresh]"r"(thresh),
-      [src_pixel_step]"r"((mips_reg)src_pixel_step),
-      [ff_ph_003f]"f"(ff_ph_003f),      [ff_ph_0900]"f"(ff_ph_0900),
-      [ff_pb_03]"f"(ff_pb_03),          [ff_pb_04]"f"(ff_pb_04),
-      [ff_pb_80]"f"(ff_pb_80),          [ff_pb_fe]"f"(ff_pb_fe)
+      [src_pixel_step]"r"((mips_reg)src_pixel_step)
     : "memory"
   );
+  /* clang-format on */
 }
 
+/* clang-format off */
 #define VP8_SIMPLE_HPSRAB                                               \
   "psllh      %[ftmp0],   %[ftmp5],           %[ftmp8]            \n\t" \
   "psrah      %[ftmp0],   %[ftmp0],           %[ftmp9]            \n\t" \
@@ -966,23 +1015,38 @@ void vp8_mbloop_filter_vertical_edge_mmi(
   "psrah      %[ftmp1],   %[ftmp5],           %[ftmp10]           \n\t" \
   "psllh      %[ftmp1],   %[ftmp1],           %[ftmp8]            \n\t" \
   "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+/* clang-format on */
 
 void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr,
                                                 int src_pixel_step,
                                                 const unsigned char *blimit) {
-  uint32_t tmp[1], count = 2;
+  uint64_t tmp[1], count = 2;
   mips_reg addr[2];
   double ftmp[12];
+  double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01;
 
+  /* clang-format off */
   __asm__ volatile (
-    "li         %[tmp0],    0x08                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
-    "li         %[tmp0],    0x03                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
-    "li         %[tmp0],    0x0b                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp10]                               \n\t"
-    "li         %[tmp0],    0x01                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp11]                               \n\t"
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                               \n\t"
+    "dli        %[tmp0],    0x08                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp8]                                \n\t"
+    "dli        %[tmp0],    0x03                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                               \n\t"
+    "dli        %[tmp0],    0xfefefefefefefefe                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_fe]                             \n\t"
+    "dli        %[tmp0],    0x8080808080808080                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_80]                             \n\t"
+    "dli        %[tmp0],    0x0404040404040404                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_04]                             \n\t"
+    "dli        %[tmp0],    0x0101010101010101                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_01]                             \n\t"
 
     "1:                                                             \n\t"
     "gsldlc1    %[ftmp3],   0x07(%[blimit])                         \n\t"
@@ -996,7 +1060,7 @@ void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr,
     "gsldlc1    %[ftmp7],   0x07(%[addr0])                          \n\t"
     "gsldrc1    %[ftmp7],   0x00(%[addr0])                          \n\t"
     "pasubub    %[ftmp1],   %[ftmp7],           %[ftmp2]            \n\t"
-    "and        %[ftmp1],   %[ftmp1],           %[ff_pb_fe]         \n\t"
+    "pand       %[ftmp1],   %[ftmp1],           %[ff_pb_fe]         \n\t"
     "psrlh      %[ftmp1],   %[ftmp1],           %[ftmp11]           \n\t"
 
     MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
@@ -1020,7 +1084,7 @@ void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr,
     "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp0]            \n\t"
     "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp0]            \n\t"
     "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp0]            \n\t"
-    "and        %[ftmp5],   %[ftmp5],           %[ftmp2]            \n\t"
+    "pand       %[ftmp5],   %[ftmp5],           %[ftmp2]            \n\t"
 
     "paddsb     %[ftmp5],   %[ftmp5],           %[ff_pb_04]         \n\t"
     VP8_SIMPLE_HPSRAB
@@ -1048,30 +1112,43 @@ void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr,
       [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
       [tmp0]"=&r"(tmp[0]),
       [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
-      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count)
+      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count),
+      [ff_pb_fe]"=&f"(ff_pb_fe),        [ff_pb_80]"=&f"(ff_pb_80),
+      [ff_pb_04]"=&f"(ff_pb_04),        [ff_pb_01]"=&f"(ff_pb_01)
     : [blimit]"r"(blimit),
       [src_pixel_step]"r"((mips_reg)src_pixel_step),
-      [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
-      [ff_pb_fe]"f"(ff_pb_fe),          [ff_pb_80]"f"(ff_pb_80),
-      [ff_pb_04]"f"(ff_pb_04),          [ff_pb_01]"f"(ff_pb_01)
+      [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1))
     : "memory"
   );
+  /* clang-format on */
 }
 
 void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr,
                                               int src_pixel_step,
                                               const unsigned char *blimit) {
-  uint32_t tmp[1], count = 2;
+  uint64_t tmp[1], count = 2;
   mips_reg addr[2];
-  DECLARE_ALIGNED(8, const uint64_t, srct[1]);
-  double ftmp[12];
+  DECLARE_ALIGNED(8, const uint64_t, srct[2]);
+  double ftmp[12], ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01;
 
+  /* clang-format off */
   __asm__ volatile (
-    "li         %[tmp0],    0x08                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
-    "li         %[tmp0],    0x20                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp10]                               \n\t"
-
+    "dli        %[tmp0],    0x08                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp8]                                \n\t"
+    "dli        %[tmp0],    0x20                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
+    "dli        %[tmp0],    0x08                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp8]                                \n\t"
+    "dli        %[tmp0],    0x20                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
+    "dli        %[tmp0],    0xfefefefefefefefe                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_fe]                             \n\t"
+    "dli        %[tmp0],    0x8080808080808080                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_80]                             \n\t"
+    "dli        %[tmp0],    0x0404040404040404                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_04]                             \n\t"
+    "dli        %[tmp0],    0x0101010101010101                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_01]                             \n\t"
     MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x4])
     MMI_SUBU(%[src_ptr], %[src_ptr], 0x02)
 
@@ -1118,8 +1195,8 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr,
     "punpckhwd  %[ftmp3],   %[ftmp2],           %[ftmp5]            \n\t"
     "punpcklwd  %[ftmp2],   %[ftmp2],           %[ftmp5]            \n\t"
 
-    "li         %[tmp0],    0x01                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
     "pasubub    %[ftmp6],   %[ftmp3],           %[ftmp0]            \n\t"
     "pand       %[ftmp6],   %[ftmp6],           %[ff_pb_fe]         \n\t"
     "psrlh      %[ftmp6],   %[ftmp6],           %[ftmp9]            \n\t"
@@ -1149,14 +1226,14 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr,
     "pand       %[ftmp5],   %[ftmp5],           %[ftmp0]            \n\t"
     "paddsb     %[ftmp5],   %[ftmp5],           %[ff_pb_04]         \n\t"
 
-    "li         %[tmp0],    0x03                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+    "dli        %[tmp0],    0x03                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
     "psllh      %[ftmp0],   %[ftmp5],           %[ftmp8]            \n\t"
     "psrah      %[ftmp0],   %[ftmp0],           %[ftmp9]            \n\t"
     "psrlh      %[ftmp0],   %[ftmp0],           %[ftmp8]            \n\t"
 
-    "li         %[tmp0],    0x0b                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
     "psrah      %[ftmp7],   %[ftmp5],           %[ftmp9]            \n\t"
     "psllh      %[ftmp7],   %[ftmp7],           %[ftmp8]            \n\t"
     "por        %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
@@ -1164,14 +1241,14 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr,
     "pxor       %[ftmp3],   %[ftmp3],           %[ff_pb_80]         \n\t"
     "psubsb     %[ftmp5],   %[ftmp5],           %[ff_pb_01]         \n\t"
 
-    "li         %[tmp0],    0x03                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+    "dli        %[tmp0],    0x03                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
     "psllh      %[ftmp0],   %[ftmp5],           %[ftmp8]            \n\t"
     "psrah      %[ftmp0],   %[ftmp0],           %[ftmp9]            \n\t"
     "psrlh      %[ftmp0],   %[ftmp0],           %[ftmp8]            \n\t"
 
-    "li         %[tmp0],    0x0b                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
     "psrah      %[ftmp5],   %[ftmp5],           %[ftmp9]            \n\t"
     "psllh      %[ftmp5],   %[ftmp5],           %[ftmp8]            \n\t"
     "por        %[ftmp0],   %[ftmp0],           %[ftmp5]            \n\t"
@@ -1235,16 +1312,17 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr,
       [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
       [tmp0]"=&r"(tmp[0]),
       [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
-      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count)
+      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count),
+      [ff_pb_fe]"=&f"(ff_pb_fe),        [ff_pb_80]"=&f"(ff_pb_80),
+      [ff_pb_04]"=&f"(ff_pb_04),        [ff_pb_01]"=&f"(ff_pb_01)
     : [blimit]"r"(blimit),              [srct]"r"(srct),
       [src_pixel_step]"r"((mips_reg)src_pixel_step),
       [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
       [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)),
-      [src_pixel_step_x8]"r"((mips_reg)(src_pixel_step<<3)),
-      [ff_pb_fe]"f"(ff_pb_fe),          [ff_pb_80]"f"(ff_pb_80),
-      [ff_pb_04]"f"(ff_pb_04),          [ff_pb_01]"f"(ff_pb_01)
+      [src_pixel_step_x8]"r"((mips_reg)(src_pixel_step<<3))
     : "memory"
   );
+  /* clang-format on */
 }
 
 /* Horizontal MB filtering */
index dbe35d09f3f301527c211e5090f25ff040106b2d..b85f73fdff8ff39678f9d340199d9bb6a3120fa2 100644 (file)
@@ -70,9 +70,8 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
                                              unsigned int output_height,
                                              unsigned int output_width,
                                              const int16_t *vp8_filter) {
-  uint32_t tmp[1];
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
-
+  uint64_t tmp[1];
+  double ff_ph_40;
 #if _MIPS_SIM == _ABIO32
   register double fzero asm("$f0");
   register double ftmp0 asm("$f2");
@@ -103,7 +102,10 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
   register double ftmp11 asm("$f12");
 #endif  // _MIPS_SIM == _ABIO32
 
+  /* clang-format off */
   __asm__ volatile (
+    "dli        %[tmp0],        0x0040004000400040                    \n\t"
+    "dmtc1      %[tmp0],        %[ff_ph_40]                           \n\t"
     "ldc1       %[ftmp0],       0x00(%[vp8_filter])                   \n\t"
     "ldc1       %[ftmp1],       0x10(%[vp8_filter])                   \n\t"
     "ldc1       %[ftmp2],       0x20(%[vp8_filter])                   \n\t"
@@ -111,10 +113,10 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
     "ldc1       %[ftmp4],       0x40(%[vp8_filter])                   \n\t"
     "ldc1       %[ftmp5],       0x50(%[vp8_filter])                   \n\t"
     "pxor       %[fzero],       %[fzero],           %[fzero]          \n\t"
-    "li         %[tmp0],        0x07                                  \n\t"
-    "mtc1       %[tmp0],        %[ftmp7]                              \n\t"
-    "li         %[tmp0],        0x08                                  \n\t"
-    "mtc1       %[tmp0],        %[ftmp11]                             \n\t"
+    "dli        %[tmp0],        0x07                                  \n\t"
+    "dmtc1      %[tmp0],        %[ftmp7]                              \n\t"
+    "dli        %[tmp0],        0x08                                  \n\t"
+    "dmtc1      %[tmp0],        %[ftmp11]                             \n\t"
 
     "1:                                                               \n\t"
     "gsldlc1    %[ftmp9],       0x05(%[src_ptr])                      \n\t"
@@ -166,21 +168,22 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
       [ftmp9]"=&f"(ftmp9),              [ftmp10]"=&f"(ftmp10),
       [ftmp11]"=&f"(ftmp11),            [tmp0]"=&r"(tmp[0]),
       [output_ptr]"+&r"(output_ptr),    [output_height]"+&r"(output_height),
-      [src_ptr]"+&r"(src_ptr)
+      [src_ptr]"+&r"(src_ptr),          [ff_ph_40]"=&f"(ff_ph_40)
     : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line),
-      [vp8_filter]"r"(vp8_filter),      [output_width]"r"(output_width),
-      [ff_ph_40]"f"(ff_ph_40)
+      [vp8_filter]"r"(vp8_filter),      [output_width]"r"(output_width)
     : "memory"
     );
+  /* clang-format on */
 }
 
 /* Horizontal filter:  pixel_step is always W */
 static INLINE void vp8_filter_block1dc_v6_mmi(
     uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height,
     int output_pitch, unsigned int pixels_per_line, const int16_t *vp8_filter) {
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
-  uint32_t tmp[1];
+  double ff_ph_40;
+  uint64_t tmp[1];
   mips_reg addr[1];
+
 #if _MIPS_SIM == _ABIO32
   register double fzero asm("$f0");
   register double ftmp0 asm("$f2");
@@ -215,7 +218,10 @@ static INLINE void vp8_filter_block1dc_v6_mmi(
   register double ftmp13 asm("$f14");
 #endif  // _MIPS_SIM == _ABIO32
 
+  /* clang-format off */
   __asm__ volatile (
+    "dli        %[tmp0],      0x0040004000400040                      \n\t"
+    "dmtc1      %[tmp0],      %[ff_ph_40]                             \n\t"
     "ldc1       %[ftmp0],     0x00(%[vp8_filter])                     \n\t"
     "ldc1       %[ftmp1],     0x10(%[vp8_filter])                     \n\t"
     "ldc1       %[ftmp2],     0x20(%[vp8_filter])                     \n\t"
@@ -223,8 +229,8 @@ static INLINE void vp8_filter_block1dc_v6_mmi(
     "ldc1       %[ftmp4],     0x40(%[vp8_filter])                     \n\t"
     "ldc1       %[ftmp5],     0x50(%[vp8_filter])                     \n\t"
     "pxor       %[fzero],     %[fzero],        %[fzero]               \n\t"
-    "li         %[tmp0],      0x07                                    \n\t"
-    "mtc1       %[tmp0],      %[ftmp13]                               \n\t"
+    "dli        %[tmp0],      0x07                                    \n\t"
+    "dmtc1      %[tmp0],      %[ftmp13]                               \n\t"
 
     /* In order to make full use of memory load delay slot,
      * Operation of memory loading and calculating has been rearranged.
@@ -285,15 +291,16 @@ static INLINE void vp8_filter_block1dc_v6_mmi(
       [ftmp11]"=&f"(ftmp11),            [ftmp12]"=&f"(ftmp12),
       [ftmp13]"=&f"(ftmp13),            [tmp0]"=&r"(tmp[0]),
       [addr0]"=&r"(addr[0]),            [src_ptr]"+&r"(src_ptr),
-      [output_ptr]"+&r"(output_ptr),    [output_height]"+&r"(output_height)
+      [output_ptr]"+&r"(output_ptr),    [output_height]"+&r"(output_height),
+      [ff_ph_40]"=&f"(ff_ph_40)
     : [pixels_per_line]"r"((mips_reg)pixels_per_line),
       [pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)),
       [pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)),
       [vp8_filter]"r"(vp8_filter),
-      [output_pitch]"r"((mips_reg)output_pitch),
-      [ff_ph_40]"f"(ff_ph_40)
+      [output_pitch]"r"((mips_reg)output_pitch)
     : "memory"
     );
+  /* clang-format on */
 }
 
 /* When xoffset == 0, vp8_filter= {0,0,128,0,0,0},
@@ -313,6 +320,7 @@ static INLINE void vp8_filter_block1d_h6_filter0_mmi(
   register double ftmp1 asm("$f2");
 #endif  // _MIPS_SIM == _ABIO32
 
+  /* clang-format off */
   __asm__ volatile (
     "pxor       %[fzero],       %[fzero],           %[fzero]          \n\t"
 
@@ -335,6 +343,7 @@ static INLINE void vp8_filter_block1d_h6_filter0_mmi(
       [output_width]"r"(output_width)
     : "memory"
     );
+  /* clang-format on */
 }
 
 static INLINE void vp8_filter_block1dc_v6_filter0_mmi(
@@ -350,6 +359,7 @@ static INLINE void vp8_filter_block1dc_v6_filter0_mmi(
   register double ftmp1 asm("$f2");
 #endif  // _MIPS_SIM == _ABIO32
 
+  /* clang-format off */
   __asm__ volatile (
     "pxor       %[fzero],     %[fzero],        %[fzero]               \n\t"
 
@@ -371,6 +381,7 @@ static INLINE void vp8_filter_block1dc_v6_filter0_mmi(
       [output_pitch]"r"((mips_reg)output_pitch)
     : "memory"
     );
+  /* clang-format on */
 }
 
 #define sixtapNxM(n, m)                                                        \
index b5ecf0f1ca77ed98495879f391f2e1741768d189..0fd25fcda507212a44a6f17a82a29987a1f06d67 100644 (file)
@@ -46,6 +46,7 @@
 void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
   uint64_t tmp[1];
   int16_t *ip = input;
+  double ff_ph_op1, ff_ph_op3;
 
 #if _MIPS_SIM == _ABIO32
   register double ftmp0 asm("$f0");
@@ -83,13 +84,16 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
   DECLARE_ALIGNED(8, const uint64_t, ff_pw_51000) = { 0x0000c7380000c738ULL };
   DECLARE_ALIGNED(8, const uint64_t, ff_pw_14500) = { 0x000038a4000038a4ULL };
   DECLARE_ALIGNED(8, const uint64_t, ff_pw_7500) = { 0x00001d4c00001d4cULL };
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_op1) = { 0x14e808a914e808a9ULL };
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_op3) = { 0xeb1808a9eb1808a9ULL };
   DECLARE_ALIGNED(8, const uint64_t, ff_pw_5352) = { 0x000014e8000014e8ULL };
   DECLARE_ALIGNED(8, const uint64_t, ff_pw_2217) = { 0x000008a9000008a9ULL };
   DECLARE_ALIGNED(8, const uint64_t, ff_ph_8) = { 0x0008000800080008ULL };
 
+  /* clang-format off */
   __asm__ volatile (
+    "dli        %[tmp0],    0x14e808a914e808a9              \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_op1]                    \n\t"
+    "dli        %[tmp0],    0xeb1808a9eb1808a9              \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_op3]                    \n\t"
     "pxor       %[ftmp0],   %[ftmp0],      %[ftmp0]         \n\t"
     "gsldlc1    %[ftmp1],   0x07(%[ip])                     \n\t"
     "gsldrc1    %[ftmp1],   0x00(%[ip])                     \n\t"
@@ -129,7 +133,7 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
 
     // op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12
     MMI_LI(%[tmp0], 0x0c)
-    "mtc1       %[tmp0],    %[ftmp11]                       \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                       \n\t"
     "ldc1       %[ftmp12],  %[ff_pw_14500]                  \n\t"
     "punpcklhw  %[ftmp9],   %[ftmp7],       %[ftmp8]        \n\t"
     "pmaddhw    %[ftmp5],   %[ftmp9],       %[ff_ph_op1]    \n\t"
@@ -169,7 +173,7 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
     "paddh      %[ftmp1],   %[ftmp1],       %[ftmp9]        \n\t"
     "paddh      %[ftmp2],   %[ftmp2],       %[ftmp9]        \n\t"
     MMI_LI(%[tmp0], 0x04)
-    "mtc1       %[tmp0],    %[ftmp9]                        \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                        \n\t"
     "psrah      %[ftmp1],   %[ftmp1],       %[ftmp9]        \n\t"
     "psrah      %[ftmp2],   %[ftmp2],       %[ftmp9]        \n\t"
 
@@ -211,15 +215,16 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
       [ftmp3] "=&f"(ftmp3), [ftmp4] "=&f"(ftmp4), [ftmp5] "=&f"(ftmp5),
       [ftmp6] "=&f"(ftmp6), [ftmp7] "=&f"(ftmp7), [ftmp8] "=&f"(ftmp8),
       [ftmp9] "=&f"(ftmp9), [ftmp10] "=&f"(ftmp10), [ftmp11] "=&f"(ftmp11),
-      [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip)
+      [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip),
+      [ff_ph_op1] "=&f"(ff_ph_op1), [ff_ph_op3] "=&f"(ff_ph_op3)
     : [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07),
-      [ff_ph_op1] "f"(ff_ph_op1), [ff_ph_op3] "f"(ff_ph_op3),
       [ff_pw_14500] "m"(ff_pw_14500), [ff_pw_7500] "m"(ff_pw_7500),
       [ff_pw_12000] "m"(ff_pw_12000), [ff_pw_51000] "m"(ff_pw_51000),
       [ff_pw_5352]"m"(ff_pw_5352), [ff_pw_2217]"m"(ff_pw_2217),
       [ff_ph_8]"m"(ff_ph_8), [pitch]"r"(pitch), [output] "r"(output)
     : "memory"
   );
+  /* clang-format on */
 }
 
 void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) {
@@ -228,17 +233,22 @@ void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) {
 }
 
 void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) {
-  double ftmp[13];
-  uint32_t tmp[1];
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
-  DECLARE_ALIGNED(8, const uint64_t, ff_pw_01) = { 0x0000000100000001ULL };
-  DECLARE_ALIGNED(8, const uint64_t, ff_pw_03) = { 0x0000000300000003ULL };
-  DECLARE_ALIGNED(8, const uint64_t, ff_pw_mask) = { 0x0001000000010000ULL };
+  double ftmp[13], ff_ph_01, ff_pw_01, ff_pw_03, ff_pw_mask;
+  uint64_t tmp[1];
 
+  /* clang-format off */
   __asm__ volatile (
+    "dli        %[tmp0],    0x0001000100010001                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_01]                         \n\t"
+    "dli        %[tmp0],    0x0000000100000001                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_pw_01]                         \n\t"
+    "dli        %[tmp0],    0x0000000300000003                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_pw_03]                         \n\t"
+    "dli        %[tmp0],    0x0001000000010000                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_pw_mask]                       \n\t"
     MMI_LI(%[tmp0], 0x02)
     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                           \n\t"
 
     "gsldlc1    %[ftmp1],   0x07(%[ip])                         \n\t"
     "gsldrc1    %[ftmp1],   0x00(%[ip])                         \n\t"
@@ -337,7 +347,7 @@ void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) {
     "psubw      %[ftmp4],   %[ftmp9],       %[ftmp10]           \n\t"
 
     MMI_LI(%[tmp0], 0x03)
-    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                           \n\t"
 
     "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp1]            \n\t"
     "pand       %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
@@ -393,7 +403,7 @@ void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) {
     "packsswh   %[ftmp4],   %[ftmp4],       %[ftmp8]            \n\t"
 
     MMI_LI(%[tmp0], 0x72)
-    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                           \n\t"
     "pshufh     %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"
     "pshufh     %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"
     "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"
@@ -413,13 +423,12 @@ void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) {
       [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
       [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
       [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
-      [ftmp12]"=&f"(ftmp[12]),
-      [tmp0]"=&r"(tmp[0]),
-      [ip]"+&r"(input)
-    : [op]"r"(output),
-      [ff_pw_01]"f"(ff_pw_01),          [pitch]"r"((mips_reg)pitch),
-      [ff_pw_03]"f"(ff_pw_03),          [ff_pw_mask]"f"(ff_pw_mask),
-      [ff_ph_01]"f"(ff_ph_01)
+      [ftmp12]"=&f"(ftmp[12]),          [ff_pw_mask]"=&f"(ff_pw_mask),
+      [tmp0]"=&r"(tmp[0]),              [ff_pw_01]"=&f"(ff_pw_01),
+      [ip]"+&r"(input),                 [ff_pw_03]"=&f"(ff_pw_03),
+      [ff_ph_01]"=&f"(ff_ph_01)
+    : [op]"r"(output),                  [pitch]"r"((mips_reg)pitch)
     : "memory"
   );
+  /* clang-format on */
 }
index 69a9e5e01852c1a692c9e81079cb3c2ce5b209b2..1986444aa39635e4be7dc43de5b1062ee3fdcf7b 100644 (file)
@@ -42,16 +42,17 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) {
 
   double ftmp[13];
   uint64_t tmp[1];
-  DECLARE_ALIGNED(8, const uint64_t, ones) = { 0xffffffffffffffffULL };
-  int eob = 0;
+  int64_t eob = 0;
+  double ones;
 
   __asm__ volatile(
       // loop 0 ~ 7
       "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
+      "pcmpeqh    %[ones],    %[ones],        %[ones]         \n\t"
       "gsldlc1    %[ftmp1],   0x07(%[coeff_ptr])              \n\t"
       "gsldrc1    %[ftmp1],   0x00(%[coeff_ptr])              \n\t"
-      "li         %[tmp0],    0x0f                            \n\t"
-      "mtc1       %[tmp0],    %[ftmp9]                        \n\t"
+      "dli        %[tmp0],    0x0f                            \n\t"
+      "dmtc1      %[tmp0],    %[ftmp9]                        \n\t"
       "gsldlc1    %[ftmp2],   0x0f(%[coeff_ptr])              \n\t"
       "gsldrc1    %[ftmp2],   0x08(%[coeff_ptr])              \n\t"
 
@@ -165,18 +166,18 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) {
       "gssdlc1    %[ftmp6],   0x1f(%[dqcoeff_ptr])            \n\t"
       "gssdrc1    %[ftmp6],   0x18(%[dqcoeff_ptr])            \n\t"
 
-      "li         %[tmp0],    0x10                            \n\t"
-      "mtc1       %[tmp0],    %[ftmp9]                        \n\t"
+      "dli        %[tmp0],    0x10                            \n\t"
+      "dmtc1      %[tmp0],    %[ftmp9]                        \n\t"
 
       "pmaxsh     %[ftmp10],  %[ftmp10],       %[ftmp11]      \n\t"
       "psrlw      %[ftmp11],  %[ftmp10],       %[ftmp9]       \n\t"
       "pmaxsh     %[ftmp10],  %[ftmp10],       %[ftmp11]      \n\t"
-      "li         %[tmp0],    0xaa                            \n\t"
-      "mtc1       %[tmp0],    %[ftmp9]                        \n\t"
+      "dli        %[tmp0],    0xaa                            \n\t"
+      "dmtc1      %[tmp0],    %[ftmp9]                        \n\t"
       "pshufh     %[ftmp11],  %[ftmp10],       %[ftmp9]       \n\t"
       "pmaxsh     %[ftmp10],  %[ftmp10],       %[ftmp11]      \n\t"
-      "li         %[tmp0],    0xffff                          \n\t"
-      "mtc1       %[tmp0],    %[ftmp9]                        \n\t"
+      "dli        %[tmp0],    0xffff                          \n\t"
+      "dmtc1      %[tmp0],    %[ftmp9]                        \n\t"
       "pand       %[ftmp10],  %[ftmp10],       %[ftmp9]       \n\t"
       "gssdlc1    %[ftmp10],  0x07(%[eob])                    \n\t"
       "gssdrc1    %[ftmp10],  0x00(%[eob])                    \n\t"
@@ -184,15 +185,15 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) {
         [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
         [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
         [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
-        [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
+        [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
+        [tmp0] "=&r"(tmp[0]), [ones] "=&f"(ones)
       : [coeff_ptr] "r"((mips_reg)coeff_ptr),
         [qcoeff_ptr] "r"((mips_reg)qcoeff_ptr),
         [dequant_ptr] "r"((mips_reg)dequant_ptr),
         [round_ptr] "r"((mips_reg)round_ptr),
         [quant_ptr] "r"((mips_reg)quant_ptr),
         [dqcoeff_ptr] "r"((mips_reg)dqcoeff_ptr),
-        [inv_zig_zag] "r"((mips_reg)inv_zig_zag), [eob] "r"((mips_reg)&eob),
-        [ones] "f"(ones)
+        [inv_zig_zag] "r"((mips_reg)inv_zig_zag), [eob] "r"((mips_reg)&eob)
       : "memory");
 
   *d->eob = eob;
index 5dee3164bc16eca36c30a4e67411386193028276..eaca4773f2b86eba254005429fc59682811cf325 100644 (file)
@@ -364,6 +364,7 @@ static inline unsigned int vpx_sad64x(const uint8_t *src, int src_stride,
   double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
   mips_reg l_counter = counter;
 
+  /* clang-format off */
   __asm__ volatile (
     "pxor       %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
     "1:                                                         \n\t"
@@ -383,6 +384,7 @@ static inline unsigned int vpx_sad64x(const uint8_t *src, int src_stride,
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
@@ -405,7 +407,9 @@ static inline unsigned int vpx_sad_avg64x(const uint8_t *src, int src_stride,
   unsigned int sad;
   double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
   mips_reg l_counter = counter;
+  mips_reg l_second_pred = (mips_reg)second_pred;
 
+  /* clang-format off */
   __asm__ volatile (
     "pxor       %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
     "1:                                                         \n\t"
@@ -424,11 +428,12 @@ static inline unsigned int vpx_sad_avg64x(const uint8_t *src, int src_stride,
     : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
       [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
       [src]"+&r"(src), [ref]"+&r"(ref),
-      [second_pred]"+&r"((mips_reg)second_pred),
+      [second_pred]"+&r"(l_second_pred),
       [sad]"=&r"(sad)
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
@@ -450,6 +455,7 @@ static inline unsigned int vpx_sad32x(const uint8_t *src, int src_stride,
   double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
   mips_reg l_counter = counter;
 
+  /* clang-format off */
   __asm__ volatile (
     "pxor       %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
     "1:                                                         \n\t"
@@ -469,6 +475,7 @@ static inline unsigned int vpx_sad32x(const uint8_t *src, int src_stride,
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
@@ -493,7 +500,9 @@ static inline unsigned int vpx_sad_avg32x(const uint8_t *src, int src_stride,
   unsigned int sad;
   double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
   mips_reg l_counter = counter;
+  mips_reg l_second_pred = (mips_reg)second_pred;
 
+  /* clang-format off */
   __asm__ volatile (
     "pxor       %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
     "1:                                                         \n\t"
@@ -512,11 +521,12 @@ static inline unsigned int vpx_sad_avg32x(const uint8_t *src, int src_stride,
     : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
       [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
       [src]"+&r"(src), [ref]"+&r"(ref),
-      [second_pred]"+&r"((mips_reg)second_pred),
+      [second_pred]"+&r"(l_second_pred),
       [sad]"=&r"(sad)
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
@@ -539,6 +549,7 @@ static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride,
   double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
   mips_reg l_counter = counter;
 
+  /* clang-format off */
   __asm__ volatile (
     "pxor       %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
     "1:                                                         \n\t"
@@ -558,6 +569,7 @@ static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride,
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
@@ -586,7 +598,9 @@ static inline unsigned int vpx_sad_avg16x(const uint8_t *src, int src_stride,
   unsigned int sad;
   double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
   mips_reg l_counter = counter;
+  mips_reg l_second_pred = (mips_reg)second_pred;
 
+  /* clang-format off */
   __asm__ volatile (
     "pxor       %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
     "1:                                                         \n\t"
@@ -605,11 +619,12 @@ static inline unsigned int vpx_sad_avg16x(const uint8_t *src, int src_stride,
     : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
       [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
       [src]"+&r"(src), [ref]"+&r"(ref),
-      [second_pred]"+&r"((mips_reg)second_pred),
+      [second_pred]"+&r"(l_second_pred),
       [sad]"=&r"(sad)
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
@@ -632,6 +647,7 @@ static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride,
   double ftmp1, ftmp2, ftmp3;
   mips_reg l_counter = counter;
 
+  /* clang-format off */
   __asm__ volatile (
     "pxor       %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
     "1:                                                         \n\t"
@@ -651,6 +667,7 @@ static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride,
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
@@ -679,7 +696,9 @@ static inline unsigned int vpx_sad_avg8x(const uint8_t *src, int src_stride,
   unsigned int sad;
   double ftmp1, ftmp2, ftmp3;
   mips_reg l_counter = counter;
+  mips_reg l_second_pred = (mips_reg)second_pred;
 
+  /* clang-format off */
   __asm__ volatile (
     "pxor       %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
     "1:                                                         \n\t"
@@ -697,11 +716,12 @@ static inline unsigned int vpx_sad_avg8x(const uint8_t *src, int src_stride,
     "mfc1       %[sad],     %[ftmp3]                            \n\t"
     : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
       [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
-      [second_pred]"+&r"((mips_reg)second_pred),
+      [second_pred]"+&r"(l_second_pred),
       [sad]"=&r"(sad)
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
@@ -724,6 +744,7 @@ static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride,
   double ftmp1, ftmp2, ftmp3;
   mips_reg l_counter = counter;
 
+  /* clang-format off */
   __asm__ volatile (
     "pxor       %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
     "1:                                                         \n\t"
@@ -743,6 +764,7 @@ static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride,
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
@@ -767,7 +789,9 @@ static inline unsigned int vpx_sad_avg4x(const uint8_t *src, int src_stride,
   unsigned int sad;
   double ftmp1, ftmp2, ftmp3;
   mips_reg l_counter = counter;
+  mips_reg l_second_pred = (mips_reg)second_pred;
 
+  /* clang-format off */
   __asm__ volatile (
     "pxor       %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
     "1:                                                         \n\t"
@@ -785,11 +809,12 @@ static inline unsigned int vpx_sad_avg4x(const uint8_t *src, int src_stride,
     "mfc1       %[sad],     %[ftmp3]                            \n\t"
     : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
       [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
-      [second_pred]"+&r"((mips_reg)second_pred),
+      [second_pred]"+&r"(l_second_pred),
       [sad]"=&r"(sad)
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
index 29e52a1a89463d3ce9d5b13c3d15a99980701684..c2adcfa018947eb97fd947d9e27c9913fed03be4 100644 (file)
@@ -414,6 +414,7 @@ static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
@@ -496,6 +497,7 @@ static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse - (((int64_t)sum * sum) / (64 * high));
 }
@@ -519,6 +521,7 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
@@ -577,6 +580,7 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride,
       [sse]"r"(sse)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse - (((int64_t)sum * sum) / 2048);
 }
@@ -590,6 +594,7 @@ static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
@@ -653,6 +658,7 @@ static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse - (((int64_t)sum * sum) / (32 * high));
 }
@@ -676,6 +682,7 @@ static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
@@ -729,6 +736,7 @@ static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse - (((int64_t)sum * sum) / (16 * high));
 }
@@ -753,6 +761,7 @@ static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
@@ -801,6 +810,7 @@ static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse - (((int64_t)sum * sum) / (8 * high));
 }
@@ -825,6 +835,7 @@ static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp10]                           \n\t"
@@ -872,6 +883,7 @@ static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse - (((int64_t)sum * sum) / (4 * high));
 }
@@ -894,6 +906,7 @@ static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
@@ -925,6 +938,7 @@ static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse;
 }
@@ -947,6 +961,7 @@ static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
@@ -978,6 +993,7 @@ static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse;
 }
@@ -1021,22 +1037,39 @@ static inline void var_filter_block2d_bil_16x(const uint8_t *src_ptr,
   uint8_t *temp2_ptr = temp2;
   mips_reg l_counter = counter;
   double ftmp[15];
+  double ff_ph_40, mask;
+  double filter_x0, filter_x1, filter_y0, filter_y1;
   mips_reg tmp[2];
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
-  DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
+  uint64_t x0, x1, y0, y1, all;
 
   const uint8_t *filter_x = bilinear_filters[x_offset];
   const uint8_t *filter_y = bilinear_filters[y_offset];
+  x0 = (uint64_t)filter_x[0];
+  x1 = (uint64_t)filter_x[1];
+  y0 = (uint64_t)filter_y[0];
+  y1 = (uint64_t)filter_y[1];
+  all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
 
+  /* clang-format off */
   __asm__ volatile (
     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    MMI_MTC1(%[all], %[ftmp14])
+    "punpcklbh  %[ftmp14],  %[ftmp14],      %[ftmp0]            \n\t"
+    "pshufh     %[filter_x0], %[ftmp14],    %[ftmp0]            \n\t"
+    MMI_LI(%[tmp0], 0x10)
+    MMI_MTC1(%[tmp0], %[mask])
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_x1], %[ftmp14],    %[ftmp0]            \n\t"
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_y0], %[ftmp14],    %[ftmp0]            \n\t"
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_y1], %[ftmp14],    %[ftmp0]            \n\t"
     MMI_LI(%[tmp0], 0x07)
     MMI_MTC1(%[tmp0], %[ftmp14])
-    "pshufh     %[filter_x0], %[filter_x0], %[ftmp0]            \n\t"
-    "pshufh     %[filter_x1], %[filter_x1], %[ftmp0]            \n\t"
-    "pshufh     %[filter_y0], %[filter_y0], %[ftmp0]            \n\t"
-    "pshufh     %[filter_y1], %[filter_y1], %[ftmp0]            \n\t"
-
+    MMI_LI(%[tmp0], 0x0040004000400040)
+    MMI_MTC1(%[tmp0], %[ff_ph_40])
+    MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
+    MMI_MTC1(%[tmp0], %[mask])
     // fdata3: fdata3[0] ~ fdata3[15]
     VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
 
@@ -1072,15 +1105,13 @@ static inline void var_filter_block2d_bil_16x(const uint8_t *src_ptr,
       [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
       [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
       [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr),
-      [counter]"+&r"(l_counter)
-    : [filter_x0] "f"((uint64_t)filter_x[0]),
-      [filter_x1] "f"((uint64_t)filter_x[1]),
-      [filter_y0] "f"((uint64_t)filter_y[0]),
-      [filter_y1] "f"((uint64_t)filter_y[1]),
-      [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40),
-      [mask] "f"(mask)
+      [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
+      [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
+      [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
+    : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
     : "memory"
   );
+  /* clang-format on */
 }
 
 #define SUBPIX_VAR16XN(H)                                                      \
@@ -1105,19 +1136,38 @@ static inline void var_filter_block2d_bil_8x(const uint8_t *src_ptr,
   mips_reg l_counter = counter;
   double ftmp[15];
   mips_reg tmp[2];
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
-  DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
+  double ff_ph_40, mask;
+  uint64_t x0, x1, y0, y1, all;
+  double filter_x0, filter_x1, filter_y0, filter_y1;
   const uint8_t *filter_x = bilinear_filters[x_offset];
   const uint8_t *filter_y = bilinear_filters[y_offset];
+  x0 = (uint64_t)filter_x[0];
+  x1 = (uint64_t)filter_x[1];
+  y0 = (uint64_t)filter_y[0];
+  y1 = (uint64_t)filter_y[1];
+  all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
 
+  /* clang-format off */
   __asm__ volatile (
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    MMI_MTC1(%[all], %[ftmp14])
+    "punpcklbh  %[ftmp14],  %[ftmp14],      %[ftmp0]            \n\t"
+    "pshufh     %[filter_x0], %[ftmp14],    %[ftmp0]            \n\t"
+    MMI_LI(%[tmp0], 0x10)
+    MMI_MTC1(%[tmp0], %[mask])
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_x1], %[ftmp14],    %[ftmp0]            \n\t"
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_y0], %[ftmp14],    %[ftmp0]            \n\t"
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_y1], %[ftmp14],    %[ftmp0]            \n\t"
     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
     MMI_LI(%[tmp0], 0x07)
     MMI_MTC1(%[tmp0], %[ftmp14])
-    "pshufh     %[filter_x0], %[filter_x0], %[ftmp0]            \n\t"
-    "pshufh     %[filter_x1], %[filter_x1], %[ftmp0]            \n\t"
-    "pshufh     %[filter_y0], %[filter_y0], %[ftmp0]            \n\t"
-    "pshufh     %[filter_y1], %[filter_y1], %[ftmp0]            \n\t"
+    MMI_LI(%[tmp0], 0x0040004000400040)
+    MMI_MTC1(%[tmp0], %[ff_ph_40])
+    MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
+    MMI_MTC1(%[tmp0], %[mask])
 
     // fdata3: fdata3[0] ~ fdata3[7]
     VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
@@ -1154,15 +1204,13 @@ static inline void var_filter_block2d_bil_8x(const uint8_t *src_ptr,
       [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
       [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
       [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr),
-      [counter]"+&r"(l_counter)
-    : [filter_x0] "f"((uint64_t)filter_x[0]),
-      [filter_x1] "f"((uint64_t)filter_x[1]),
-      [filter_y0] "f"((uint64_t)filter_y[0]),
-      [filter_y1] "f"((uint64_t)filter_y[1]),
-      [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40),
-      [mask] "f"(mask)
+      [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
+      [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
+      [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
+    : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
     : "memory"
   );
+  /* clang-format on */
 }
 
 #define SUBPIX_VAR8XN(H)                                                      \
@@ -1188,19 +1236,38 @@ static inline void var_filter_block2d_bil_4x(const uint8_t *src_ptr,
   mips_reg l_counter = counter;
   double ftmp[7];
   mips_reg tmp[2];
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
-  DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
+  double ff_ph_40, mask;
+  uint64_t x0, x1, y0, y1, all;
+  double filter_x0, filter_x1, filter_y0, filter_y1;
   const uint8_t *filter_x = bilinear_filters[x_offset];
   const uint8_t *filter_y = bilinear_filters[y_offset];
+  x0 = (uint64_t)filter_x[0];
+  x1 = (uint64_t)filter_x[1];
+  y0 = (uint64_t)filter_y[0];
+  y1 = (uint64_t)filter_y[1];
+  all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
 
+  /* clang-format off */
   __asm__ volatile (
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    MMI_MTC1(%[all], %[ftmp6])
+    "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+    "pshufh     %[filter_x0], %[ftmp6],     %[ftmp0]            \n\t"
+    MMI_LI(%[tmp0], 0x10)
+    MMI_MTC1(%[tmp0], %[mask])
+    "ssrld      %[ftmp6],   %[ftmp6],       %[mask]             \n\t"
+    "pshufh     %[filter_x1], %[ftmp6],     %[ftmp0]            \n\t"
+    "ssrld      %[ftmp6],   %[ftmp6],       %[mask]             \n\t"
+    "pshufh     %[filter_y0], %[ftmp6],     %[ftmp0]            \n\t"
+    "ssrld      %[ftmp6],   %[ftmp6],       %[mask]             \n\t"
+    "pshufh     %[filter_y1], %[ftmp6],     %[ftmp0]            \n\t"
     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
     MMI_LI(%[tmp0], 0x07)
     MMI_MTC1(%[tmp0], %[ftmp6])
-    "pshufh     %[filter_x0], %[filter_x0], %[ftmp0]            \n\t"
-    "pshufh     %[filter_x1], %[filter_x1], %[ftmp0]            \n\t"
-    "pshufh     %[filter_y0], %[filter_y0], %[ftmp0]            \n\t"
-    "pshufh     %[filter_y1], %[filter_y1], %[ftmp0]            \n\t"
+    MMI_LI(%[tmp0], 0x0040004000400040)
+    MMI_MTC1(%[tmp0], %[ff_ph_40])
+    MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
+    MMI_MTC1(%[tmp0], %[mask])
     // fdata3: fdata3[0] ~ fdata3[3]
     VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
 
@@ -1232,15 +1299,14 @@ static inline void var_filter_block2d_bil_4x(const uint8_t *src_ptr,
     : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
       [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
       [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr),
-      [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter)
-    : [filter_x0] "f"((uint64_t)filter_x[0]),
-      [filter_x1] "f"((uint64_t)filter_x[1]),
-      [filter_y0] "f"((uint64_t)filter_y[0]),
-      [filter_y1] "f"((uint64_t)filter_y[1]),
-      [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40),
-      [mask] "f"(mask)
+      [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter),
+      [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
+      [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
+      [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
+    : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
     : "memory"
   );
+  /* clang-format on */
 }
 
 #define SUBPIX_VAR4XN(H)                                                      \