]> granicus.if.org Git - libvpx/commitdiff
vp8,vpx_dsp:[loongson] fix bugs reported by clang
authorjinbo <jinbo-hf@loongson.cn>
Wed, 1 Jul 2020 00:56:25 +0000 (08:56 +0800)
committerjinbo <jinbo-hf@loongson.cn>
Tue, 7 Jul 2020 01:25:58 +0000 (09:25 +0800)
1. Adjust variable type to match clang compiler.
Clang is more strict on the type of asm operands, float or double
type variable should use constraint 'f', integer variable should
use constraint 'r'.

2. Fix the problem of using an r-value in output operands.
clang reports the error: 'invalid use of a cast in a inline asm context
requiring an l-value: remove the cast or build with -fheinous-gnu-extensions'.

Change-Id: Iae9e08f55f249059066c391534013e320812463e

vp8/common/mips/mmi/idctllm_mmi.c
vp8/common/mips/mmi/loopfilter_filters_mmi.c
vp8/common/mips/mmi/sixtap_filter_mmi.c
vp8/encoder/mips/mmi/dct_mmi.c
vp8/encoder/mips/mmi/vp8_quantize_mmi.c
vpx_dsp/mips/sad_mmi.c
vpx_dsp/mips/variance_mmi.c

index 4fad1d347f1e28bca7d4c6d3fbe570bd765d6b16..a35689dd30a3552234c25ea092fb4f07be6df8f1 100644 (file)
@@ -41,14 +41,18 @@ void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr,
                               int pred_stride, unsigned char *dst_ptr,
                               int dst_stride) {
   double ftmp[12];
-  uint32_t tmp[0];
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_04) = { 0x0004000400040004ULL };
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_4e7b) = { 0x4e7b4e7b4e7b4e7bULL };
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_22a3) = { 0x22a322a322a322a3ULL };
+  uint64_t tmp[1];
+  double ff_ph_04, ff_ph_4e7b, ff_ph_22a3;
 
   __asm__ volatile (
+    "dli        %[tmp0],    0x0004000400040004                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_04]                         \n\t"
+    "dli        %[tmp0],    0x4e7b4e7b4e7b4e7b                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_4e7b]                       \n\t"
+    "dli        %[tmp0],    0x22a322a322a322a3                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_22a3]                       \n\t"
     MMI_LI(%[tmp0], 0x02)
-    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                           \n\t"
     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
 
     "gsldlc1    %[ftmp1],   0x07(%[ip])                         \n\t"
@@ -186,9 +190,10 @@ void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr,
       [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]),
       [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]),
       [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]),
-      [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr)
-    : [ip]"r"(input), [ff_ph_22a3]"f"(ff_ph_22a3),
-      [ff_ph_4e7b]"f"(ff_ph_4e7b), [ff_ph_04]"f"(ff_ph_04),
+      [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr),
+      [ff_ph_4e7b]"=&f"(ff_ph_4e7b), [ff_ph_04]"=&f"(ff_ph_04),
+      [ff_ph_22a3]"=&f"(ff_ph_22a3)
+    : [ip]"r"(input),
       [pred_stride]"r"((mips_reg)pred_stride),
       [dst_stride]"r"((mips_reg)dst_stride)
     : "memory"
@@ -198,12 +203,13 @@ void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr,
 void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr,
                               int pred_stride, unsigned char *dst_ptr,
                               int dst_stride) {
-  int a1 = ((input_dc + 4) >> 3);
-  double ftmp[5];
+  int a0 = ((input_dc + 4) >> 3);
+  double a1, ftmp[5];
   int low32;
 
   __asm__ volatile (
     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
+    "dmtc1      %[a0],      %[a1]                           \n\t"
     "pshufh     %[a1],      %[a1],          %[ftmp0]        \n\t"
     "ulw        %[low32],   0x00(%[pred_ptr])               \n\t"
     "mtc1       %[low32],   %[ftmp1]                        \n\t"
@@ -244,9 +250,9 @@ void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr,
     "gsswrc1    %[ftmp1],   0x00(%[dst_ptr])                \n\t"
     : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]),
       [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [low32]"=&r"(low32),
-      [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr)
+      [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr), [a1]"=&f"(a1)
     : [dst_stride]"r"((mips_reg)dst_stride),
-      [pred_stride]"r"((mips_reg)pred_stride), [a1]"f"(a1)
+      [pred_stride]"r"((mips_reg)pred_stride), [a0]"r"(a0)
     : "memory"
   );
 }
@@ -254,14 +260,15 @@ void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr,
 void vp8_short_inv_walsh4x4_mmi(int16_t *input, int16_t *mb_dqcoeff) {
   int i;
   int16_t output[16];
-  double ftmp[12];
-  uint32_t tmp[1];
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_03) = { 0x0003000300030003ULL };
+  double ff_ph_03, ftmp[12];
+  uint64_t tmp[1];
 
   __asm__ volatile (
+    "dli        %[tmp0],    0x0003000300030003                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_03]                         \n\t"
     MMI_LI(%[tmp0], 0x03)
     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                           \n\t"
     "gsldlc1    %[ftmp1],   0x07(%[ip])                         \n\t"
     "gsldrc1    %[ftmp1],   0x00(%[ip])                         \n\t"
     "gsldlc1    %[ftmp2],   0x0f(%[ip])                         \n\t"
@@ -317,8 +324,8 @@ void vp8_short_inv_walsh4x4_mmi(int16_t *input, int16_t *mb_dqcoeff) {
       [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
       [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]),
       [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]),
-      [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0])
-    : [ip]"r"(input), [op]"r"(output), [ff_ph_03]"f"(ff_ph_03)
+      [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), [ff_ph_03]"=&f"(ff_ph_03)
+    : [ip]"r"(input), [op]"r"(output)
     : "memory"
   );
 
index fc1240cc27e72a02d6eafee9113e6e66a4f2a5d1..a07a7e3b41d3362fbd4f4d06087f3aaeee4d3fb0 100644 (file)
 #include "vp8/common/onyxc_int.h"
 #include "vpx_ports/asmdefs_mmi.h"
 
-DECLARE_ALIGNED(8, static const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
-DECLARE_ALIGNED(8, static const uint64_t,
-                ff_ph_003f) = { 0x003f003f003f003fULL };
-DECLARE_ALIGNED(8, static const uint64_t,
-                ff_ph_0900) = { 0x0900090009000900ULL };
-DECLARE_ALIGNED(8, static const uint64_t,
-                ff_ph_1200) = { 0x1200120012001200ULL };
-DECLARE_ALIGNED(8, static const uint64_t,
-                ff_ph_1b00) = { 0x1b001b001b001b00ULL };
-DECLARE_ALIGNED(8, static const uint64_t, ff_pb_fe) = { 0xfefefefefefefefeULL };
-DECLARE_ALIGNED(8, static const uint64_t, ff_pb_80) = { 0x8080808080808080ULL };
-DECLARE_ALIGNED(8, static const uint64_t, ff_pb_04) = { 0x0404040404040404ULL };
-DECLARE_ALIGNED(8, static const uint64_t, ff_pb_03) = { 0x0303030303030303ULL };
-DECLARE_ALIGNED(8, static const uint64_t, ff_pb_01) = { 0x0101010101010101ULL };
-
 void vp8_loop_filter_horizontal_edge_mmi(
     unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
     const unsigned char *limit, const unsigned char *thresh, int count) {
-  uint32_t tmp[1];
+  uint64_t tmp[1];
   mips_reg addr[2];
   double ftmp[12];
+  double ff_ph_01, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03;
+  /* clang-format off */
   __asm__ volatile (
+    "dli        %[tmp0],    0x0001000100010001                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_01]                             \n\t"
+    "dli        %[tmp0],    0xfefefefefefefefe                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_fe]                             \n\t"
+    "dli        %[tmp0],    0x8080808080808080                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_80]                             \n\t"
+    "dli        %[tmp0],    0x0404040404040404                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_04]                             \n\t"
+    "dli        %[tmp0],    0x0303030303030303                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_03]                             \n\t"
     "1:                                                             \n\t"
     "gsldlc1    %[ftmp10],  0x07(%[limit])                          \n\t"
     "gsldrc1    %[ftmp10],  0x00(%[limit])                          \n\t"
@@ -91,9 +88,9 @@ void vp8_loop_filter_horizontal_edge_mmi(
     "pasubub    %[ftmp1],   %[ftmp5],           %[ftmp6]            \n\t"
     "paddusb    %[ftmp1],   %[ftmp1],           %[ftmp1]            \n\t"
     "pasubub    %[ftmp2],   %[ftmp4],           %[ftmp7]            \n\t"
-    "and        %[ftmp2],   %[ftmp2],           %[ff_pb_fe]         \n\t"
-    "li         %[tmp0],    0x01                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp10]                               \n\t"
+    "pand       %[ftmp2],   %[ftmp2],           %[ff_pb_fe]         \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
     "psrlh      %[ftmp2],   %[ftmp2],           %[ftmp10]           \n\t"
     "paddusb    %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
     "gsldlc1    %[ftmp10],  0x07(%[blimit])                         \n\t"
@@ -134,8 +131,8 @@ void vp8_loop_filter_horizontal_edge_mmi(
     "punpcklbh  %[ftmp0],   %[ftmp0],           %[ftmp8]            \n\t"
     "punpckhbh  %[ftmp11],  %[ftmp11],          %[ftmp8]            \n\t"
 
-    "li         %[tmp0],    0x0b                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp10]                               \n\t"
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
     "psrah      %[ftmp0],   %[ftmp0],           %[ftmp10]           \n\t"
     "psrah      %[ftmp11],  %[ftmp11],          %[ftmp10]           \n\t"
     "packsshb   %[ftmp8],   %[ftmp0],           %[ftmp11]           \n\t"
@@ -149,8 +146,8 @@ void vp8_loop_filter_horizontal_edge_mmi(
     "packsshb   %[ftmp0],   %[ftmp0],           %[ftmp9]            \n\t"
     "paddsh     %[ftmp9],   %[ftmp9],           %[ff_ph_01]         \n\t"
 
-    "li         %[tmp0],    0x01                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp10]                               \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
     "psrah      %[ftmp11],  %[ftmp11],          %[ftmp10]           \n\t"
     "psrah      %[ftmp9],   %[ftmp9],           %[ftmp10]           \n\t"
     "packsshb   %[ftmp11],  %[ftmp11],          %[ftmp9]            \n\t"
@@ -188,17 +185,18 @@ void vp8_loop_filter_horizontal_edge_mmi(
       [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
       [tmp0]"=&r"(tmp[0]),
       [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
-      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count)
+      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count),
+      [ff_ph_01]"=&f"(ff_ph_01),        [ff_pb_fe]"=&f"(ff_pb_fe),
+      [ff_pb_80]"=&f"(ff_pb_80),        [ff_pb_04]"=&f"(ff_pb_04),
+      [ff_pb_03]"=&f"(ff_pb_03)
     : [limit]"r"(limit),                [blimit]"r"(blimit),
       [thresh]"r"(thresh),
       [src_pixel_step]"r"((mips_reg)src_pixel_step),
       [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
-      [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)),
-      [ff_ph_01]"f"(ff_ph_01),          [ff_pb_fe]"f"(ff_pb_fe),
-      [ff_pb_80]"f"(ff_pb_80),          [ff_pb_04]"f"(ff_pb_04),
-      [ff_pb_03]"f"(ff_pb_03)
+      [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2))
     : "memory"
   );
+  /* clang-format on */
 }
 
 void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
@@ -206,11 +204,23 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
                                        const unsigned char *blimit,
                                        const unsigned char *limit,
                                        const unsigned char *thresh, int count) {
-  uint32_t tmp[1];
+  uint64_t tmp[1];
   mips_reg addr[2];
   double ftmp[13];
+  double ff_pb_fe, ff_ph_01, ff_pb_03, ff_pb_04, ff_pb_80;
 
+  /* clang-format off */
   __asm__ volatile (
+    "dli        %[tmp0],    0xfefefefefefefefe                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_fe]                             \n\t"
+    "dli        %[tmp0],    0x0001000100010001                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_01]                             \n\t"
+    "dli        %[tmp0],    0x0303030303030303                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_03]                             \n\t"
+    "dli        %[tmp0],    0x0404040404040404                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_04]                             \n\t"
+    "dli        %[tmp0],    0x8080808080808080                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_80]                             \n\t"
     MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
     MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
     MMI_SUBU(%[src_ptr], %[src_ptr], 0x04)
@@ -315,8 +325,8 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
     /* abs (p1-q1) */
     "pasubub    %[ftmp12],  %[ftmp10],          %[ftmp5]            \n\t"
     "pand       %[ftmp12],  %[ftmp12],          %[ff_pb_fe]         \n\t"
-    "li         %[tmp0],    0x01                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp1]                                \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp1]                                \n\t"
     "psrlh      %[ftmp12],  %[ftmp12],          %[ftmp1]            \n\t"
     "paddusb    %[ftmp1],   %[ftmp11],          %[ftmp12]           \n\t"
     "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp8]            \n\t"
@@ -354,8 +364,8 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
     "paddsb     %[ftmp11],  %[ftmp2],           %[ff_pb_04]         \n\t"
     "paddsb     %[ftmp12],  %[ftmp2],           %[ff_pb_03]         \n\t"
 
-    "li         %[tmp0],    0x0b                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp7]                                \n\t"
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp7]                                \n\t"
     "pxor      %[ftmp0],    %[ftmp0],           %[ftmp0]            \n\t"
     "pxor      %[ftmp8],    %[ftmp8],           %[ftmp8]            \n\t"
     "punpcklbh %[ftmp0],    %[ftmp0],           %[ftmp12]           \n\t"
@@ -379,8 +389,8 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
     "paddsh     %[ftmp0],   %[ftmp0],           %[ff_ph_01]         \n\t"
     "paddsh     %[ftmp8],   %[ftmp8],           %[ff_ph_01]         \n\t"
 
-    "li         %[tmp0],    0x01                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp7]                                \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp7]                                \n\t"
     "psrah      %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
     "psrah      %[ftmp8],   %[ftmp8],           %[ftmp7]            \n\t"
     "packsshb   %[ftmp2],   %[ftmp0],           %[ftmp8]            \n\t"
@@ -450,15 +460,16 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
       [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
       [ftmp12]"=&f"(ftmp[12]),            [tmp0]"=&r"(tmp[0]),
       [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
-      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count)
+      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count),
+      [ff_ph_01]"=&f"(ff_ph_01),        [ff_pb_03]"=&f"(ff_pb_03),
+      [ff_pb_04]"=&f"(ff_pb_04),        [ff_pb_80]"=&f"(ff_pb_80),
+      [ff_pb_fe]"=&f"(ff_pb_fe)
     : [limit]"r"(limit),                [blimit]"r"(blimit),
       [thresh]"r"(thresh),
-      [src_pixel_step]"r"((mips_reg)src_pixel_step),
-      [ff_ph_01]"f"(ff_ph_01),          [ff_pb_03]"f"(ff_pb_03),
-      [ff_pb_04]"f"(ff_pb_04),          [ff_pb_80]"f"(ff_pb_80),
-      [ff_pb_fe]"f"(ff_pb_fe)
+      [src_pixel_step]"r"((mips_reg)src_pixel_step)
     : "memory"
   );
+  /* clang-format on */
 }
 
 /* clang-format off */
@@ -484,10 +495,29 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
 void vp8_mbloop_filter_horizontal_edge_mmi(
     unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
     const unsigned char *limit, const unsigned char *thresh, int count) {
-  uint32_t tmp[1];
+  uint64_t tmp[1];
   double ftmp[13];
+  double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03, ff_ph_003f, ff_ph_0900,
+      ff_ph_1200, ff_ph_1b00;
 
+  /* clang-format off */
   __asm__ volatile (
+    "dli        %[tmp0],    0xfefefefefefefefe                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_fe]                             \n\t"
+    "dli        %[tmp0],    0x8080808080808080                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_80]                             \n\t"
+    "dli        %[tmp0],    0x0404040404040404                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_04]                             \n\t"
+    "dli        %[tmp0],    0x0303030303030303                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_03]                             \n\t"
+    "dli        %[tmp0],    0x003f003f003f003f                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_003f]                           \n\t"
+    "dli        %[tmp0],    0x0900090009000900                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_0900]                           \n\t"
+    "dli        %[tmp0],    0x1200120012001200                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_1200]                           \n\t"
+    "dli        %[tmp0],    0x1b001b001b001b00                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_1b00]                           \n\t"
     MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
     MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
     "1:                                                             \n\t"
@@ -550,8 +580,8 @@ void vp8_mbloop_filter_horizontal_edge_mmi(
     "paddusb    %[ftmp1],   %[ftmp1],           %[ftmp1]            \n\t"
     "pasubub    %[ftmp2],   %[ftmp4],           %[ftmp7]            \n\t"
     "pand       %[ftmp2],   %[ftmp2],           %[ff_pb_fe]         \n\t"
-    "li         %[tmp0],    0x01                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
     "psrlh      %[ftmp2],   %[ftmp2],           %[ftmp9]            \n\t"
     "paddusb    %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
     "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp12]           \n\t"
@@ -584,8 +614,8 @@ void vp8_mbloop_filter_horizontal_edge_mmi(
     "pandn      %[ftmp12],  %[ftmp1],           %[ftmp2]            \n\t"
     "pand       %[ftmp2],   %[ftmp2],           %[ftmp1]            \n\t"
 
-    "li         %[tmp0],    0x0b                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
     "paddsb     %[ftmp0],   %[ftmp2],           %[ff_pb_03]         \n\t"
     VP8_MBLOOP_HPSRAB
     "paddsb     %[ftmp5],   %[ftmp5],           %[ftmp0]            \n\t"
@@ -593,8 +623,8 @@ void vp8_mbloop_filter_horizontal_edge_mmi(
     VP8_MBLOOP_HPSRAB
     "psubsb     %[ftmp6],   %[ftmp6],           %[ftmp0]            \n\t"
 
-    "li         %[tmp0],    0x07                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+    "dli        %[tmp0],    0x07                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
     "pxor       %[ftmp0],   %[ftmp0],           %[ftmp0]            \n\t"
 
     VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1b00])
@@ -649,18 +679,20 @@ void vp8_mbloop_filter_horizontal_edge_mmi(
       [ftmp8]"=&f"(ftmp[8]),              [ftmp9]"=&f"(ftmp[9]),
       [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
       [ftmp12]"=&f"(ftmp[12]),            [tmp0]"=&r"(tmp[0]),
-      [src_ptr]"+&r"(src_ptr),            [count]"+&r"(count)
+      [src_ptr]"+&r"(src_ptr),            [count]"+&r"(count),
+      [ff_pb_fe]"=&f"(ff_pb_fe),          [ff_pb_80]"=&f"(ff_pb_80),
+      [ff_pb_04]"=&f"(ff_pb_04),          [ff_pb_03]"=&f"(ff_pb_03),
+      [ff_ph_0900]"=&f"(ff_ph_0900),      [ff_ph_1b00]"=&f"(ff_ph_1b00),
+      [ff_ph_1200]"=&f"(ff_ph_1200),      [ff_ph_003f]"=&f"(ff_ph_003f)
     : [limit]"r"(limit),                  [blimit]"r"(blimit),
       [thresh]"r"(thresh),
-      [src_pixel_step]"r"((mips_reg)src_pixel_step),
-      [ff_pb_fe]"f"(ff_pb_fe),            [ff_pb_80]"f"(ff_pb_80),
-      [ff_pb_04]"f"(ff_pb_04),            [ff_pb_03]"f"(ff_pb_03),
-      [ff_ph_0900]"f"(ff_ph_0900),        [ff_ph_1b00]"f"(ff_ph_1b00),
-      [ff_ph_1200]"f"(ff_ph_1200),        [ff_ph_003f]"f"(ff_ph_003f)
+      [src_pixel_step]"r"((mips_reg)src_pixel_step)
     : "memory"
   );
+  /* clang-format on */
 }
 
+/* clang-format off */
 #define VP8_MBLOOP_VPSRAB_ADDH                                          \
   "pxor       %[ftmp7],   %[ftmp7],           %[ftmp7]            \n\t" \
   "pxor       %[ftmp8],   %[ftmp8],           %[ftmp8]            \n\t" \
@@ -673,15 +705,30 @@ void vp8_mbloop_filter_horizontal_edge_mmi(
   "psrah      %[ftmp7],   %[ftmp7],           %[ftmp12]           \n\t" \
   "psrah      %[ftmp8],   %[ftmp8],           %[ftmp12]           \n\t" \
   "packsshb   %[ftmp3],   %[ftmp7],           %[ftmp8]            \n\t"
+/* clang-format on */
 
 void vp8_mbloop_filter_vertical_edge_mmi(
     unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
     const unsigned char *limit, const unsigned char *thresh, int count) {
   mips_reg tmp[1];
-  DECLARE_ALIGNED(8, const uint64_t, srct[1]);
+  DECLARE_ALIGNED(8, const uint64_t, srct[2]);
   double ftmp[14];
+  double ff_ph_003f, ff_ph_0900, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03;
 
+  /* clang-format off */
   __asm__ volatile (
+    "dli        %[tmp0],    0x003f003f003f003f                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_003f]                           \n\t"
+    "dli        %[tmp0],    0x0900090009000900                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_0900]                           \n\t"
+    "dli        %[tmp0],    0xfefefefefefefefe                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_fe]                             \n\t"
+    "dli        %[tmp0],    0x8080808080808080                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_80]                             \n\t"
+    "dli        %[tmp0],    0x0404040404040404                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_04]                             \n\t"
+    "dli        %[tmp0],    0x0303030303030303                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_03]                             \n\t"
     MMI_SUBU(%[src_ptr], %[src_ptr], 0x04)
 
     "1:                                                             \n\t"
@@ -783,8 +830,8 @@ void vp8_mbloop_filter_vertical_edge_mmi(
     /* abs (p1-q1) / 2 */
     "pasubub    %[ftmp12],  %[ftmp10],          %[ftmp5]            \n\t"
     "pand       %[ftmp12],  %[ftmp12],          %[ff_pb_fe]         \n\t"
-    "li         %[tmp0],    0x01                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp8]                                \n\t"
     "psrlh      %[ftmp12],  %[ftmp12],          %[ftmp8]            \n\t"
     "paddusb    %[ftmp12],  %[ftmp1],           %[ftmp12]           \n\t"
     "psubusb    %[ftmp12],  %[ftmp12],          %[ftmp13]           \n\t"
@@ -824,8 +871,8 @@ void vp8_mbloop_filter_vertical_edge_mmi(
     "pandn      %[ftmp0],   %[ftmp1],           %[ftmp0]            \n\t"
 
     "paddsb     %[ftmp4],   %[ftmp3],           %[ff_pb_04]         \n\t"
-    "li         %[tmp0],    0x0b                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp12]                               \n\t"
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp12]                               \n\t"
     "punpcklbh  %[ftmp7],   %[ftmp7],           %[ftmp4]            \n\t"
     "punpckhbh  %[ftmp8],   %[ftmp8],           %[ftmp4]            \n\t"
     "psrah      %[ftmp7],   %[ftmp7],           %[ftmp12]           \n\t"
@@ -842,8 +889,8 @@ void vp8_mbloop_filter_vertical_edge_mmi(
     /* ftmp6: ps0 */
     "paddsb     %[ftmp6],   %[ftmp6],           %[ftmp3]            \n\t"
 
-    "li         %[tmp0],    0x07                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp12]                               \n\t"
+    "dli        %[tmp0],    0x07                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp12]                               \n\t"
     VP8_MBLOOP_VPSRAB_ADDH
     "paddh      %[ftmp1],   %[ff_ph_0900],      %[ff_ph_0900]       \n\t"
     "paddh      %[ftmp1],   %[ftmp1],           %[ff_ph_0900]       \n\t"
@@ -948,17 +995,19 @@ void vp8_mbloop_filter_vertical_edge_mmi(
       [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
       [ftmp12]"=&f"(ftmp[12]),            [ftmp13]"=&f"(ftmp[13]),
       [tmp0]"=&r"(tmp[0]),                [src_ptr]"+&r"(src_ptr),
-      [count]"+&r"(count)
+      [count]"+&r"(count),
+      [ff_ph_003f]"=&f"(ff_ph_003f),    [ff_ph_0900]"=&f"(ff_ph_0900),
+      [ff_pb_03]"=&f"(ff_pb_03),        [ff_pb_04]"=&f"(ff_pb_04),
+      [ff_pb_80]"=&f"(ff_pb_80),        [ff_pb_fe]"=&f"(ff_pb_fe)
     : [limit]"r"(limit),                [blimit]"r"(blimit),
       [srct]"r"(srct),                  [thresh]"r"(thresh),
-      [src_pixel_step]"r"((mips_reg)src_pixel_step),
-      [ff_ph_003f]"f"(ff_ph_003f),      [ff_ph_0900]"f"(ff_ph_0900),
-      [ff_pb_03]"f"(ff_pb_03),          [ff_pb_04]"f"(ff_pb_04),
-      [ff_pb_80]"f"(ff_pb_80),          [ff_pb_fe]"f"(ff_pb_fe)
+      [src_pixel_step]"r"((mips_reg)src_pixel_step)
     : "memory"
   );
+  /* clang-format on */
 }
 
+/* clang-format off */
 #define VP8_SIMPLE_HPSRAB                                               \
   "psllh      %[ftmp0],   %[ftmp5],           %[ftmp8]            \n\t" \
   "psrah      %[ftmp0],   %[ftmp0],           %[ftmp9]            \n\t" \
@@ -966,23 +1015,38 @@ void vp8_mbloop_filter_vertical_edge_mmi(
   "psrah      %[ftmp1],   %[ftmp5],           %[ftmp10]           \n\t" \
   "psllh      %[ftmp1],   %[ftmp1],           %[ftmp8]            \n\t" \
   "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+/* clang-format on */
 
 void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr,
                                                 int src_pixel_step,
                                                 const unsigned char *blimit) {
-  uint32_t tmp[1], count = 2;
+  uint64_t tmp[1], count = 2;
   mips_reg addr[2];
   double ftmp[12];
+  double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01;
 
+  /* clang-format off */
   __asm__ volatile (
-    "li         %[tmp0],    0x08                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
-    "li         %[tmp0],    0x03                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
-    "li         %[tmp0],    0x0b                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp10]                               \n\t"
-    "li         %[tmp0],    0x01                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp11]                               \n\t"
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                               \n\t"
+    "dli        %[tmp0],    0x08                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp8]                                \n\t"
+    "dli        %[tmp0],    0x03                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                               \n\t"
+    "dli        %[tmp0],    0xfefefefefefefefe                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_fe]                             \n\t"
+    "dli        %[tmp0],    0x8080808080808080                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_80]                             \n\t"
+    "dli        %[tmp0],    0x0404040404040404                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_04]                             \n\t"
+    "dli        %[tmp0],    0x0101010101010101                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_01]                             \n\t"
 
     "1:                                                             \n\t"
     "gsldlc1    %[ftmp3],   0x07(%[blimit])                         \n\t"
@@ -996,7 +1060,7 @@ void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr,
     "gsldlc1    %[ftmp7],   0x07(%[addr0])                          \n\t"
     "gsldrc1    %[ftmp7],   0x00(%[addr0])                          \n\t"
     "pasubub    %[ftmp1],   %[ftmp7],           %[ftmp2]            \n\t"
-    "and        %[ftmp1],   %[ftmp1],           %[ff_pb_fe]         \n\t"
+    "pand       %[ftmp1],   %[ftmp1],           %[ff_pb_fe]         \n\t"
     "psrlh      %[ftmp1],   %[ftmp1],           %[ftmp11]           \n\t"
 
     MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
@@ -1020,7 +1084,7 @@ void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr,
     "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp0]            \n\t"
     "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp0]            \n\t"
     "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp0]            \n\t"
-    "and        %[ftmp5],   %[ftmp5],           %[ftmp2]            \n\t"
+    "pand       %[ftmp5],   %[ftmp5],           %[ftmp2]            \n\t"
 
     "paddsb     %[ftmp5],   %[ftmp5],           %[ff_pb_04]         \n\t"
     VP8_SIMPLE_HPSRAB
@@ -1048,30 +1112,43 @@ void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr,
       [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
       [tmp0]"=&r"(tmp[0]),
       [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
-      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count)
+      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count),
+      [ff_pb_fe]"=&f"(ff_pb_fe),        [ff_pb_80]"=&f"(ff_pb_80),
+      [ff_pb_04]"=&f"(ff_pb_04),        [ff_pb_01]"=&f"(ff_pb_01)
     : [blimit]"r"(blimit),
       [src_pixel_step]"r"((mips_reg)src_pixel_step),
-      [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
-      [ff_pb_fe]"f"(ff_pb_fe),          [ff_pb_80]"f"(ff_pb_80),
-      [ff_pb_04]"f"(ff_pb_04),          [ff_pb_01]"f"(ff_pb_01)
+      [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1))
     : "memory"
   );
+  /* clang-format on */
 }
 
 void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr,
                                               int src_pixel_step,
                                               const unsigned char *blimit) {
-  uint32_t tmp[1], count = 2;
+  uint64_t tmp[1], count = 2;
   mips_reg addr[2];
-  DECLARE_ALIGNED(8, const uint64_t, srct[1]);
-  double ftmp[12];
+  DECLARE_ALIGNED(8, const uint64_t, srct[2]);
+  double ftmp[12], ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01;
 
+  /* clang-format off */
   __asm__ volatile (
-    "li         %[tmp0],    0x08                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
-    "li         %[tmp0],    0x20                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp10]                               \n\t"
-
+    "dli        %[tmp0],    0x08                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp8]                                \n\t"
+    "dli        %[tmp0],    0x20                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
+    "dli        %[tmp0],    0x08                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp8]                                \n\t"
+    "dli        %[tmp0],    0x20                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
+    "dli        %[tmp0],    0xfefefefefefefefe                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_fe]                             \n\t"
+    "dli        %[tmp0],    0x8080808080808080                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_80]                             \n\t"
+    "dli        %[tmp0],    0x0404040404040404                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_04]                             \n\t"
+    "dli        %[tmp0],    0x0101010101010101                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_01]                             \n\t"
     MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x4])
     MMI_SUBU(%[src_ptr], %[src_ptr], 0x02)
 
@@ -1118,8 +1195,8 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr,
     "punpckhwd  %[ftmp3],   %[ftmp2],           %[ftmp5]            \n\t"
     "punpcklwd  %[ftmp2],   %[ftmp2],           %[ftmp5]            \n\t"
 
-    "li         %[tmp0],    0x01                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
     "pasubub    %[ftmp6],   %[ftmp3],           %[ftmp0]            \n\t"
     "pand       %[ftmp6],   %[ftmp6],           %[ff_pb_fe]         \n\t"
     "psrlh      %[ftmp6],   %[ftmp6],           %[ftmp9]            \n\t"
@@ -1149,14 +1226,14 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr,
     "pand       %[ftmp5],   %[ftmp5],           %[ftmp0]            \n\t"
     "paddsb     %[ftmp5],   %[ftmp5],           %[ff_pb_04]         \n\t"
 
-    "li         %[tmp0],    0x03                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+    "dli        %[tmp0],    0x03                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
     "psllh      %[ftmp0],   %[ftmp5],           %[ftmp8]            \n\t"
     "psrah      %[ftmp0],   %[ftmp0],           %[ftmp9]            \n\t"
     "psrlh      %[ftmp0],   %[ftmp0],           %[ftmp8]            \n\t"
 
-    "li         %[tmp0],    0x0b                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
     "psrah      %[ftmp7],   %[ftmp5],           %[ftmp9]            \n\t"
     "psllh      %[ftmp7],   %[ftmp7],           %[ftmp8]            \n\t"
     "por        %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
@@ -1164,14 +1241,14 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr,
     "pxor       %[ftmp3],   %[ftmp3],           %[ff_pb_80]         \n\t"
     "psubsb     %[ftmp5],   %[ftmp5],           %[ff_pb_01]         \n\t"
 
-    "li         %[tmp0],    0x03                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+    "dli        %[tmp0],    0x03                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
     "psllh      %[ftmp0],   %[ftmp5],           %[ftmp8]            \n\t"
     "psrah      %[ftmp0],   %[ftmp0],           %[ftmp9]            \n\t"
     "psrlh      %[ftmp0],   %[ftmp0],           %[ftmp8]            \n\t"
 
-    "li         %[tmp0],    0x0b                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
     "psrah      %[ftmp5],   %[ftmp5],           %[ftmp9]            \n\t"
     "psllh      %[ftmp5],   %[ftmp5],           %[ftmp8]            \n\t"
     "por        %[ftmp0],   %[ftmp0],           %[ftmp5]            \n\t"
@@ -1235,16 +1312,17 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr,
       [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
       [tmp0]"=&r"(tmp[0]),
       [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
-      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count)
+      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count),
+      [ff_pb_fe]"=&f"(ff_pb_fe),        [ff_pb_80]"=&f"(ff_pb_80),
+      [ff_pb_04]"=&f"(ff_pb_04),        [ff_pb_01]"=&f"(ff_pb_01)
     : [blimit]"r"(blimit),              [srct]"r"(srct),
       [src_pixel_step]"r"((mips_reg)src_pixel_step),
       [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
       [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)),
-      [src_pixel_step_x8]"r"((mips_reg)(src_pixel_step<<3)),
-      [ff_pb_fe]"f"(ff_pb_fe),          [ff_pb_80]"f"(ff_pb_80),
-      [ff_pb_04]"f"(ff_pb_04),          [ff_pb_01]"f"(ff_pb_01)
+      [src_pixel_step_x8]"r"((mips_reg)(src_pixel_step<<3))
     : "memory"
   );
+  /* clang-format on */
 }
 
 /* Horizontal MB filtering */
index dbe35d09f3f301527c211e5090f25ff040106b2d..b85f73fdff8ff39678f9d340199d9bb6a3120fa2 100644 (file)
@@ -70,9 +70,8 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
                                              unsigned int output_height,
                                              unsigned int output_width,
                                              const int16_t *vp8_filter) {
-  uint32_t tmp[1];
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
-
+  uint64_t tmp[1];
+  double ff_ph_40;
 #if _MIPS_SIM == _ABIO32
   register double fzero asm("$f0");
   register double ftmp0 asm("$f2");
@@ -103,7 +102,10 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
   register double ftmp11 asm("$f12");
 #endif  // _MIPS_SIM == _ABIO32
 
+  /* clang-format off */
   __asm__ volatile (
+    "dli        %[tmp0],        0x0040004000400040                    \n\t"
+    "dmtc1      %[tmp0],        %[ff_ph_40]                           \n\t"
     "ldc1       %[ftmp0],       0x00(%[vp8_filter])                   \n\t"
     "ldc1       %[ftmp1],       0x10(%[vp8_filter])                   \n\t"
     "ldc1       %[ftmp2],       0x20(%[vp8_filter])                   \n\t"
@@ -111,10 +113,10 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
     "ldc1       %[ftmp4],       0x40(%[vp8_filter])                   \n\t"
     "ldc1       %[ftmp5],       0x50(%[vp8_filter])                   \n\t"
     "pxor       %[fzero],       %[fzero],           %[fzero]          \n\t"
-    "li         %[tmp0],        0x07                                  \n\t"
-    "mtc1       %[tmp0],        %[ftmp7]                              \n\t"
-    "li         %[tmp0],        0x08                                  \n\t"
-    "mtc1       %[tmp0],        %[ftmp11]                             \n\t"
+    "dli        %[tmp0],        0x07                                  \n\t"
+    "dmtc1      %[tmp0],        %[ftmp7]                              \n\t"
+    "dli        %[tmp0],        0x08                                  \n\t"
+    "dmtc1      %[tmp0],        %[ftmp11]                             \n\t"
 
     "1:                                                               \n\t"
     "gsldlc1    %[ftmp9],       0x05(%[src_ptr])                      \n\t"
@@ -166,21 +168,22 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
       [ftmp9]"=&f"(ftmp9),              [ftmp10]"=&f"(ftmp10),
       [ftmp11]"=&f"(ftmp11),            [tmp0]"=&r"(tmp[0]),
       [output_ptr]"+&r"(output_ptr),    [output_height]"+&r"(output_height),
-      [src_ptr]"+&r"(src_ptr)
+      [src_ptr]"+&r"(src_ptr),          [ff_ph_40]"=&f"(ff_ph_40)
     : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line),
-      [vp8_filter]"r"(vp8_filter),      [output_width]"r"(output_width),
-      [ff_ph_40]"f"(ff_ph_40)
+      [vp8_filter]"r"(vp8_filter),      [output_width]"r"(output_width)
     : "memory"
     );
+  /* clang-format on */
 }
 
 /* Horizontal filter:  pixel_step is always W */
 static INLINE void vp8_filter_block1dc_v6_mmi(
     uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height,
     int output_pitch, unsigned int pixels_per_line, const int16_t *vp8_filter) {
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
-  uint32_t tmp[1];
+  double ff_ph_40;
+  uint64_t tmp[1];
   mips_reg addr[1];
+
 #if _MIPS_SIM == _ABIO32
   register double fzero asm("$f0");
   register double ftmp0 asm("$f2");
@@ -215,7 +218,10 @@ static INLINE void vp8_filter_block1dc_v6_mmi(
   register double ftmp13 asm("$f14");
 #endif  // _MIPS_SIM == _ABIO32
 
+  /* clang-format off */
   __asm__ volatile (
+    "dli        %[tmp0],      0x0040004000400040                      \n\t"
+    "dmtc1      %[tmp0],      %[ff_ph_40]                             \n\t"
     "ldc1       %[ftmp0],     0x00(%[vp8_filter])                     \n\t"
     "ldc1       %[ftmp1],     0x10(%[vp8_filter])                     \n\t"
     "ldc1       %[ftmp2],     0x20(%[vp8_filter])                     \n\t"
@@ -223,8 +229,8 @@ static INLINE void vp8_filter_block1dc_v6_mmi(
     "ldc1       %[ftmp4],     0x40(%[vp8_filter])                     \n\t"
     "ldc1       %[ftmp5],     0x50(%[vp8_filter])                     \n\t"
     "pxor       %[fzero],     %[fzero],        %[fzero]               \n\t"
-    "li         %[tmp0],      0x07                                    \n\t"
-    "mtc1       %[tmp0],      %[ftmp13]                               \n\t"
+    "dli        %[tmp0],      0x07                                    \n\t"
+    "dmtc1      %[tmp0],      %[ftmp13]                               \n\t"
 
     /* In order to make full use of memory load delay slot,
      * Operation of memory loading and calculating has been rearranged.
@@ -285,15 +291,16 @@ static INLINE void vp8_filter_block1dc_v6_mmi(
       [ftmp11]"=&f"(ftmp11),            [ftmp12]"=&f"(ftmp12),
       [ftmp13]"=&f"(ftmp13),            [tmp0]"=&r"(tmp[0]),
       [addr0]"=&r"(addr[0]),            [src_ptr]"+&r"(src_ptr),
-      [output_ptr]"+&r"(output_ptr),    [output_height]"+&r"(output_height)
+      [output_ptr]"+&r"(output_ptr),    [output_height]"+&r"(output_height),
+      [ff_ph_40]"=&f"(ff_ph_40)
     : [pixels_per_line]"r"((mips_reg)pixels_per_line),
       [pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)),
       [pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)),
       [vp8_filter]"r"(vp8_filter),
-      [output_pitch]"r"((mips_reg)output_pitch),
-      [ff_ph_40]"f"(ff_ph_40)
+      [output_pitch]"r"((mips_reg)output_pitch)
     : "memory"
     );
+  /* clang-format on */
 }
 
 /* When xoffset == 0, vp8_filter= {0,0,128,0,0,0},
@@ -313,6 +320,7 @@ static INLINE void vp8_filter_block1d_h6_filter0_mmi(
   register double ftmp1 asm("$f2");
 #endif  // _MIPS_SIM == _ABIO32
 
+  /* clang-format off */
   __asm__ volatile (
     "pxor       %[fzero],       %[fzero],           %[fzero]          \n\t"
 
@@ -335,6 +343,7 @@ static INLINE void vp8_filter_block1d_h6_filter0_mmi(
       [output_width]"r"(output_width)
     : "memory"
     );
+  /* clang-format on */
 }
 
 static INLINE void vp8_filter_block1dc_v6_filter0_mmi(
@@ -350,6 +359,7 @@ static INLINE void vp8_filter_block1dc_v6_filter0_mmi(
   register double ftmp1 asm("$f2");
 #endif  // _MIPS_SIM == _ABIO32
 
+  /* clang-format off */
   __asm__ volatile (
     "pxor       %[fzero],     %[fzero],        %[fzero]               \n\t"
 
@@ -371,6 +381,7 @@ static INLINE void vp8_filter_block1dc_v6_filter0_mmi(
       [output_pitch]"r"((mips_reg)output_pitch)
     : "memory"
     );
+  /* clang-format on */
 }
 
 #define sixtapNxM(n, m)                                                        \
index b5ecf0f1ca77ed98495879f391f2e1741768d189..0fd25fcda507212a44a6f17a82a29987a1f06d67 100644 (file)
@@ -46,6 +46,7 @@
 void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
   uint64_t tmp[1];
   int16_t *ip = input;
+  double ff_ph_op1, ff_ph_op3;
 
 #if _MIPS_SIM == _ABIO32
   register double ftmp0 asm("$f0");
@@ -83,13 +84,16 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
   DECLARE_ALIGNED(8, const uint64_t, ff_pw_51000) = { 0x0000c7380000c738ULL };
   DECLARE_ALIGNED(8, const uint64_t, ff_pw_14500) = { 0x000038a4000038a4ULL };
   DECLARE_ALIGNED(8, const uint64_t, ff_pw_7500) = { 0x00001d4c00001d4cULL };
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_op1) = { 0x14e808a914e808a9ULL };
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_op3) = { 0xeb1808a9eb1808a9ULL };
   DECLARE_ALIGNED(8, const uint64_t, ff_pw_5352) = { 0x000014e8000014e8ULL };
   DECLARE_ALIGNED(8, const uint64_t, ff_pw_2217) = { 0x000008a9000008a9ULL };
   DECLARE_ALIGNED(8, const uint64_t, ff_ph_8) = { 0x0008000800080008ULL };
 
+  /* clang-format off */
   __asm__ volatile (
+    "dli        %[tmp0],    0x14e808a914e808a9              \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_op1]                    \n\t"
+    "dli        %[tmp0],    0xeb1808a9eb1808a9              \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_op3]                    \n\t"
     "pxor       %[ftmp0],   %[ftmp0],      %[ftmp0]         \n\t"
     "gsldlc1    %[ftmp1],   0x07(%[ip])                     \n\t"
     "gsldrc1    %[ftmp1],   0x00(%[ip])                     \n\t"
@@ -129,7 +133,7 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
 
     // op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12
     MMI_LI(%[tmp0], 0x0c)
-    "mtc1       %[tmp0],    %[ftmp11]                       \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                       \n\t"
     "ldc1       %[ftmp12],  %[ff_pw_14500]                  \n\t"
     "punpcklhw  %[ftmp9],   %[ftmp7],       %[ftmp8]        \n\t"
     "pmaddhw    %[ftmp5],   %[ftmp9],       %[ff_ph_op1]    \n\t"
@@ -169,7 +173,7 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
     "paddh      %[ftmp1],   %[ftmp1],       %[ftmp9]        \n\t"
     "paddh      %[ftmp2],   %[ftmp2],       %[ftmp9]        \n\t"
     MMI_LI(%[tmp0], 0x04)
-    "mtc1       %[tmp0],    %[ftmp9]                        \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                        \n\t"
     "psrah      %[ftmp1],   %[ftmp1],       %[ftmp9]        \n\t"
     "psrah      %[ftmp2],   %[ftmp2],       %[ftmp9]        \n\t"
 
@@ -211,15 +215,16 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
       [ftmp3] "=&f"(ftmp3), [ftmp4] "=&f"(ftmp4), [ftmp5] "=&f"(ftmp5),
       [ftmp6] "=&f"(ftmp6), [ftmp7] "=&f"(ftmp7), [ftmp8] "=&f"(ftmp8),
       [ftmp9] "=&f"(ftmp9), [ftmp10] "=&f"(ftmp10), [ftmp11] "=&f"(ftmp11),
-      [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip)
+      [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip),
+      [ff_ph_op1] "=&f"(ff_ph_op1), [ff_ph_op3] "=&f"(ff_ph_op3)
     : [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07),
-      [ff_ph_op1] "f"(ff_ph_op1), [ff_ph_op3] "f"(ff_ph_op3),
       [ff_pw_14500] "m"(ff_pw_14500), [ff_pw_7500] "m"(ff_pw_7500),
       [ff_pw_12000] "m"(ff_pw_12000), [ff_pw_51000] "m"(ff_pw_51000),
       [ff_pw_5352]"m"(ff_pw_5352), [ff_pw_2217]"m"(ff_pw_2217),
       [ff_ph_8]"m"(ff_ph_8), [pitch]"r"(pitch), [output] "r"(output)
     : "memory"
   );
+  /* clang-format on */
 }
 
 void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) {
@@ -228,17 +233,22 @@ void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) {
 }
 
 void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) {
-  double ftmp[13];
-  uint32_t tmp[1];
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
-  DECLARE_ALIGNED(8, const uint64_t, ff_pw_01) = { 0x0000000100000001ULL };
-  DECLARE_ALIGNED(8, const uint64_t, ff_pw_03) = { 0x0000000300000003ULL };
-  DECLARE_ALIGNED(8, const uint64_t, ff_pw_mask) = { 0x0001000000010000ULL };
+  double ftmp[13], ff_ph_01, ff_pw_01, ff_pw_03, ff_pw_mask;
+  uint64_t tmp[1];
 
+  /* clang-format off */
   __asm__ volatile (
+    "dli        %[tmp0],    0x0001000100010001                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_01]                         \n\t"
+    "dli        %[tmp0],    0x0000000100000001                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_pw_01]                         \n\t"
+    "dli        %[tmp0],    0x0000000300000003                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_pw_03]                         \n\t"
+    "dli        %[tmp0],    0x0001000000010000                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_pw_mask]                       \n\t"
     MMI_LI(%[tmp0], 0x02)
     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                           \n\t"
 
     "gsldlc1    %[ftmp1],   0x07(%[ip])                         \n\t"
     "gsldrc1    %[ftmp1],   0x00(%[ip])                         \n\t"
@@ -337,7 +347,7 @@ void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) {
     "psubw      %[ftmp4],   %[ftmp9],       %[ftmp10]           \n\t"
 
     MMI_LI(%[tmp0], 0x03)
-    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                           \n\t"
 
     "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp1]            \n\t"
     "pand       %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
@@ -393,7 +403,7 @@ void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) {
     "packsswh   %[ftmp4],   %[ftmp4],       %[ftmp8]            \n\t"
 
     MMI_LI(%[tmp0], 0x72)
-    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                           \n\t"
     "pshufh     %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"
     "pshufh     %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"
     "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"
@@ -413,13 +423,12 @@ void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) {
       [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
       [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
       [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
-      [ftmp12]"=&f"(ftmp[12]),
-      [tmp0]"=&r"(tmp[0]),
-      [ip]"+&r"(input)
-    : [op]"r"(output),
-      [ff_pw_01]"f"(ff_pw_01),          [pitch]"r"((mips_reg)pitch),
-      [ff_pw_03]"f"(ff_pw_03),          [ff_pw_mask]"f"(ff_pw_mask),
-      [ff_ph_01]"f"(ff_ph_01)
+      [ftmp12]"=&f"(ftmp[12]),          [ff_pw_mask]"=&f"(ff_pw_mask),
+      [tmp0]"=&r"(tmp[0]),              [ff_pw_01]"=&f"(ff_pw_01),
+      [ip]"+&r"(input),                 [ff_pw_03]"=&f"(ff_pw_03),
+      [ff_ph_01]"=&f"(ff_ph_01)
+    : [op]"r"(output),                  [pitch]"r"((mips_reg)pitch)
     : "memory"
   );
+  /* clang-format on */
 }
index 69a9e5e01852c1a692c9e81079cb3c2ce5b209b2..1986444aa39635e4be7dc43de5b1062ee3fdcf7b 100644 (file)
@@ -42,16 +42,17 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) {
 
   double ftmp[13];
   uint64_t tmp[1];
-  DECLARE_ALIGNED(8, const uint64_t, ones) = { 0xffffffffffffffffULL };
-  int eob = 0;
+  int64_t eob = 0;
+  double ones;
 
   __asm__ volatile(
       // loop 0 ~ 7
       "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
+      "pcmpeqh    %[ones],    %[ones],        %[ones]         \n\t"
       "gsldlc1    %[ftmp1],   0x07(%[coeff_ptr])              \n\t"
       "gsldrc1    %[ftmp1],   0x00(%[coeff_ptr])              \n\t"
-      "li         %[tmp0],    0x0f                            \n\t"
-      "mtc1       %[tmp0],    %[ftmp9]                        \n\t"
+      "dli        %[tmp0],    0x0f                            \n\t"
+      "dmtc1      %[tmp0],    %[ftmp9]                        \n\t"
       "gsldlc1    %[ftmp2],   0x0f(%[coeff_ptr])              \n\t"
       "gsldrc1    %[ftmp2],   0x08(%[coeff_ptr])              \n\t"
 
@@ -165,18 +166,18 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) {
       "gssdlc1    %[ftmp6],   0x1f(%[dqcoeff_ptr])            \n\t"
       "gssdrc1    %[ftmp6],   0x18(%[dqcoeff_ptr])            \n\t"
 
-      "li         %[tmp0],    0x10                            \n\t"
-      "mtc1       %[tmp0],    %[ftmp9]                        \n\t"
+      "dli        %[tmp0],    0x10                            \n\t"
+      "dmtc1      %[tmp0],    %[ftmp9]                        \n\t"
 
       "pmaxsh     %[ftmp10],  %[ftmp10],       %[ftmp11]      \n\t"
       "psrlw      %[ftmp11],  %[ftmp10],       %[ftmp9]       \n\t"
       "pmaxsh     %[ftmp10],  %[ftmp10],       %[ftmp11]      \n\t"
-      "li         %[tmp0],    0xaa                            \n\t"
-      "mtc1       %[tmp0],    %[ftmp9]                        \n\t"
+      "dli        %[tmp0],    0xaa                            \n\t"
+      "dmtc1      %[tmp0],    %[ftmp9]                        \n\t"
       "pshufh     %[ftmp11],  %[ftmp10],       %[ftmp9]       \n\t"
       "pmaxsh     %[ftmp10],  %[ftmp10],       %[ftmp11]      \n\t"
-      "li         %[tmp0],    0xffff                          \n\t"
-      "mtc1       %[tmp0],    %[ftmp9]                        \n\t"
+      "dli        %[tmp0],    0xffff                          \n\t"
+      "dmtc1      %[tmp0],    %[ftmp9]                        \n\t"
       "pand       %[ftmp10],  %[ftmp10],       %[ftmp9]       \n\t"
       "gssdlc1    %[ftmp10],  0x07(%[eob])                    \n\t"
       "gssdrc1    %[ftmp10],  0x00(%[eob])                    \n\t"
@@ -184,15 +185,15 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) {
         [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
         [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
         [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
-        [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
+        [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
+        [tmp0] "=&r"(tmp[0]), [ones] "=&f"(ones)
       : [coeff_ptr] "r"((mips_reg)coeff_ptr),
         [qcoeff_ptr] "r"((mips_reg)qcoeff_ptr),
         [dequant_ptr] "r"((mips_reg)dequant_ptr),
         [round_ptr] "r"((mips_reg)round_ptr),
         [quant_ptr] "r"((mips_reg)quant_ptr),
         [dqcoeff_ptr] "r"((mips_reg)dqcoeff_ptr),
-        [inv_zig_zag] "r"((mips_reg)inv_zig_zag), [eob] "r"((mips_reg)&eob),
-        [ones] "f"(ones)
+        [inv_zig_zag] "r"((mips_reg)inv_zig_zag), [eob] "r"((mips_reg)&eob)
       : "memory");
 
   *d->eob = eob;
index 5dee3164bc16eca36c30a4e67411386193028276..eaca4773f2b86eba254005429fc59682811cf325 100644 (file)
@@ -364,6 +364,7 @@ static inline unsigned int vpx_sad64x(const uint8_t *src, int src_stride,
   double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
   mips_reg l_counter = counter;
 
+  /* clang-format off */
   __asm__ volatile (
     "pxor       %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
     "1:                                                         \n\t"
@@ -383,6 +384,7 @@ static inline unsigned int vpx_sad64x(const uint8_t *src, int src_stride,
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
@@ -405,7 +407,9 @@ static inline unsigned int vpx_sad_avg64x(const uint8_t *src, int src_stride,
   unsigned int sad;
   double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
   mips_reg l_counter = counter;
+  mips_reg l_second_pred = (mips_reg)second_pred;
 
+  /* clang-format off */
   __asm__ volatile (
     "pxor       %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
     "1:                                                         \n\t"
@@ -424,11 +428,12 @@ static inline unsigned int vpx_sad_avg64x(const uint8_t *src, int src_stride,
     : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
       [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
       [src]"+&r"(src), [ref]"+&r"(ref),
-      [second_pred]"+&r"((mips_reg)second_pred),
+      [second_pred]"+&r"(l_second_pred),
       [sad]"=&r"(sad)
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
@@ -450,6 +455,7 @@ static inline unsigned int vpx_sad32x(const uint8_t *src, int src_stride,
   double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
   mips_reg l_counter = counter;
 
+  /* clang-format off */
   __asm__ volatile (
     "pxor       %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
     "1:                                                         \n\t"
@@ -469,6 +475,7 @@ static inline unsigned int vpx_sad32x(const uint8_t *src, int src_stride,
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
@@ -493,7 +500,9 @@ static inline unsigned int vpx_sad_avg32x(const uint8_t *src, int src_stride,
   unsigned int sad;
   double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
   mips_reg l_counter = counter;
+  mips_reg l_second_pred = (mips_reg)second_pred;
 
+  /* clang-format off */
   __asm__ volatile (
     "pxor       %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
     "1:                                                         \n\t"
@@ -512,11 +521,12 @@ static inline unsigned int vpx_sad_avg32x(const uint8_t *src, int src_stride,
     : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
       [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
       [src]"+&r"(src), [ref]"+&r"(ref),
-      [second_pred]"+&r"((mips_reg)second_pred),
+      [second_pred]"+&r"(l_second_pred),
       [sad]"=&r"(sad)
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
@@ -539,6 +549,7 @@ static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride,
   double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
   mips_reg l_counter = counter;
 
+  /* clang-format off */
   __asm__ volatile (
     "pxor       %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
     "1:                                                         \n\t"
@@ -558,6 +569,7 @@ static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride,
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
@@ -586,7 +598,9 @@ static inline unsigned int vpx_sad_avg16x(const uint8_t *src, int src_stride,
   unsigned int sad;
   double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
   mips_reg l_counter = counter;
+  mips_reg l_second_pred = (mips_reg)second_pred;
 
+  /* clang-format off */
   __asm__ volatile (
     "pxor       %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
     "1:                                                         \n\t"
@@ -605,11 +619,12 @@ static inline unsigned int vpx_sad_avg16x(const uint8_t *src, int src_stride,
     : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
       [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
       [src]"+&r"(src), [ref]"+&r"(ref),
-      [second_pred]"+&r"((mips_reg)second_pred),
+      [second_pred]"+&r"(l_second_pred),
       [sad]"=&r"(sad)
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
@@ -632,6 +647,7 @@ static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride,
   double ftmp1, ftmp2, ftmp3;
   mips_reg l_counter = counter;
 
+  /* clang-format off */
   __asm__ volatile (
     "pxor       %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
     "1:                                                         \n\t"
@@ -651,6 +667,7 @@ static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride,
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
@@ -679,7 +696,9 @@ static inline unsigned int vpx_sad_avg8x(const uint8_t *src, int src_stride,
   unsigned int sad;
   double ftmp1, ftmp2, ftmp3;
   mips_reg l_counter = counter;
+  mips_reg l_second_pred = (mips_reg)second_pred;
 
+  /* clang-format off */
   __asm__ volatile (
     "pxor       %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
     "1:                                                         \n\t"
@@ -697,11 +716,12 @@ static inline unsigned int vpx_sad_avg8x(const uint8_t *src, int src_stride,
     "mfc1       %[sad],     %[ftmp3]                            \n\t"
     : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
       [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
-      [second_pred]"+&r"((mips_reg)second_pred),
+      [second_pred]"+&r"(l_second_pred),
       [sad]"=&r"(sad)
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
@@ -724,6 +744,7 @@ static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride,
   double ftmp1, ftmp2, ftmp3;
   mips_reg l_counter = counter;
 
+  /* clang-format off */
   __asm__ volatile (
     "pxor       %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
     "1:                                                         \n\t"
@@ -743,6 +764,7 @@ static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride,
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
@@ -767,7 +789,9 @@ static inline unsigned int vpx_sad_avg4x(const uint8_t *src, int src_stride,
   unsigned int sad;
   double ftmp1, ftmp2, ftmp3;
   mips_reg l_counter = counter;
+  mips_reg l_second_pred = (mips_reg)second_pred;
 
+  /* clang-format off */
   __asm__ volatile (
     "pxor       %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
     "1:                                                         \n\t"
@@ -785,11 +809,12 @@ static inline unsigned int vpx_sad_avg4x(const uint8_t *src, int src_stride,
     "mfc1       %[sad],     %[ftmp3]                            \n\t"
     : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
       [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
-      [second_pred]"+&r"((mips_reg)second_pred),
+      [second_pred]"+&r"(l_second_pred),
       [sad]"=&r"(sad)
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
index 29e52a1a89463d3ce9d5b13c3d15a99980701684..c2adcfa018947eb97fd947d9e27c9913fed03be4 100644 (file)
@@ -414,6 +414,7 @@ static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
@@ -496,6 +497,7 @@ static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse - (((int64_t)sum * sum) / (64 * high));
 }
@@ -519,6 +521,7 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
@@ -577,6 +580,7 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride,
       [sse]"r"(sse)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse - (((int64_t)sum * sum) / 2048);
 }
@@ -590,6 +594,7 @@ static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
@@ -653,6 +658,7 @@ static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse - (((int64_t)sum * sum) / (32 * high));
 }
@@ -676,6 +682,7 @@ static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
@@ -729,6 +736,7 @@ static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse - (((int64_t)sum * sum) / (16 * high));
 }
@@ -753,6 +761,7 @@ static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
@@ -801,6 +810,7 @@ static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse - (((int64_t)sum * sum) / (8 * high));
 }
@@ -825,6 +835,7 @@ static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp10]                           \n\t"
@@ -872,6 +883,7 @@ static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse - (((int64_t)sum * sum) / (4 * high));
 }
@@ -894,6 +906,7 @@ static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
@@ -925,6 +938,7 @@ static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse;
 }
@@ -947,6 +961,7 @@ static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
@@ -978,6 +993,7 @@ static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse;
 }
@@ -1021,22 +1037,39 @@ static inline void var_filter_block2d_bil_16x(const uint8_t *src_ptr,
   uint8_t *temp2_ptr = temp2;
   mips_reg l_counter = counter;
   double ftmp[15];
+  double ff_ph_40, mask;
+  double filter_x0, filter_x1, filter_y0, filter_y1;
   mips_reg tmp[2];
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
-  DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
+  uint64_t x0, x1, y0, y1, all;
 
   const uint8_t *filter_x = bilinear_filters[x_offset];
   const uint8_t *filter_y = bilinear_filters[y_offset];
+  x0 = (uint64_t)filter_x[0];
+  x1 = (uint64_t)filter_x[1];
+  y0 = (uint64_t)filter_y[0];
+  y1 = (uint64_t)filter_y[1];
+  all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
 
+  /* clang-format off */
   __asm__ volatile (
     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    MMI_MTC1(%[all], %[ftmp14])
+    "punpcklbh  %[ftmp14],  %[ftmp14],      %[ftmp0]            \n\t"
+    "pshufh     %[filter_x0], %[ftmp14],    %[ftmp0]            \n\t"
+    MMI_LI(%[tmp0], 0x10)
+    MMI_MTC1(%[tmp0], %[mask])
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_x1], %[ftmp14],    %[ftmp0]            \n\t"
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_y0], %[ftmp14],    %[ftmp0]            \n\t"
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_y1], %[ftmp14],    %[ftmp0]            \n\t"
     MMI_LI(%[tmp0], 0x07)
     MMI_MTC1(%[tmp0], %[ftmp14])
-    "pshufh     %[filter_x0], %[filter_x0], %[ftmp0]            \n\t"
-    "pshufh     %[filter_x1], %[filter_x1], %[ftmp0]            \n\t"
-    "pshufh     %[filter_y0], %[filter_y0], %[ftmp0]            \n\t"
-    "pshufh     %[filter_y1], %[filter_y1], %[ftmp0]            \n\t"
-
+    MMI_LI(%[tmp0], 0x0040004000400040)
+    MMI_MTC1(%[tmp0], %[ff_ph_40])
+    MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
+    MMI_MTC1(%[tmp0], %[mask])
     // fdata3: fdata3[0] ~ fdata3[15]
     VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
 
@@ -1072,15 +1105,13 @@ static inline void var_filter_block2d_bil_16x(const uint8_t *src_ptr,
       [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
       [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
       [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr),
-      [counter]"+&r"(l_counter)
-    : [filter_x0] "f"((uint64_t)filter_x[0]),
-      [filter_x1] "f"((uint64_t)filter_x[1]),
-      [filter_y0] "f"((uint64_t)filter_y[0]),
-      [filter_y1] "f"((uint64_t)filter_y[1]),
-      [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40),
-      [mask] "f"(mask)
+      [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
+      [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
+      [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
+    : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
     : "memory"
   );
+  /* clang-format on */
 }
 
 #define SUBPIX_VAR16XN(H)                                                      \
@@ -1105,19 +1136,38 @@ static inline void var_filter_block2d_bil_8x(const uint8_t *src_ptr,
   mips_reg l_counter = counter;
   double ftmp[15];
   mips_reg tmp[2];
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
-  DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
+  double ff_ph_40, mask;
+  uint64_t x0, x1, y0, y1, all;
+  double filter_x0, filter_x1, filter_y0, filter_y1;
   const uint8_t *filter_x = bilinear_filters[x_offset];
   const uint8_t *filter_y = bilinear_filters[y_offset];
+  x0 = (uint64_t)filter_x[0];
+  x1 = (uint64_t)filter_x[1];
+  y0 = (uint64_t)filter_y[0];
+  y1 = (uint64_t)filter_y[1];
+  all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
 
+  /* clang-format off */
   __asm__ volatile (
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    MMI_MTC1(%[all], %[ftmp14])
+    "punpcklbh  %[ftmp14],  %[ftmp14],      %[ftmp0]            \n\t"
+    "pshufh     %[filter_x0], %[ftmp14],    %[ftmp0]            \n\t"
+    MMI_LI(%[tmp0], 0x10)
+    MMI_MTC1(%[tmp0], %[mask])
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_x1], %[ftmp14],    %[ftmp0]            \n\t"
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_y0], %[ftmp14],    %[ftmp0]            \n\t"
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_y1], %[ftmp14],    %[ftmp0]            \n\t"
     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
     MMI_LI(%[tmp0], 0x07)
     MMI_MTC1(%[tmp0], %[ftmp14])
-    "pshufh     %[filter_x0], %[filter_x0], %[ftmp0]            \n\t"
-    "pshufh     %[filter_x1], %[filter_x1], %[ftmp0]            \n\t"
-    "pshufh     %[filter_y0], %[filter_y0], %[ftmp0]            \n\t"
-    "pshufh     %[filter_y1], %[filter_y1], %[ftmp0]            \n\t"
+    MMI_LI(%[tmp0], 0x0040004000400040)
+    MMI_MTC1(%[tmp0], %[ff_ph_40])
+    MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
+    MMI_MTC1(%[tmp0], %[mask])
 
     // fdata3: fdata3[0] ~ fdata3[7]
     VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
@@ -1154,15 +1204,13 @@ static inline void var_filter_block2d_bil_8x(const uint8_t *src_ptr,
       [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
       [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
       [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr),
-      [counter]"+&r"(l_counter)
-    : [filter_x0] "f"((uint64_t)filter_x[0]),
-      [filter_x1] "f"((uint64_t)filter_x[1]),
-      [filter_y0] "f"((uint64_t)filter_y[0]),
-      [filter_y1] "f"((uint64_t)filter_y[1]),
-      [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40),
-      [mask] "f"(mask)
+      [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
+      [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
+      [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
+    : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
     : "memory"
   );
+  /* clang-format on */
 }
 
 #define SUBPIX_VAR8XN(H)                                                      \
@@ -1188,19 +1236,38 @@ static inline void var_filter_block2d_bil_4x(const uint8_t *src_ptr,
   mips_reg l_counter = counter;
   double ftmp[7];
   mips_reg tmp[2];
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
-  DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
+  double ff_ph_40, mask;
+  uint64_t x0, x1, y0, y1, all;
+  double filter_x0, filter_x1, filter_y0, filter_y1;
   const uint8_t *filter_x = bilinear_filters[x_offset];
   const uint8_t *filter_y = bilinear_filters[y_offset];
+  x0 = (uint64_t)filter_x[0];
+  x1 = (uint64_t)filter_x[1];
+  y0 = (uint64_t)filter_y[0];
+  y1 = (uint64_t)filter_y[1];
+  all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
 
+  /* clang-format off */
   __asm__ volatile (
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    MMI_MTC1(%[all], %[ftmp6])
+    "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+    "pshufh     %[filter_x0], %[ftmp6],     %[ftmp0]            \n\t"
+    MMI_LI(%[tmp0], 0x10)
+    MMI_MTC1(%[tmp0], %[mask])
+    "ssrld      %[ftmp6],   %[ftmp6],       %[mask]             \n\t"
+    "pshufh     %[filter_x1], %[ftmp6],     %[ftmp0]            \n\t"
+    "ssrld      %[ftmp6],   %[ftmp6],       %[mask]             \n\t"
+    "pshufh     %[filter_y0], %[ftmp6],     %[ftmp0]            \n\t"
+    "ssrld      %[ftmp6],   %[ftmp6],       %[mask]             \n\t"
+    "pshufh     %[filter_y1], %[ftmp6],     %[ftmp0]            \n\t"
     "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
     MMI_LI(%[tmp0], 0x07)
     MMI_MTC1(%[tmp0], %[ftmp6])
-    "pshufh     %[filter_x0], %[filter_x0], %[ftmp0]            \n\t"
-    "pshufh     %[filter_x1], %[filter_x1], %[ftmp0]            \n\t"
-    "pshufh     %[filter_y0], %[filter_y0], %[ftmp0]            \n\t"
-    "pshufh     %[filter_y1], %[filter_y1], %[ftmp0]            \n\t"
+    MMI_LI(%[tmp0], 0x0040004000400040)
+    MMI_MTC1(%[tmp0], %[ff_ph_40])
+    MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
+    MMI_MTC1(%[tmp0], %[mask])
     // fdata3: fdata3[0] ~ fdata3[3]
     VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
 
@@ -1232,15 +1299,14 @@ static inline void var_filter_block2d_bil_4x(const uint8_t *src_ptr,
     : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
       [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
       [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr),
-      [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter)
-    : [filter_x0] "f"((uint64_t)filter_x[0]),
-      [filter_x1] "f"((uint64_t)filter_x[1]),
-      [filter_y0] "f"((uint64_t)filter_y[0]),
-      [filter_y1] "f"((uint64_t)filter_y[1]),
-      [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40),
-      [mask] "f"(mask)
+      [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter),
+      [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
+      [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
+      [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
+    : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
     : "memory"
   );
+  /* clang-format on */
 }
 
 #define SUBPIX_VAR4XN(H)                                                      \