From 59e065b6edfa4f62edf23b7c0365b99f5ef86b5e Mon Sep 17 00:00:00 2001
From: Shiyou Yin
Date: Tue, 22 Aug 2017 08:44:36 +0800
Subject: [PATCH] vpx_dsp: loongson optimize vpx_mseWxH_c (cases 16x16, 16x8,
 8x16, 8x8) with MMI.

Change-Id: I2c782d18d9004414ba61b77238e0caf3e022d8f2
---
 test/variance_test.cc        |   8 ++
 vpx_dsp/mips/variance_mmi.c  | 146 +++++++++++++++++++++++++++++++++++
 vpx_dsp/vpx_dsp.mk           |   2 +
 vpx_dsp/vpx_dsp_rtcd_defs.pl |   8 +-
 4 files changed, 160 insertions(+), 4 deletions(-)
 create mode 100644 vpx_dsp/mips/variance_mmi.c

diff --git a/test/variance_test.cc b/test/variance_test.cc
index 8765c48f2..85b9a6934 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -1540,4 +1540,12 @@ INSTANTIATE_TEST_CASE_P(VSX, VpxSseTest,
                         ::testing::Values(SseParams(2, 2,
                                                     &vpx_get4x4sse_cs_vsx)));
 #endif  // HAVE_VSX
+
+#if HAVE_MMI
+INSTANTIATE_TEST_CASE_P(MMI, VpxMseTest,
+                        ::testing::Values(MseParams(4, 4, &vpx_mse16x16_mmi),
+                                          MseParams(4, 3, &vpx_mse16x8_mmi),
+                                          MseParams(3, 4, &vpx_mse8x16_mmi),
+                                          MseParams(3, 3, &vpx_mse8x8_mmi)));
+#endif  // HAVE_MMI
 }  // namespace

diff --git a/vpx_dsp/mips/variance_mmi.c b/vpx_dsp/mips/variance_mmi.c
new file mode 100644
index 000000000..ab22cc66e
--- /dev/null
+++ b/vpx_dsp/mips/variance_mmi.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/asmdefs_mmi.h"
+
+#define VARIANCE_SSE_8                              \
+  "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t"               \
+  "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t"               \
+  "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t"               \
+  "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t"               \
+  "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t"       \
+  "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t"     \
+  "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t"     \
+  "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t"       \
+  "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t"       \
+  "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t"         \
+  "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
+
+#define VARIANCE_SSE_16                             \
+  VARIANCE_SSE_8                                    \
+  "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t"               \
+  "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t"               \
+  "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t"               \
+  "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t"               \
+  "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t"       \
+  "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t"     \
+  "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t"     \
+  "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t"       \
+  "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t"       \
+  "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t"         \
+  "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
+
+static inline uint32_t vpx_mse16x(const uint8_t *a, int a_stride,
+                                  const uint8_t *b, int b_stride,
+                                  uint32_t *sse, uint64_t high) {
+  double ftmp[12];
+  uint32_t tmp[1];
+
+  *sse = 0;
+
+  __asm__ volatile (
+    "li %[tmp0], 0x20 \n\t"
+    "mtc1 %[tmp0], %[ftmp11] \n\t"
+    MMI_L(%[tmp0], %[high], 0x00)
+    "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+    "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+    "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
+
+    "1: \n\t"
+    VARIANCE_SSE_16
+
+    "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+    MMI_ADDU(%[a], %[a], %[a_stride])
+    MMI_ADDU(%[b], %[b], %[b_stride])
+    "bnez %[tmp0], 1b \n\t"
+
+    "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
+    "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
+    "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
+    : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+      [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+      [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+      [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+      [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+      [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+      [tmp0]"=&r"(tmp[0]),
+      [a]"+&r"(a), [b]"+&r"(b)
+    : [a_stride]"r"((mips_reg)a_stride), [b_stride]"r"((mips_reg)b_stride),
+      [high]"r"(&high), [sse]"r"(sse)
+    : "memory"
+  );
+
+  return *sse;
+}
+
+#define vpx_mse16xN(n)                                           \
+  uint32_t vpx_mse16x##n##_mmi(const uint8_t *a, int a_stride,   \
+                               const uint8_t *b, int b_stride,   \
+                               uint32_t *sse) {                  \
+    return vpx_mse16x(a, a_stride, b, b_stride, sse, n);         \
+  }
+
+vpx_mse16xN(16);
+vpx_mse16xN(8);
+
+static inline uint32_t vpx_mse8x(const uint8_t *a, int a_stride,
+                                 const uint8_t *b, int b_stride,
+                                 uint32_t *sse, uint64_t high) {
+  double ftmp[12];
+  uint32_t tmp[1];
+
+  *sse = 0;
+
+  __asm__ volatile (
+    "li %[tmp0], 0x20 \n\t"
+    "mtc1 %[tmp0], %[ftmp11] \n\t"
+    MMI_L(%[tmp0], %[high], 0x00)
+    "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+    "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+    "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
+
+    "1: \n\t"
+    VARIANCE_SSE_8
+
+    "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+    MMI_ADDU(%[a], %[a], %[a_stride])
+    MMI_ADDU(%[b], %[b], %[b_stride])
+    "bnez %[tmp0], 1b \n\t"
+
+    "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
+    "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
+    "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
+    : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+      [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+      [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+      [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+      [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+      [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+      [tmp0]"=&r"(tmp[0]),
+      [a]"+&r"(a), [b]"+&r"(b)
+    : [a_stride]"r"((mips_reg)a_stride), [b_stride]"r"((mips_reg)b_stride),
+      [high]"r"(&high), [sse]"r"(sse)
+    : "memory"
+  );
+
+  return *sse;
+}
+
+#define vpx_mse8xN(n)                                           \
+  uint32_t vpx_mse8x##n##_mmi(const uint8_t *a, int a_stride,   \
+                              const uint8_t *b, int b_stride,   \
+                              uint32_t *sse) {                  \
+    return vpx_mse8x(a, a_stride, b, b_stride, sse, n);         \
+  }
+
+vpx_mse8xN(16);
+vpx_mse8xN(8);

diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 0c3720e74..89db8f647 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -351,6 +351,8 @@ DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c
 DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c
 DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c
 
+DSP_SRCS-$(HAVE_MMI) += mips/variance_mmi.c
+
 DSP_SRCS-$(HAVE_SSE) += x86/variance_sse2.c
 DSP_SRCS-$(HAVE_SSE2) += x86/avg_pred_sse2.c
 DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c  # Contains SSE2 and SSSE3

diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 67004764f..d6de0e3a4 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1101,16 +1101,16 @@ add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, co
   specialize qw/vpx_get8x8var sse2 neon msa/;
 
 add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse16x16 sse2 avx2 neon msa/;
+  specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi/;
 
 add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse16x8 sse2 msa/;
+  specialize qw/vpx_mse16x8 sse2 msa mmi/;
 
 add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse8x16 sse2 msa/;
+  specialize qw/vpx_mse8x16 sse2 msa mmi/;
 
 add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse8x8 sse2 msa/;
+  specialize qw/vpx_mse8x8 sse2 msa mmi/;
 
 add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
   specialize qw/vpx_get_mb_ss sse2 msa vsx/;
-- 
2.40.0
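
Reviewer note (not part of the patch): the MMI kernels above compute the same
quantity as the generic C path. pasubub takes per-byte absolute differences,
punpcklbh/punpckhbh widen them to 16 bits against the zeroed register ftmp0,
pmaddhw squares and pairwise-adds them into two packed 32-bit accumulators in
ftmp8, and the final dsrl/paddw pair (shifting by the 0x20 loaded into ftmp11)
folds the high 32-bit half onto the low one before swc1 stores the result to
*sse. For orientation, here is a minimal scalar sketch of that computation;
mse_wxh_ref is a hypothetical name used only for illustration (the in-tree
reference is the vpx_mseWxH_c family in vpx_dsp/variance.c):

  #include <stdint.h>

  /* Minimal scalar sketch: sum of squared differences (SSE) over a w x h
   * block. mse_wxh_ref is a hypothetical helper, not part of this patch. */
  static uint32_t mse_wxh_ref(const uint8_t *a, int a_stride,
                              const uint8_t *b, int b_stride,
                              int w, int h, uint32_t *sse) {
    uint32_t total = 0;
    int i, j;
    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; ++j) {
        const int diff = a[j] - b[j];      /* per-pixel difference */
        total += (uint32_t)(diff * diff);  /* accumulate squared error */
      }
      a += a_stride;  /* advance both pointers one row */
      b += b_stride;
    }
    *sse = total;
    /* Like vpx_mse16x16_mmi and friends, return the value written to *sse. */
    return total;
  }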