From 59e065b6edfa4f62edf23b7c0365b99f5ef86b5e Mon Sep 17 00:00:00 2001
From: Shiyou Yin
Date: Tue, 22 Aug 2017 08:44:36 +0800
Subject: [PATCH] vpx_dsp: loongson optimize vpx_mseWxH_c (cases 16x16, 16x8,
 8x16, 8x8) with MMI.

Change-Id: I2c782d18d9004414ba61b77238e0caf3e022d8f2
---
 test/variance_test.cc        |   8 ++
 vpx_dsp/mips/variance_mmi.c  | 146 +++++++++++++++++++++++++++++++++++
 vpx_dsp/vpx_dsp.mk           |   2 +
 vpx_dsp/vpx_dsp_rtcd_defs.pl |   8 +-
 4 files changed, 160 insertions(+), 4 deletions(-)
 create mode 100644 vpx_dsp/mips/variance_mmi.c

diff --git a/test/variance_test.cc b/test/variance_test.cc
index 8765c48f2..85b9a6934 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -1540,4 +1540,12 @@ INSTANTIATE_TEST_CASE_P(VSX, VpxSseTest,
                         ::testing::Values(SseParams(2, 2,
                                                     &vpx_get4x4sse_cs_vsx)));
 #endif  // HAVE_VSX
+
+#if HAVE_MMI
+INSTANTIATE_TEST_CASE_P(MMI, VpxMseTest,
+                        ::testing::Values(MseParams(4, 4, &vpx_mse16x16_mmi),
+                                          MseParams(4, 3, &vpx_mse16x8_mmi),
+                                          MseParams(3, 4, &vpx_mse8x16_mmi),
+                                          MseParams(3, 3, &vpx_mse8x8_mmi)));
+#endif  // HAVE_MMI
 }  // namespace

diff --git a/vpx_dsp/mips/variance_mmi.c b/vpx_dsp/mips/variance_mmi.c
new file mode 100644
index 000000000..ab22cc66e
--- /dev/null
+++ b/vpx_dsp/mips/variance_mmi.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/asmdefs_mmi.h"
+
+#define VARIANCE_SSE_8                              \
+  "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t"               \
+  "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t"               \
+  "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t"               \
+  "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t"               \
+  "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t"       \
+  "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t"     \
+  "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t"     \
+  "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t"       \
+  "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t"       \
+  "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t"         \
+  "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
+
+#define VARIANCE_SSE_16                             \
+  VARIANCE_SSE_8                                    \
+  "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t"               \
+  "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t"               \
+  "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t"               \
+  "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t"               \
+  "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t"       \
+  "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t"     \
+  "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t"     \
+  "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t"       \
+  "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t"       \
+  "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t"         \
+  "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
+
+static inline uint32_t vpx_mse16x(const uint8_t *a, int a_stride,
+                                  const uint8_t *b, int b_stride,
+                                  uint32_t *sse, uint64_t high) {
+  double ftmp[12];
+  uint32_t tmp[1];
+
+  *sse = 0;
+
+  __asm__ volatile (
+    "li %[tmp0], 0x20 \n\t"
+    "mtc1 %[tmp0], %[ftmp11] \n\t"
+    MMI_L(%[tmp0], %[high], 0x00)
+    "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+    "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+    "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
+
+    "1: \n\t"
+    VARIANCE_SSE_16
+
+    "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+    MMI_ADDU(%[a], %[a], %[a_stride])
+    MMI_ADDU(%[b], %[b], %[b_stride])
+    "bnez %[tmp0], 1b \n\t"
+
+    "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
+    "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
+    "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
+    : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+      [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+      [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+      [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+      [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+      [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+      [tmp0]"=&r"(tmp[0]),
+      [a]"+&r"(a), [b]"+&r"(b)
+    : [a_stride]"r"((mips_reg)a_stride), [b_stride]"r"((mips_reg)b_stride),
+      [high]"r"(&high), [sse]"r"(sse)
+    : "memory"
+  );
+
+  return *sse;
+}
+
+#define vpx_mse16xN(n)                                           \
+  uint32_t vpx_mse16x##n##_mmi(const uint8_t *a, int a_stride,   \
+                               const uint8_t *b, int b_stride,   \
+                               uint32_t *sse) {                  \
+    return vpx_mse16x(a, a_stride, b, b_stride, sse, n);         \
+  }
+
+vpx_mse16xN(16);
+vpx_mse16xN(8);
+
+static inline uint32_t vpx_mse8x(const uint8_t *a, int a_stride,
+                                 const uint8_t *b, int b_stride,
+                                 uint32_t *sse, uint64_t high) {
+  double ftmp[12];
+  uint32_t tmp[1];
+
+  *sse = 0;
+
+  __asm__ volatile (
+    "li %[tmp0], 0x20 \n\t"
+    "mtc1 %[tmp0], %[ftmp11] \n\t"
+    MMI_L(%[tmp0], %[high], 0x00)
+    "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+    "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
+    "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
+
+    "1: \n\t"
+    VARIANCE_SSE_8
+
+    "addiu %[tmp0], %[tmp0], -0x01 \n\t"
+    MMI_ADDU(%[a], %[a], %[a_stride])
+    MMI_ADDU(%[b], %[b], %[b_stride])
+    "bnez %[tmp0], 1b \n\t"
+
+    "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
+    "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
+    "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
+    : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+      [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+      [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+      [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+      [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+      [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+      [tmp0]"=&r"(tmp[0]),
+      [a]"+&r"(a), [b]"+&r"(b)
+    : [a_stride]"r"((mips_reg)a_stride), [b_stride]"r"((mips_reg)b_stride),
+      [high]"r"(&high), [sse]"r"(sse)
+    : "memory"
+  );
+
+  return *sse;
+}
+
+#define vpx_mse8xN(n)                                           \
+  uint32_t vpx_mse8x##n##_mmi(const uint8_t *a, int a_stride,   \
+                              const uint8_t *b, int b_stride,   \
+                              uint32_t *sse) {                  \
+    return vpx_mse8x(a, a_stride, b, b_stride, sse, n);         \
+  }
+
+vpx_mse8xN(16);
+vpx_mse8xN(8);

diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 0c3720e74..89db8f647 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -351,6 +351,8 @@ DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c
 DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c
 DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c
 
+DSP_SRCS-$(HAVE_MMI) += mips/variance_mmi.c
+
 DSP_SRCS-$(HAVE_SSE) += x86/variance_sse2.c
 DSP_SRCS-$(HAVE_SSE2) += x86/avg_pred_sse2.c
 DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c  # Contains SSE2 and SSSE3

diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 67004764f..d6de0e3a4 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1101,16 +1101,16 @@ add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, co
   specialize qw/vpx_get8x8var sse2 neon msa/;
 
 add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse16x16 sse2 avx2 neon msa/;
+  specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi/;
 
 add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse16x8 sse2 msa/;
+  specialize qw/vpx_mse16x8 sse2 msa mmi/;
 
 add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse8x16 sse2 msa/;
+  specialize qw/vpx_mse8x16 sse2 msa mmi/;
 
 add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse8x8 sse2 msa/;
+  specialize qw/vpx_mse8x8 sse2 msa mmi/;
 
 add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
   specialize qw/vpx_get_mb_ss sse2 msa vsx/;
-- 
2.40.0
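
Reviewer note (not part of the patch): the MMI kernels above compute the same
quantity as the generic C path. pasubub takes per-byte absolute differences,
punpcklbh/punpckhbh widen them to 16 bits against the zeroed register ftmp0,
pmaddhw squares and pairwise-adds them into two packed 32-bit accumulators in
ftmp8, and the final dsrl/paddw pair (shifting by the 0x20 loaded into ftmp11)
folds the high 32-bit half onto the low one before swc1 stores the result to
*sse. For orientation, here is a minimal scalar sketch of that computation;
mse_wxh_ref is a hypothetical name used only for illustration (the in-tree
reference is the vpx_mseWxH_c family in vpx_dsp/variance.c):

  #include <stdint.h>

  /* Minimal scalar sketch: sum of squared differences (SSE) over a w x h
   * block. mse_wxh_ref is a hypothetical helper, not part of this patch. */
  static uint32_t mse_wxh_ref(const uint8_t *a, int a_stride,
                              const uint8_t *b, int b_stride,
                              int w, int h, uint32_t *sse) {
    uint32_t total = 0;
    int i, j;
    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; ++j) {
        const int diff = a[j] - b[j];      /* per-pixel difference */
        total += (uint32_t)(diff * diff);  /* accumulate squared error */
      }
      a += a_stride;  /* advance both pointers one row */
      b += b_stride;
    }
    *sse = total;
    /* Like vpx_mse16x16_mmi and friends, return the value written to *sse. */
    return total;
  }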