From 00952137906fa8f9a07544cc56c88dd52c17d4f7 Mon Sep 17 00:00:00 2001 From: Shiyou Yin Date: Sun, 3 Sep 2017 00:40:37 +0800 Subject: [PATCH] vp8: [loongson] optimize sixtap predict with mmi 1. vp8_sixtap_predict16x16_mmi 2. vp8_sixtap_predict8x8_mmi 3. vp8_sixtap_predict8x4_mmi 4. vp8_sixtap_predict4x4_mmi Change-Id: I186669d1a1d998a0f3ba3a548e25eee8b52c251b --- test/predict_test.cc | 9 + vp8/common/mips/mmi/sixtap_filter_mmi.c | 398 ++++++++++++++++++++++++ vp8/common/rtcd_defs.pl | 8 +- vp8/vp8_common.mk | 3 + 4 files changed, 414 insertions(+), 4 deletions(-) create mode 100644 vp8/common/mips/mmi/sixtap_filter_mmi.c diff --git a/test/predict_test.cc b/test/predict_test.cc index a6e2b3cf3..9f366ae52 100644 --- a/test/predict_test.cc +++ b/test/predict_test.cc @@ -324,6 +324,15 @@ INSTANTIATE_TEST_CASE_P( make_tuple(4, 4, &vp8_sixtap_predict4x4_msa))); #endif +#if HAVE_MMI +INSTANTIATE_TEST_CASE_P( + MMI, SixtapPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_mmi), + make_tuple(8, 8, &vp8_sixtap_predict8x8_mmi), + make_tuple(8, 4, &vp8_sixtap_predict8x4_mmi), + make_tuple(4, 4, &vp8_sixtap_predict4x4_mmi))); +#endif + class BilinearPredictTest : public PredictTestBase {}; TEST_P(BilinearPredictTest, TestWithRandomData) { diff --git a/vp8/common/mips/mmi/sixtap_filter_mmi.c b/vp8/common/mips/mmi/sixtap_filter_mmi.c new file mode 100644 index 000000000..1b41a4296 --- /dev/null +++ b/vp8/common/mips/mmi/sixtap_filter_mmi.c @@ -0,0 +1,398 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/common/filter.h" +#include "vpx_ports/asmdefs_mmi.h" + +DECLARE_ALIGNED(8, static const int16_t, vp8_six_tap_mmi[8][6 * 8]) = { + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, + 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, + 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, + 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, + 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, + 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, + 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, + 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, + 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, + 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, + 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, + 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, + 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, + 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, + 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, + 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, + 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, + 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, + 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, + 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, + 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, + 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 } +}; + +/* Horizontal filter: pixel_step is 1, output_height and output_width are + the size of horizontal filtering output, output_height is always H + 5 */ +static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr, + uint16_t *output_ptr, + unsigned int src_pixels_per_line, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp8_filter) { + uint32_t tmp[1]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; + +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); + register double ftmp2 asm("$f6"); + register double ftmp3 asm("$f8"); + register double ftmp4 asm("$f10"); + register double ftmp5 asm("$f12"); + register double ftmp6 asm("$f14"); + register double ftmp7 asm("$f16"); + register double ftmp8 asm("$f18"); + register double ftmp9 asm("$f20"); + register double ftmp10 asm("$f22"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); + register double ftmp2 asm("$f3"); + register double ftmp3 asm("$f4"); + register double ftmp4 asm("$f5"); + register double ftmp5 asm("$f6"); + register double ftmp6 asm("$f7"); + register double ftmp7 asm("$f8"); + register double ftmp8 asm("$f9"); + register double ftmp9 asm("$f10"); + register double ftmp10 asm("$f11"); +#endif // _MIPS_SIM == _ABIO32 + + __asm__ volatile ( + "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t" + "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t" + "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t" + "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t" + "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t" + "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t" + "xor %[fzero], %[fzero], %[fzero] \n\t" + "li %[tmp0], 0x07 \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" + "li %[tmp0], 0x08 \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp9], 0x05(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp9], -0x02(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t" + "pmullh %[ftmp8], %[ftmp6], %[ftmp0] \n\t" + + "punpckhbh %[ftmp6], %[ftmp9], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "gsldlc1 %[ftmp9], 0x06(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp9], -0x01(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "punpckhbh %[ftmp6], %[ftmp9], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "dsrl %[ftmp9], %[ftmp9], %[ftmp10] \n\t" + "punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "dsrl %[ftmp9], %[ftmp9], %[ftmp10] \n\t" + "punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "paddsh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packushb %[ftmp8], %[ftmp8], %[fzero] \n\t" + "punpcklbh %[ftmp8], %[ftmp8], %[fzero] \n\t" + "gssdlc1 %[ftmp8], 0x07(%[output_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x00(%[output_ptr]) \n\t" + + "addiu %[output_height], %[output_height], -0x01 \n\t" + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), + [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4), + [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6), + [ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8), + [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10), + [tmp0]"=&r"(tmp[0]), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height) + : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line), + [vp8_filter]"r"(vp8_filter), [output_width]"r"(output_width), + [ff_ph_40]"f"(ff_ph_40) + : "memory" + ); +} + +/* Horizontal filter: pixel_step is always W */ +static INLINE void vp8_filter_block1dc_v6_mmi( + uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height, + int output_pitch, unsigned int pixels_per_line, const int16_t *vp8_filter) { + DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; + uint32_t tmp[1]; + mips_reg addr[1]; +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); + register double ftmp2 asm("$f6"); + register double ftmp3 asm("$f8"); + register double ftmp4 asm("$f10"); + register double ftmp5 asm("$f12"); + register double ftmp6 asm("$f14"); + register double ftmp7 asm("$f16"); + register double ftmp8 asm("$f18"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); + register double ftmp2 asm("$f3"); + register double ftmp3 asm("$f4"); + register double ftmp4 asm("$f5"); + register double ftmp5 asm("$f6"); + register double ftmp6 asm("$f7"); + register double ftmp7 asm("$f8"); + register double ftmp8 asm("$f9"); +#endif // _MIPS_SIM == _ABIO32 + + __asm__ volatile ( + "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t" + "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t" + "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t" + "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t" + "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t" + "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[pixels_per_line_x2]) + "xor %[fzero], %[fzero], %[fzero] \n\t" + "li %[tmp0], 0x07 \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + "pmullh %[ftmp8], %[ftmp6], %[ftmp0] \n\t" + + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line]) + "gsldlc1 %[ftmp6], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[addr0]) \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2]) + "gsldlc1 %[ftmp6], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[addr0]) \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4]) + "gsldlc1 %[ftmp6], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[addr0]) \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line]) + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2]) + "gsldlc1 %[ftmp6], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[addr0]) \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4]) + "gsldlc1 %[ftmp6], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[addr0]) \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "paddsh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packushb %[ftmp8], %[ftmp8], %[fzero] \n\t" + "gsswlc1 %[ftmp8], 0x03(%[output_ptr]) \n\t" + "gsswrc1 %[ftmp8], 0x00(%[output_ptr]) \n\t" + + MMI_ADDIU(%[output_height], %[output_height], -0x01) + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), + [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4), + [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6), + [ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8), + [tmp0]"=&r"(tmp[0]), [addr0]"=&r"(addr[0]), + [src_ptr]"+&r"(src_ptr), [output_ptr]"+&r"(output_ptr), + [output_height]"+&r"(output_height) + : [pixels_per_line]"r"((mips_reg)pixels_per_line), + [pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)), + [pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)), + [vp8_filter]"r"(vp8_filter), + [output_pitch]"r"((mips_reg)output_pitch), + [ff_ph_40]"f"(ff_ph_40) + : "memory" + ); +} + +/* When xoffset == 0, vp8_filter= {0,0,128,0,0,0}, + function vp8_filter_block1d_h6_mmi and vp8_filter_block1d_v6_mmi can + be simplified */ +static INLINE void vp8_filter_block1d_h6_filter0_mmi( + unsigned char *src_ptr, uint16_t *output_ptr, + unsigned int src_pixels_per_line, unsigned int output_height, + unsigned int output_width) { +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); +#endif // _MIPS_SIM == _ABIO32 + + __asm__ volatile ( + "xor %[fzero], %[fzero], %[fzero] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp1], %[ftmp0], %[fzero] \n\t" + "gssdlc1 %[ftmp1], 0x07(%[output_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[output_ptr]) \n\t" + + "addiu %[output_height], %[output_height], -0x01 \n\t" + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height) + : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line), + [output_width]"r"(output_width) + : "memory" + ); +} + +static INLINE void vp8_filter_block1dc_v6_filter0_mmi( + uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height, + int output_pitch, unsigned int pixels_per_line) { +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); +#endif // _MIPS_SIM == _ABIO32 + + __asm__ volatile ( + "xor %[fzero], %[fzero], %[fzero] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + "packushb %[ftmp1], %[ftmp0], %[fzero] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[output_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[output_ptr]) \n\t" + + MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line]) + MMI_ADDIU(%[output_height], %[output_height], -0x01) + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height) + : [pixels_per_line]"r"((mips_reg)pixels_per_line), + [output_pitch]"r"((mips_reg)output_pitch) + : "memory" + ); +} + +#define sixtapNxM(n, m) \ + void vp8_sixtap_predict##n##x##m##_mmi( \ + unsigned char *src_ptr, int src_pixels_per_line, int xoffset, \ + int yoffset, unsigned char *dst_ptr, int dst_pitch) { \ + DECLARE_ALIGNED(16, uint16_t, \ + FData2[(n + 5) * (n == 16 ? 24 : (n == 8 ? 16 : n))]); \ + const int16_t *HFilter, *VFilter; \ + int i, loop = n / 4; \ + HFilter = vp8_six_tap_mmi[xoffset]; \ + VFilter = vp8_six_tap_mmi[yoffset]; \ + \ + if (xoffset == 0) { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1d_h6_filter0_mmi( \ + src_ptr - (2 * src_pixels_per_line) + i * 4, FData2 + i * 4, \ + src_pixels_per_line, m + 5, n * 2); \ + } \ + } else { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1d_h6_mmi(src_ptr - (2 * src_pixels_per_line) + i * 4, \ + FData2 + i * 4, src_pixels_per_line, m + 5, \ + n * 2, HFilter); \ + } \ + } \ + if (yoffset == 0) { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1dc_v6_filter0_mmi( \ + FData2 + n * 2 + i * 4, dst_ptr + i * 4, m, dst_pitch, n * 2); \ + } \ + } else { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1dc_v6_mmi(FData2 + n * 2 + i * 4, dst_ptr + i * 4, m, \ + dst_pitch, n * 2, VFilter); \ + } \ + } \ + } + +sixtapNxM(4, 4); +sixtapNxM(8, 8); +sixtapNxM(8, 4); +sixtapNxM(16, 16); diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index bc5e05799..2d68de6a1 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -132,16 +132,16 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes") { # Subpixel # add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa/; +specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa mmi/; add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa/; +specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa mmi/; add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_sixtap_predict8x4 sse2 ssse3 neon dspr2 msa/; +specialize qw/vp8_sixtap_predict8x4 sse2 ssse3 neon dspr2 msa mmi/; add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa/; +specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa mmi/; add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; specialize qw/vp8_bilinear_predict16x16 sse2 ssse3 neon msa/; diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 137f5bb62..1522dd464 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -116,6 +116,9 @@ VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/loopfilter_filters_msa.c VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/sixtap_filter_msa.c VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h +# common (c) +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/sixtap_filter_mmi.c + ifeq ($(CONFIG_POSTPROC),yes) VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c endif -- 2.40.0