From 509fb0bc9dbdbd7380bbc5a5d318ccf51e6b62dc Mon Sep 17 00:00:00 2001 From: Parag Salasakar Date: Thu, 23 Jul 2015 10:29:40 +0530 Subject: [PATCH] mips msa vp8 copy mem optimization average improvement ~2x-4x Change-Id: I3af3ecced96c5b8e0cb811256e5089e28fe013a2 --- vp8/common/mips/msa/copymem_msa.c | 70 ++++++++++++++++++++++++++++ vp8/common/mips/msa/vp8_macros_msa.h | 26 +++++++++++ vp8/common/rtcd_defs.pl | 6 +-- vp8/vp8_common.mk | 1 + 4 files changed, 100 insertions(+), 3 deletions(-) create mode 100644 vp8/common/mips/msa/copymem_msa.c diff --git a/vp8/common/mips/msa/copymem_msa.c b/vp8/common/mips/msa/copymem_msa.c new file mode 100644 index 000000000..002a5ed91 --- /dev/null +++ b/vp8/common/mips/msa/copymem_msa.c @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/mips/msa/vp8_macros_msa.h" + +static void copy_8x4_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + uint64_t src0, src1, src2, src3; + + LD4(src, src_stride, src0, src1, src2, src3); + SD4(src0, src1, src2, src3, dst, dst_stride); +} + +static void copy_8x8_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + uint64_t src0, src1, src2, src3; + + LD4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + SD4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + + LD4(src, src_stride, src0, src1, src2, src3); + SD4(src0, src1, src2, src3, dst, dst_stride); +} + +static void copy_16x16_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 src8, src9, src10, src11, src12, src13, src14, src15; + + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + LD_UB8(src, src_stride, src8, src9, src10, src11, src12, src13, src14, + src15); + + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, dst, + dst_stride); +} + +void vp8_copy_mem16x16_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + copy_16x16_msa(src, src_stride, dst, dst_stride); +} + +void vp8_copy_mem8x8_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + copy_8x8_msa(src, src_stride, dst, dst_stride); +} + +void vp8_copy_mem8x4_msa(uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + copy_8x4_msa(src, src_stride, dst, dst_stride); +} diff --git a/vp8/common/mips/msa/vp8_macros_msa.h b/vp8/common/mips/msa/vp8_macros_msa.h index 8170ecf1d..46980900a 100644 --- a/vp8/common/mips/msa/vp8_macros_msa.h +++ b/vp8/common/mips/msa/vp8_macros_msa.h @@ -222,6 +222,23 @@ out3 = LW((psrc) + 3 * stride); \ } +/* Description : Load double words with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Details : Load double word in 'out0' from (psrc) + Load double word in 'out1' from (psrc + stride) +*/ +#define LD2(psrc, stride, out0, out1) \ +{ \ + out0 = LD((psrc)); \ + out1 = LD((psrc) + stride); \ +} +#define LD4(psrc, stride, out0, out1, out2, out3) \ +{ \ + LD2((psrc), stride, out0, out1); \ + LD2((psrc) + 2 * stride, stride, out2, out3); \ +} + /* Description : Store 4 words with stride Arguments : Inputs - in0, in1, in2, in3, pdst, stride Details : Store word from 'in0' to (pdst) @@ -298,6 +315,7 @@ LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ } #define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__) +#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) /* Description : Load vectors with 8 halfword elements with stride Arguments : Inputs - psrc, stride @@ -339,6 +357,14 @@ #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) #define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__) +#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + pdst, stride) \ +{ \ + ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ + ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ +} +#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__) + /* Description : Store vectors of 8 halfword elements with stride Arguments : Inputs - in0, in1, pdst, stride Details : Store 8 halfword elements from 'in0' to (pdst) diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 3a278f7a8..aa47c0fa9 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -138,17 +138,17 @@ $vp8_dc_only_idct_add_dspr2=vp8_dc_only_idct_add_dspr2; # RECON # add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_copy_mem16x16 mmx sse2 media neon dspr2/; +specialize qw/vp8_copy_mem16x16 mmx sse2 media neon dspr2 msa/; $vp8_copy_mem16x16_media=vp8_copy_mem16x16_v6; $vp8_copy_mem16x16_dspr2=vp8_copy_mem16x16_dspr2; add_proto qw/void vp8_copy_mem8x8/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_copy_mem8x8 mmx media neon dspr2/; +specialize qw/vp8_copy_mem8x8 mmx media neon dspr2 msa/; $vp8_copy_mem8x8_media=vp8_copy_mem8x8_v6; $vp8_copy_mem8x8_dspr2=vp8_copy_mem8x8_dspr2; add_proto qw/void vp8_copy_mem8x4/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_copy_mem8x4 mmx media neon dspr2/; +specialize qw/vp8_copy_mem8x4 mmx media neon dspr2 msa/; $vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6; $vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2; diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 8248cb70c..3e390953c 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -114,6 +114,7 @@ VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/idct_blk_dspr2.c VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/dequantize_dspr2.c # common (c) +VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/copymem_msa.c VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/idct_msa.c VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/loopfilter_filters_msa.c VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/sixtap_filter_msa.c -- 2.40.0