From: Parag Salasakar Date: Fri, 31 Jul 2015 03:59:10 +0000 (+0530) Subject: mips msa vp8 block subtract optimization X-Git-Tag: v1.5.0~362 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=0e3f494b217bde5e1d47107cdfbb044e4d801cec;p=libvpx mips msa vp8 block subtract optimization average improvement ~2x-3x Change-Id: I30abf4c92cddcc9e87b7a40d4106076e1ec701c2 --- diff --git a/vp8/common/mips/msa/vp8_macros_msa.h b/vp8/common/mips/msa/vp8_macros_msa.h index 0ed94cd43..0486348be 100644 --- a/vp8/common/mips/msa/vp8_macros_msa.h +++ b/vp8/common/mips/msa/vp8_macros_msa.h @@ -643,6 +643,23 @@ } #define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__) +/* Description : Dot product of word vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed word elements from 'mult0' are multiplied with + signed word elements from 'cnst0' producing a result + twice the size of input i.e. signed double word. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ +{ \ + out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \ +} +#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__) + /* Description : Dot product & addition of byte vector elements Arguments : Inputs - mult0, mult1, cnst0, cnst1 Outputs - out0, out1 @@ -693,6 +710,23 @@ } #define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__) +/* Description : Dot product & addition of double word vector elements + Arguments : Inputs - mult0, mult1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each signed word element from 'mult0' is multiplied with itself + producing an intermediate result twice the size of it + i.e. 
signed double word + The multiplication result of adjacent odd-even elements + are added to the 'out0' vector +*/ +#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \ +{ \ + out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \ + out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \ +} +#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__) + /* Description : Clips all signed halfword elements of input vector between 0 & 255 Arguments : Input - in @@ -805,6 +839,21 @@ } #define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__) +/* Description : Horizontal subtraction of signed halfword vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each signed even halfword element from 'in0' is subtracted from + the adjacent odd signed halfword element from 'in0' (pairwise); the + word result is written to 'out0' ('in1' yields 'out1' likewise) +*/ +#define HSUB_UH2(RTYPE, in0, in1, out0, out1) \ +{ \ + out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \ + out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \ +} +#define HSUB_UH2_SW(...) 
HSUB_UH2(v4i32, __VA_ARGS__) + /* Description : Set element n input vector to GPR value Arguments : Inputs - in0, in1, in2, in3 Output - out diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 3cf4f9801..8261dd2ae 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -295,15 +295,15 @@ specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa/; # Block subtraction # add_proto qw/int vp8_block_error/, "short *coeff, short *dqcoeff"; -specialize qw/vp8_block_error mmx sse2/; +specialize qw/vp8_block_error mmx sse2 msa/; $vp8_block_error_sse2=vp8_block_error_xmm; add_proto qw/int vp8_mbblock_error/, "struct macroblock *mb, int dc"; -specialize qw/vp8_mbblock_error mmx sse2/; +specialize qw/vp8_mbblock_error mmx sse2 msa/; $vp8_mbblock_error_sse2=vp8_mbblock_error_xmm; add_proto qw/int vp8_mbuverror/, "struct macroblock *mb"; -specialize qw/vp8_mbuverror mmx sse2/; +specialize qw/vp8_mbuverror mmx sse2 msa/; $vp8_mbuverror_sse2=vp8_mbuverror_xmm; # diff --git a/vp8/encoder/mips/msa/encodeopt_msa.c b/vp8/encoder/mips/msa/encodeopt_msa.c new file mode 100644 index 000000000..ea794a8a8 --- /dev/null +++ b/vp8/encoder/mips/msa/encodeopt_msa.c @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vp8_rtcd.h" +#include "vp8/common/mips/msa/vp8_macros_msa.h" +#include "vp8/encoder/block.h" + +int32_t vp8_block_error_msa(int16_t *coeff_ptr, int16_t *dq_coeff_ptr) +{ + int32_t err = 0; + uint32_t loop_cnt; + v8i16 coeff, dq_coeff, coeff0, coeff1; + v4i32 diff0, diff1; + v2i64 err0 = { 0 }; + v2i64 err1 = { 0 }; + + for (loop_cnt = 2; loop_cnt--;) + { + coeff = LD_SH(coeff_ptr); + dq_coeff = LD_SH(dq_coeff_ptr); + ILVRL_H2_SH(coeff, dq_coeff, coeff0, coeff1); + HSUB_UH2_SW(coeff0, coeff1, diff0, diff1); + DPADD_SD2_SD(diff0, diff1, err0, err1); + coeff_ptr += 8; + dq_coeff_ptr += 8; + } + + err0 += __msa_splati_d(err0, 1); + err1 += __msa_splati_d(err1, 1); + err = __msa_copy_s_d(err0, 0); + err += __msa_copy_s_d(err1, 0); + + return err; +} + +int32_t vp8_mbblock_error_msa(MACROBLOCK *mb, int32_t dc) +{ + BLOCK *be; + BLOCKD *bd; + int16_t *coeff_ptr, *dq_coeff_ptr; + int32_t err = 0; + uint32_t loop_cnt; + v8i16 coeff, coeff0, coeff1, coeff2, coeff3, coeff4; + v8i16 dq_coeff, dq_coeff2, dq_coeff3, dq_coeff4; + v4i32 diff0, diff1; + v2i64 err0, err1; + v16u8 zero = { 0 }; + v16u8 mask0 = (v16u8)__msa_ldi_b(255); + + if (1 == dc) + { + mask0 = (v16u8)__msa_insve_w((v4i32)mask0, 0, (v4i32)zero); + } + + for (loop_cnt = 0; loop_cnt < 8; loop_cnt++) + { + be = &mb->block[2 * loop_cnt]; + bd = &mb->e_mbd.block[2 * loop_cnt]; + coeff_ptr = be->coeff; + dq_coeff_ptr = bd->dqcoeff; + coeff = LD_SH(coeff_ptr); + dq_coeff = LD_SH(dq_coeff_ptr); + coeff_ptr += 8; + dq_coeff_ptr += 8; + coeff2 = LD_SH(coeff_ptr); + dq_coeff2 = LD_SH(dq_coeff_ptr); + be = &mb->block[2 * loop_cnt + 1]; + bd = &mb->e_mbd.block[2 * loop_cnt + 1]; + coeff_ptr = be->coeff; + dq_coeff_ptr = bd->dqcoeff; + coeff3 = LD_SH(coeff_ptr); + dq_coeff3 = LD_SH(dq_coeff_ptr); + coeff_ptr += 8; + dq_coeff_ptr += 8; + coeff4 = LD_SH(coeff_ptr); + dq_coeff4 = LD_SH(dq_coeff_ptr); + ILVRL_H2_SH(coeff, dq_coeff, coeff0, coeff1); + HSUB_UH2_SW(coeff0, coeff1, diff0, diff1); + diff0 = 
(v4i32)__msa_bmnz_v(zero, (v16u8)diff0, mask0); + DOTP_SW2_SD(diff0, diff1, diff0, diff1, err0, err1); + ILVRL_H2_SH(coeff2, dq_coeff2, coeff0, coeff1); + HSUB_UH2_SW(coeff0, coeff1, diff0, diff1); + DPADD_SD2_SD(diff0, diff1, err0, err1); + err0 += __msa_splati_d(err0, 1); + err1 += __msa_splati_d(err1, 1); + err += __msa_copy_s_d(err0, 0); + err += __msa_copy_s_d(err1, 0); + + ILVRL_H2_SH(coeff3, dq_coeff3, coeff0, coeff1); + HSUB_UH2_SW(coeff0, coeff1, diff0, diff1); + diff0 = (v4i32)__msa_bmnz_v(zero, (v16u8)diff0, mask0); + DOTP_SW2_SD(diff0, diff1, diff0, diff1, err0, err1); + ILVRL_H2_SH(coeff4, dq_coeff4, coeff0, coeff1); + HSUB_UH2_SW(coeff0, coeff1, diff0, diff1); + DPADD_SD2_SD(diff0, diff1, err0, err1); + err0 += __msa_splati_d(err0, 1); + err1 += __msa_splati_d(err1, 1); + err += __msa_copy_s_d(err0, 0); + err += __msa_copy_s_d(err1, 0); + } + + return err; +} + +int32_t vp8_mbuverror_msa(MACROBLOCK *mb) +{ + BLOCK *be; + BLOCKD *bd; + int16_t *coeff_ptr, *dq_coeff_ptr; + int32_t err = 0; + uint32_t loop_cnt; + v8i16 coeff, coeff0, coeff1, coeff2, coeff3, coeff4; + v8i16 dq_coeff, dq_coeff2, dq_coeff3, dq_coeff4; + v4i32 diff0, diff1; + v2i64 err0, err1, err_dup0, err_dup1; + + for (loop_cnt = 16; loop_cnt < 24; loop_cnt += 2) + { + be = &mb->block[loop_cnt]; + bd = &mb->e_mbd.block[loop_cnt]; + coeff_ptr = be->coeff; + dq_coeff_ptr = bd->dqcoeff; + coeff = LD_SH(coeff_ptr); + dq_coeff = LD_SH(dq_coeff_ptr); + coeff_ptr += 8; + dq_coeff_ptr += 8; + coeff2 = LD_SH(coeff_ptr); + dq_coeff2 = LD_SH(dq_coeff_ptr); + be = &mb->block[loop_cnt + 1]; + bd = &mb->e_mbd.block[loop_cnt + 1]; + coeff_ptr = be->coeff; + dq_coeff_ptr = bd->dqcoeff; + coeff3 = LD_SH(coeff_ptr); + dq_coeff3 = LD_SH(dq_coeff_ptr); + coeff_ptr += 8; + dq_coeff_ptr += 8; + coeff4 = LD_SH(coeff_ptr); + dq_coeff4 = LD_SH(dq_coeff_ptr); + + ILVRL_H2_SH(coeff, dq_coeff, coeff0, coeff1); + HSUB_UH2_SW(coeff0, coeff1, diff0, diff1); + DOTP_SW2_SD(diff0, diff1, diff0, diff1, err0, err1); + + 
ILVRL_H2_SH(coeff2, dq_coeff2, coeff0, coeff1); + HSUB_UH2_SW(coeff0, coeff1, diff0, diff1); + DPADD_SD2_SD(diff0, diff1, err0, err1); + err_dup0 = __msa_splati_d(err0, 1); + err_dup1 = __msa_splati_d(err1, 1); + ADD2(err0, err_dup0, err1, err_dup1, err0, err1); + err += __msa_copy_s_d(err0, 0); + err += __msa_copy_s_d(err1, 0); + + ILVRL_H2_SH(coeff3, dq_coeff3, coeff0, coeff1); + HSUB_UH2_SW(coeff0, coeff1, diff0, diff1); + DOTP_SW2_SD(diff0, diff1, diff0, diff1, err0, err1); + ILVRL_H2_SH(coeff4, dq_coeff4, coeff0, coeff1); + HSUB_UH2_SW(coeff0, coeff1, diff0, diff1); + DPADD_SD2_SD(diff0, diff1, err0, err1); + err_dup0 = __msa_splati_d(err0, 1); + err_dup1 = __msa_splati_d(err1, 1); + ADD2(err0, err_dup0, err1, err_dup1, err0, err1); + err += __msa_copy_s_d(err0, 0); + err += __msa_copy_s_d(err1, 0); + } + + return err; +} diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index 4be902b29..25d3d9ff4 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -104,6 +104,7 @@ VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm endif VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/dct_msa.c +VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/encodeopt_msa.c VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/quantize_msa.c VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))