From: Parag Salasakar
Date: Thu, 30 Jul 2015 02:44:42 +0000 (+0530)
Subject: mips msa vp8 fdct optimization
X-Git-Tag: v1.5.0~370
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=0c2a14f9e24fda448161bbaf13878b202ea57f1f;p=libvpx

mips msa vp8 fdct optimization

average improvement ~2x-4x

Change-Id: Id0bc600440f7ef53348f585ebadb1ac6869e9a00
---

diff --git a/vp8/common/mips/msa/vp8_macros_msa.h b/vp8/common/mips/msa/vp8_macros_msa.h
index 28058590c..b533cc696 100644
--- a/vp8/common/mips/msa/vp8_macros_msa.h
+++ b/vp8/common/mips/msa/vp8_macros_msa.h
@@ -629,6 +629,31 @@
 }
 #define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)

+/* Description : Dot product & addition of halfword vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Signed halfword elements from 'mult0' are multiplied with
+                 signed halfword elements from 'cnst0', producing signed
+                 word products (twice the input width). Products of
+                 adjacent odd-even elements are summed and accumulated
+                 into 'out0'; 'mult1' and 'cnst1' accumulate into 'out1'
+*/
+#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+{ \
+    out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \
+    out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \
+}
+#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
+
+#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3, \
+                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \
+{ \
+    DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); /* pairs 0, 1 */ \
+    DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); /* pairs 2, 3 */ \
+}
+#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
+
 /* Description : Clips all signed halfword elements of input vector
                  between 0 & 255
    Arguments   : Input  - in
@@ -783,6 +808,7 @@
     out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \
 }
 #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
+#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)

 /* Description : Interleave even word elements from vectors
    Arguments   : Inputs  - in0, in1, in2, in3
@@ -1035,6 +1061,24 @@
 #define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
 #define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)

+/* Description : Indexed word element values are replicated to all
+                 elements in output vector
+   Arguments   : Inputs  - in, stidx
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : 'stidx' element value from 'in' vector is replicated to all
+                 elements in 'out0' vector
+                 'stidx + 1' element value from 'in' vector is replicated to all
+                 elements in 'out1' vector
+                 Valid index range for word operation is 0-3, so 'stidx' is 0-2
+*/
+#define SPLATI_W2(RTYPE, in, stidx, out0, out1) \
+{ \
+    out0 = (RTYPE)__msa_splati_w((v4i32)in, stidx); \
+    out1 = (RTYPE)__msa_splati_w((v4i32)in, (stidx+1)); \
+}
+#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)
+
 /* Description : Pack even byte elements of vector pairs
    Arguments   : Inputs  - in0, in1, in2, in3
                  Outputs - out0, out1
@@ -1160,6 +1204,21 @@
 }
 #define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)

+/* Description : Shift left all elements of vector (generic for all data types)
+   Arguments   : Inputs  - in0, in1, in2, in3, shift
+                 Outputs - in place operation
+                 Return Type - as per input vector RTYPE
+   Details     : Each element of vectors 'in0' to 'in3' is left shifted by
+                 'shift' and the result is written in-place.
+*/
+#define SLLI_4V(in0, in1, in2, in3, shift) \
+{ \
+    in0 = in0 << shift; \
+    in1 = in1 << shift; \
+    in2 = in2 << shift; \
+    in3 = in3 << shift; \
+}
+
 /* Description : Arithmetic shift right all elements of vector
                  (generic for all data types)
    Arguments   : Inputs  - in0, in1, in2, in3, shift
@@ -1250,6 +1309,22 @@
     ADD2(in4, in5, in6, in7, out2, out3); \
 }

+/* Description : Sign extend halfword elements from right half of the vector
+   Arguments   : Input  - in    (halfword vector)
+                 Output - out   (sign extended word vector)
+                 Return Type - signed word
+   Details     : Sign bit of halfword elements from input vector 'in' is
+                 extracted and interleaved with same vector 'in' to generate
+                 4 word elements keeping sign intact
+*/
+#define UNPCK_R_SH_SW(in, out) \
+{ \
+    v8i16 sign_m; \
+    \
+    sign_m = __msa_clti_s_h((v8i16)in, 0); /* per-element sign test of 'in' */ \
+    out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); /* pair sign with value */ \
+}
+
 /* Description : Zero extend unsigned byte elements to halfword elements
    Arguments   : Input   - in          (unsigned byte vector)
                  Outputs - out0, out1  (unsigned halfword vectors)
@@ -1401,6 +1476,21 @@
     out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
 }

+/* Description : Transpose 4x4 block with half word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+                 Return Type - signed halfword
+*/
+#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
+{ \
+    v8i16 s0_m, s1_m; \
+    \
+    ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); /* pairs in0/in1, in2/in3 */ \
+    ILVRL_W2_SH(s1_m, s0_m, out0, out2); \
+    out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); /* from out0 only */ \
+    out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); /* NOTE(review): first operand is out0 - confirm */ \
+}
+
 /* Description : Transpose 8x4 block with half word elements in vectors
    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index 8ba2a5a19..ad7429a54 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -271,15 +271,15 @@ if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
 # Forward DCT
 #
 add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct4x4 mmx sse2 media neon/;
+specialize qw/vp8_short_fdct4x4 mmx sse2 media neon msa/;
 $vp8_short_fdct4x4_media=vp8_short_fdct4x4_armv6;

 add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct8x4 mmx sse2 media neon/;
+specialize qw/vp8_short_fdct8x4 mmx sse2 media neon msa/;
 $vp8_short_fdct8x4_media=vp8_short_fdct8x4_armv6;

 add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_walsh4x4 sse2 media neon/;
+specialize qw/vp8_short_walsh4x4 sse2 media neon msa/;
 $vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6;

 #
diff --git a/vp8/encoder/mips/msa/dct_msa.c b/vp8/encoder/mips/msa/dct_msa.c
new file mode 100644
index 000000000..be61ffa0d
--- /dev/null
+++ b/vp8/encoder/mips/msa/dct_msa.c
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
+#define TRANSPOSE4x4_H(in0, in1, in2, in3, out0, out1, out2, out3) \
+{ /* transpose helper; in this file only vp8_short_fdct8x4_msa uses it */ \
+    v8i16 s0_m, s1_m, tp0_m, tp1_m, tp2_m, tp3_m; \
+    \
+    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
+    ILVRL_H2_SH(s1_m, s0_m, tp0_m, tp1_m); \
+    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); /* s0_m, s1_m reused */ \
+    ILVRL_H2_SH(s1_m, s0_m, tp2_m, tp3_m); \
+    PCKEV_D2_SH(tp2_m, tp0_m, tp3_m, tp1_m, out0, out2); \
+    PCKOD_D2_SH(tp2_m, tp0_m, tp3_m, tp1_m, out1, out3); \
+}
+
+#define SET_DOTP_VALUES(coeff, val0, val1, val2, const1, const2) \
+{ /* const1, const2: interleaved splats of coeff[val0], [val1], [val2] */ \
+    v8i16 tmp0_m; \
+    \
+    SPLATI_H3_SH(coeff, val0, val1, val2, tmp0_m, const1, const2); \
+    ILVEV_H2_SH(tmp0_m, const1, const2, tmp0_m, const1, const2); \
+}
+
+#define RET_1_IF_NZERO_H(in0) \
+({ /* statement expression: its value is the final tmp0_m */ \
+    v8i16 tmp0_m; \
+    v8i16 one_m = __msa_ldi_h(1); \
+    \
+    tmp0_m = __msa_ceqi_h(in0, 0); /* compare each halfword with 0 */ \
+    tmp0_m = tmp0_m ^ 255; /* toggle the low 8 bits of each element */ \
+    tmp0_m = one_m & tmp0_m; /* keep bit 0 only */ \
+    \
+    tmp0_m; \
+})
+
+#define RET_1_IF_NZERO_W(in0) \
+({ /* word-element counterpart of RET_1_IF_NZERO_H */ \
+    v4i32 tmp0_m; \
+    v4i32 one_m = __msa_ldi_w(1); \
+    \
+    tmp0_m = __msa_ceqi_w(in0, 0); /* compare each word with 0 */ \
+    tmp0_m = tmp0_m ^ 255; /* toggle the low 8 bits of each element */ \
+    tmp0_m = one_m & tmp0_m; /* keep bit 0 only */ \
+    \
+    tmp0_m; \
+})
+
+#define RET_1_IF_NEG_W(in0) \
+({ /* statement expression: its value is the final tmp0_m */ \
+    v4i32 tmp0_m; \
+    \
+    v4i32 one_m = __msa_ldi_w(1); \
+    tmp0_m = __msa_clti_s_w(in0, 0); /* signed compare of each word with 0 */ \
+    tmp0_m = one_m & tmp0_m; /* keep bit 0 only */ \
+    \
+    tmp0_m; \
+})
+
+void vp8_short_fdct4x4_msa(int16_t *input, int16_t *output, int32_t pitch)
+{
+    v8i16 in0, in1, in2, in3;
+    v8i16 temp0, temp1;
+    v8i16 const0, const1;
+    v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
+    v4i32 out0, out1, out2, out3;
+    v8i16 zero = { 0 };
+
+    LD_SH4(input, pitch / 2, in0, in1, in2, in3); /* pitch likely bytes; confirm */
+    TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+
+    BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); /* stage 1 */
+    SLLI_4V(temp0, temp1, in1, in3, 3); /* all four terms << 3 */
+    in0 = temp0 + temp1;
+    in2 = temp0 - temp1;
+    SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1); /* from coeff[0..2] */
+    temp0 = __msa_ilvr_h(in3, in1);
+    in1 = __msa_splati_h(coeff, 3); /* coeff element 3 (14500) */
+    out0 = (v4i32)__msa_ilvev_h(zero, in1); /* accumulator seed for out0 */
+    coeff = __msa_ilvl_h(zero, coeff); /* presumably coeff[4..7] as words */
+    out1 = __msa_splati_w((v4i32)coeff, 0); /* accumulator seed for out1 */
+    DPADD_SH2_SW(temp0, temp0, const0, const1, out0, out1);
+    out0 >>= 12;
+    out1 >>= 12;
+    PCKEV_H2_SH(out0, out0, out1, out1, in1, in3);
+    TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+
+    BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); /* stage 2 */
+    in0 = temp0 + temp1 + 7; /* bias 7 before the >> 4 below */
+    in2 = temp0 - temp1 + 7;
+    in0 >>= 4;
+    in2 >>= 4;
+    ILVR_H2_SW(zero, in0, zero, in2, out0, out2);
+    temp1 = RET_1_IF_NZERO_H(in3); /* flag derived from in3 */
+    ILVR_H2_SH(zero, temp1, in3, in1, temp1, temp0);
+    SPLATI_W2_SW(coeff, 2, out3, out1); /* word elements 2 and 3 of coeff */
+    out3 += out1; /* out3 seed = element 2 + element 3 */
+    out1 = __msa_splati_w((v4i32)coeff, 1); /* out1 seed = element 1 */
+    DPADD_SH2_SW(temp0, temp0, const0, const1, out1, out3);
+    out1 >>= 16;
+    out3 >>= 16;
+    out1 += (v4i32)temp1; /* add the flag computed from in3 above */
+    PCKEV_H2_SH(out1, out0, out3, out2, in0, in2);
+    ST_SH2(in0, in2, output, 8);
+}
+
+void vp8_short_fdct8x4_msa(int16_t *input, int16_t *output, int32_t pitch)
+{
+    v8i16 in0, in1, in2, in3;
+    v8i16 temp0, temp1, tmp0, tmp1;
+    v8i16 const0, const1, const2;
+    v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
+    v8i16 zero = { 0 };
+    v4i32 vec0_w, vec1_w, vec2_w, vec3_w;
+
+    LD_SH4(input, pitch / 2, in0, in1, in2, in3); /* pitch likely bytes; confirm */
+    TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+
+    BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); /* stage 1 */
+    SLLI_4V(temp0, temp1, in1, in3, 3); /* all four terms << 3 */
+    in0 = temp0 + temp1;
+    in2 = temp0 - temp1;
+    SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2); /* from coeff[0..2] */
+    temp0 = __msa_splati_h(coeff, 3); /* coeff element 3 (14500) */
+    vec1_w = (v4i32)__msa_ilvev_h(zero, temp0);
+    coeff = __msa_ilvl_h(zero, coeff); /* presumably coeff[4..7] as words */
+    vec3_w = __msa_splati_w((v4i32)coeff, 0);
+    ILVRL_H2_SH(in3, in1, tmp1, tmp0);
+    vec0_w = vec1_w; /* vec0_w and vec1_w start from the same seed */
+    vec2_w = vec3_w; /* vec2_w and vec3_w start from the same seed */
+    DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2,
+                 vec0_w, vec1_w, vec2_w, vec3_w);
+    SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 12); /* arithmetic >> 12 */
+    PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);
+    TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+
+    BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); /* stage 2 */
+    in0 = temp0 + temp1 + 7; /* bias 7 before the >> 4 below */
+    in2 = temp0 - temp1 + 7;
+    in0 >>= 4;
+    in2 >>= 4;
+    SPLATI_W2_SW(coeff, 2, vec3_w, vec1_w); /* word elements 2 and 3 of coeff */
+    vec3_w += vec1_w; /* vec3_w seed = element 2 + element 3 */
+    vec1_w = __msa_splati_w((v4i32)coeff, 1); /* vec1_w seed = element 1 */
+    const0 = RET_1_IF_NZERO_H(in3); /* taken before in3 is overwritten below */
+    ILVRL_H2_SH(in3, in1, tmp1, tmp0);
+    vec0_w = vec1_w;
+    vec2_w = vec3_w;
+    DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2,
+                 vec0_w, vec1_w, vec2_w, vec3_w);
+    SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 16); /* arithmetic >> 16 */
+    PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);
+    in1 += const0; /* add the flag computed from the earlier in3 */
+    PCKEV_D2_SH(in1, in0, in3, in2, temp0, temp1);
+    ST_SH2(temp0, temp1, output, 8);
+
+    PCKOD_D2_SH(in1, in0, in3, in2, in0, in2);
+    ST_SH2(in0, in2, output + 16, 8); /* stored 16 int16_t past output */
+}
+
+void vp8_short_walsh4x4_msa(int16_t *input, int16_t *output, int32_t pitch)
+{
+    v8i16 in0_h, in1_h, in2_h, in3_h;
+    v4i32 in0_w, in1_w, in2_w, in3_w, temp0, temp1, temp2, temp3;
+
+    LD_SH4(input, pitch / 2, in0_h, in1_h, in2_h, in3_h); /* pitch likely bytes */
+    TRANSPOSE4x4_SH_SH(in0_h, in1_h, in2_h, in3_h, in0_h, in1_h, in2_h, in3_h);
+
+    UNPCK_R_SH_SW(in0_h, in0_w); /* sign-extend right-half halfwords to words */
+    UNPCK_R_SH_SW(in1_h, in1_w);
+    UNPCK_R_SH_SW(in2_h, in2_w);
+    UNPCK_R_SH_SW(in3_h, in3_w);
+    BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1);
+    SLLI_4V(temp0, temp1, temp2, temp3, 2); /* all four terms << 2 */
+    BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w);
+    temp0 = RET_1_IF_NZERO_W(temp0); /* flag from the shifted temp0 */
+    in0_w += temp0;
+    TRANSPOSE4x4_SW_SW(in0_w, in1_w, in2_w, in3_w, in0_w, in1_w, in2_w, in3_w);
+
+    BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1);
+    BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w);
+    in0_w += RET_1_IF_NEG_W(in0_w); /* per-word sign adjustment */
+    in1_w += RET_1_IF_NEG_W(in1_w);
+    in2_w += RET_1_IF_NEG_W(in2_w);
+    in3_w += RET_1_IF_NEG_W(in3_w);
+    ADD4(in0_w, 3, in1_w, 3, in2_w, 3, in3_w, 3, in0_w, in1_w, in2_w, in3_w);
+    SRA_4V(in0_w, in1_w, in2_w, in3_w, 3); /* arithmetic >> 3 after the +3 */
+    PCKEV_H2_SH(in1_w, in0_w, in3_w, in2_w, in0_h, in1_h);
+    ST_SH2(in0_h, in1_h, output, 8);
+}
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index 99d40ecd8..851155619 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -103,4 +103,6 @@ ifeq ($(CONFIG_REALTIME_ONLY),yes)
 VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 endif

+VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/dct_msa.c
+
 VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))