From 5deb9837447cee8e2239bad5844d5b904b62e773 Mon Sep 17 00:00:00 2001 From: Parag Salasakar Date: Tue, 28 Jul 2015 08:16:34 +0530 Subject: [PATCH] mips msa vp8 filter by weight optimization average improvement ~3x-5x Change-Id: Ia808ae56b118e0e1b293901447aa5a0f597b405b --- vp8/common/mips/msa/mfqe_msa.c | 146 +++++++++++++++++++++++++++ vp8/common/mips/msa/vp8_macros_msa.h | 46 +++++++++ vp8/common/rtcd_defs.pl | 4 +- vp8/vp8_common.mk | 4 + 4 files changed, 198 insertions(+), 2 deletions(-) create mode 100644 vp8/common/mips/msa/mfqe_msa.c diff --git a/vp8/common/mips/msa/mfqe_msa.c b/vp8/common/mips/msa/mfqe_msa.c new file mode 100644 index 000000000..3e7629f3a --- /dev/null +++ b/vp8/common/mips/msa/mfqe_msa.c @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/postproc.h" +#include "vp8/common/mips/msa/vp8_macros_msa.h" + +static void filter_by_weight8x8_msa(uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + int32_t src_weight) +{ + int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight; + int32_t row; + uint64_t src0_d, src1_d, dst0_d, dst1_d; + v16i8 src0 = { 0 }; + v16i8 src1 = { 0 }; + v16i8 dst0 = { 0 }; + v16i8 dst1 = { 0 }; + v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l; + + src_wt = __msa_fill_h(src_weight); + dst_wt = __msa_fill_h(dst_weight); + + for (row = 2; row--;) + { + LD2(src_ptr, src_stride, src0_d, src1_d); + src_ptr += (2 * src_stride); + LD2(dst_ptr, dst_stride, dst0_d, dst1_d); + INSERT_D2_SB(src0_d, src1_d, src0); + INSERT_D2_SB(dst0_d, dst1_d, dst0); + + LD2(src_ptr, src_stride, src0_d, src1_d); + src_ptr += (2 * src_stride); + LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d); + INSERT_D2_SB(src0_d, src1_d, src1); + INSERT_D2_SB(dst0_d, dst1_d, dst1); + + UNPCK_UB_SH(src0, src_r, src_l); + UNPCK_UB_SH(dst0, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r); + ST8x2_UB(dst0, dst_ptr, dst_stride); + dst_ptr += (2 * dst_stride); + + UNPCK_UB_SH(src1, src_r, src_l); + UNPCK_UB_SH(dst1, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r); + ST8x2_UB(dst1, dst_ptr, dst_stride); + dst_ptr += (2 * dst_stride); + } +} + +static void filter_by_weight16x16_msa(uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + int32_t src_weight) +{ + int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight; + int32_t row; + v16i8 src0, src1, src2, src3; + v16i8 dst0, dst1, dst2, dst3; + v8i16 src_wt, dst_wt; + v8i16 res_h_r, res_h_l; + v8i16 src_r, src_l, dst_r, dst_l; + + src_wt = __msa_fill_h(src_weight); + dst_wt = __msa_fill_h(dst_weight); + + for (row = 4; row--;) + { + LD_SB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_SB4(dst_ptr, dst_stride, dst0, 
dst1, dst2, dst3); + + UNPCK_UB_SH(src0, src_r, src_l); + UNPCK_UB_SH(dst0, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); + dst_ptr += dst_stride; + + UNPCK_UB_SH(src1, src_r, src_l); + UNPCK_UB_SH(dst1, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); + dst_ptr += dst_stride; + + UNPCK_UB_SH(src2, src_r, src_l); + UNPCK_UB_SH(dst2, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); + dst_ptr += dst_stride; + + UNPCK_UB_SH(src3, src_r, src_l); + UNPCK_UB_SH(dst3, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); + dst_ptr += dst_stride; + } +} + +void vp8_filter_by_weight16x16_msa(uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + int32_t src_weight) +{ + filter_by_weight16x16_msa(src_ptr, src_stride, dst_ptr, dst_stride, + src_weight); +} + +void vp8_filter_by_weight8x8_msa(uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + int32_t src_weight) +{ + filter_by_weight8x8_msa(src_ptr, src_stride, dst_ptr, dst_stride, + src_weight); +} diff --git a/vp8/common/mips/msa/vp8_macros_msa.h b/vp8/common/mips/msa/vp8_macros_msa.h index a8cbcaa0a..9e1406559 100644 --- a/vp8/common/mips/msa/vp8_macros_msa.h +++ b/vp8/common/mips/msa/vp8_macros_msa.h @@ -435,6 +435,25 @@ ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \ } +/* Description : Store 8x2 byte block to destination memory from input vector + Arguments : Inputs - in, pdst, stride + Details : Index 0 double word element from 'in' vector is copied to the + GP register and stored to (pdst) + Index 1 double word element from 'in' vector is copied to the + GP register and stored to (pdst + stride) +*/ +#define ST8x2_UB(in, pdst, stride) \ +{ \ + uint64_t out0_m, out1_m; \ + uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64)in, 0); \ + out1_m = __msa_copy_u_d((v2i64)in, 1); \ + \ + SD(out0_m, pblk_8x2_m); \ + SD(out1_m, pblk_8x2_m + stride); \ +} + /* Description : Store 8x4 byte block to destination memory from input vectors Arguments : Inputs - in0, in1, pdst, stride @@ -623,6 +642,19 @@ out_m; \ }) +/* Description : Set element n input vector to GPR value + Arguments : Inputs - in0, in1, in2, in3 + Output - out + Return Type - as per RTYPE + Details : Set element 0 in vector 'out' to value specified in 'in0' +*/ +#define INSERT_D2(RTYPE, in0, in1, out) \ +{ \ + out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \ + out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \ +} +#define INSERT_D2_SB(...) 
INSERT_D2(v16i8, __VA_ARGS__) + /* Description : Interleave even byte elements from vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 @@ -1116,6 +1148,20 @@ ADD2(in4, in5, in6, in7, out2, out3); \ } +/* Description : Zero extend unsigned byte elements to halfword elements + Arguments : Input - in (unsigned byte vector) + Outputs - out0, out1 (unsigned halfword vectors) + Return Type - signed halfword + Details : Zero extended right half of vector is returned in 'out0' + Zero extended left half of vector is returned in 'out1' +*/ +#define UNPCK_UB_SH(in, out0, out1) \ +{ \ + v16i8 zero_m = { 0 }; \ + \ + ILVRL_B2_SH(zero_m, in, out0, out1); \ +} + /* Description : Sign extend halfword elements from input vector and return the result in pair of vectors Arguments : Input - in (halfword vector) diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index f7f137915..7b8d10898 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -191,10 +191,10 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes") { # no asm yet add_proto qw/void vp8_filter_by_weight16x16/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"; - specialize qw/vp8_filter_by_weight16x16 sse2/; + specialize qw/vp8_filter_by_weight16x16 sse2 msa/; add_proto qw/void vp8_filter_by_weight8x8/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"; - specialize qw/vp8_filter_by_weight8x8 sse2/; + specialize qw/vp8_filter_by_weight8x8 sse2 msa/; add_proto qw/void vp8_filter_by_weight4x4/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"; # no asm yet diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 91ac837c2..aac3992db 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -122,6 +122,10 @@ VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/reconintra_msa.c VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/sixtap_filter_msa.c VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h +ifeq ($(CONFIG_POSTPROC),yes) +VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c +endif + # common (c) VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/filter_arm.c VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.c -- 2.40.0
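
Editor's note (not part of the patch): the MSA paths above vectorize a per-pixel
weighted blend between the current frame block (src) and the previously shown
block (dst), using fixed-point weights that sum to 1.0 in MFQE_PRECISION bits.
The sketch below shows the scalar form of that operation for readers not
familiar with MFQE; it is modeled on the existing C reference
(filter_by_weight() in vp8/common/mfqe.c), but the name filter_by_weight_c and
its exact signature here are illustrative, not a verbatim copy.

    /* Scalar sketch of the weighted blend the MSA code implements.
     * MFQE_PRECISION is defined in vp8/common/postproc.h (included by the
     * new mfqe_msa.c). dst_weight complements src_weight so the two sum to
     * 1 << MFQE_PRECISION; the add of rounding_bit followed by the shift is
     * what SRARI_H2_SH performs per halfword lane in the vector code.
     */
    static void filter_by_weight_c(const unsigned char *src, int src_stride,
                                   unsigned char *dst, int dst_stride,
                                   int block_size, int src_weight)
    {
        int dst_weight = (1 << MFQE_PRECISION) - src_weight;
        int rounding_bit = 1 << (MFQE_PRECISION - 1);
        int r, c;

        for (r = 0; r < block_size; ++r)
        {
            for (c = 0; c < block_size; ++c)
            {
                dst[c] = (src[c] * src_weight + dst[c] * dst_weight +
                          rounding_bit) >> MFQE_PRECISION;
            }
            src += src_stride;
            dst += dst_stride;
        }
    }

In the MSA versions, UNPCK_UB_SH zero-extends the byte pixels to halfwords so
the two multiply-accumulates stay within 16-bit lanes, and __msa_pckev_b packs
the rounded results back down to bytes before the store.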