From b92cc27b76ee5158515beb2e2656a0117d1c2dda Mon Sep 17 00:00:00 2001 From: Parag Salasakar Date: Fri, 26 Jun 2015 12:00:24 +0530 Subject: [PATCH] mips msa vp9 temporal filter optimization average improvement ~4x-5x Change-Id: Iad9c0a296dbc2ea96d000bd009077999ed58a3c5 --- vp9/common/vp9_rtcd_defs.pl | 2 +- .../mips/msa/vp9_temporal_filter_msa.c | 289 ++++++++++++++++++ vp9/vp9cx.mk | 1 + 3 files changed, 291 insertions(+), 1 deletion(-) create mode 100644 vp9/encoder/mips/msa/vp9_temporal_filter_msa.c diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index c97b5d76d..22a5efdd5 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -1077,7 +1077,7 @@ add_proto qw/int vp9_full_range_search/, "const struct macroblock *x, const stru specialize qw/vp9_full_range_search/; add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"; -specialize qw/vp9_temporal_filter_apply sse2/; +specialize qw/vp9_temporal_filter_apply sse2 msa/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { diff --git a/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c b/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c new file mode 100644 index 000000000..4053bffae --- /dev/null +++ b/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/common/mips/msa/vp9_macros_msa.h" + +static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr, + uint32_t stride, + uint8_t *frm2_ptr, + int32_t filt_sth, + int32_t filt_wgt, + uint32_t *acc, + uint16_t *cnt) { + uint32_t row; + uint64_t f0, f1, f2, f3; + v16i8 frm2, frm1 = { 0 }; + v16i8 frm4, frm3 = { 0 }; + v16u8 frm_r, frm_l; + v8i16 frm2_r, frm2_l; + v8i16 diff0, diff1, mod0_h, mod1_h; + v4i32 cnst3, cnst16, filt_wt, strength; + v4i32 mod0_w, mod1_w, mod2_w, mod3_w; + v4i32 diff0_r, diff0_l, diff1_r, diff1_l; + v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll; + v4i32 acc0, acc1, acc2, acc3; + v8i16 cnt0, cnt1; + + filt_wt = __msa_fill_w(filt_wgt); + strength = __msa_fill_w(filt_sth); + cnst3 = __msa_ldi_w(3); + cnst16 = __msa_ldi_w(16); + + for (row = 2; row--;) { + LD4(frm1_ptr, stride, f0, f1, f2, f3); + frm1_ptr += (4 * stride); + + LD_SB2(frm2_ptr, 16, frm2, frm4); + frm2_ptr += 32; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + INSERT_D2_SB(f0, f1, frm1); + INSERT_D2_SB(f2, f3, frm3); + ILVRL_B2_UB(frm1, frm2, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, + diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, + mod0_w, mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, + mod0_w, mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + UNPCK_UB_SH(frm2, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, + mod0_w, mod1_w, mod2_w, mod3_w); + + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + ILVRL_B2_UB(frm3, frm4, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, + diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, + mod0_w, mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, + mod0_w, mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + UNPCK_UB_SH(frm4, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, + mod0_w, mod1_w, mod2_w, mod3_w); + + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + } +} + +static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr, + uint32_t stride, + uint8_t *frm2_ptr, + int32_t filt_sth, + int32_t filt_wgt, + uint32_t *acc, + uint16_t *cnt) { + uint32_t row; + v16i8 frm1, frm2, frm3, frm4; + v16u8 frm_r, frm_l; + v16i8 zero = { 0 }; + v8u16 frm2_r, frm2_l; + v8i16 diff0, diff1, mod0_h, mod1_h; + v4i32 cnst3, cnst16, filt_wt, strength; + v4i32 mod0_w, mod1_w, mod2_w, mod3_w; + v4i32 diff0_r, diff0_l, diff1_r, diff1_l; + v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll; + v4i32 acc0, acc1, acc2, acc3; + v8i16 cnt0, cnt1; + + filt_wt = __msa_fill_w(filt_wgt); + strength = __msa_fill_w(filt_sth); + cnst3 = __msa_ldi_w(3); + cnst16 = __msa_ldi_w(16); + + for (row = 8; row--;) { + LD_SB2(frm1_ptr, stride, frm1, frm3); + frm1_ptr += stride; + + LD_SB2(frm2_ptr, 16, frm2, frm4); + frm2_ptr += 16; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + ILVRL_B2_UB(frm1, frm2, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, + mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, + mod0_w, mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, + mod0_w, mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, + mod0_w, mod1_w, mod2_w, mod3_w); + + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + ILVRL_B2_UB(frm3, frm4, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, + mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, + mod0_w, mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, + mod0_w, mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, + mod0_w, mod1_w, mod2_w, mod3_w); + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + + frm1_ptr += stride; + frm2_ptr += 16; + } +} + +void vp9_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride, + uint8_t *frame2_ptr, uint32_t blk_w, + uint32_t blk_h, int32_t strength, + int32_t filt_wgt, uint32_t *accu, + uint16_t *cnt) { + if (8 == (blk_w * blk_h)) { + temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, + strength, filt_wgt, accu, cnt); + } else if (16 == (blk_w * blk_h)) { + temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr, + strength, filt_wgt, accu, cnt); + } else { + vp9_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h, + strength, filt_wgt, accu, cnt); + } +} diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 96422d7e1..e78c111f0 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -160,5 +160,6 @@ VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct32x32_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_subtract_msa.c +VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes)) -- 2.40.0