From: yuanhecai Date: Wed, 20 Apr 2022 03:16:55 +0000 (+0800) Subject: vp8[loongarch]: Optimize fdct8x4/diamond_search_sad X-Git-Tag: v1.12.0-rc1~15 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=508c0aff89b511d04cbd1e782cc24313fd6ae06b;p=libvpx vp8[loongarch]: Optimize fdct8x4/diamond_search_sad 1. vp8_short_fdct8x4_lsx 2. vp8_diamond_search_sad_lsx 3. vpx_sad8x8_lsx Bug: webm:1755 Change-Id: Ic9df84ead2d4fc07ec58e9730d6a12ac2b2d31c1 --- diff --git a/test/sad_test.cc b/test/sad_test.cc index 7ce25343f..2506f1adb 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1136,6 +1136,7 @@ const SadMxNParam lsx_tests[] = { SadMxNParam(64, 64, &vpx_sad64x64_lsx), SadMxNParam(32, 32, &vpx_sad32x32_lsx), SadMxNParam(16, 16, &vpx_sad16x16_lsx), + SadMxNParam(8, 8, &vpx_sad8x8_lsx), }; INSTANTIATE_TEST_SUITE_P(LSX, SADTest, ::testing::ValuesIn(lsx_tests)); diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 4f45d2ab9..7bc866faa 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -187,7 +187,7 @@ add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch"; specialize qw/vp8_short_fdct4x4 sse2 neon msa mmi lsx/; add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch"; -specialize qw/vp8_short_fdct8x4 sse2 neon msa mmi/; +specialize qw/vp8_short_fdct8x4 sse2 neon msa mmi lsx/; add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch"; specialize qw/vp8_short_walsh4x4 sse2 neon msa mmi/; @@ -222,9 +222,10 @@ $vp8_refining_search_sad_sse2=vp8_refining_search_sadx4; $vp8_refining_search_sad_msa=vp8_refining_search_sadx4; add_proto qw/int vp8_diamond_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"; -specialize qw/vp8_diamond_search_sad sse2 msa/; +specialize 
qw/vp8_diamond_search_sad sse2 msa lsx/; $vp8_diamond_search_sad_sse2=vp8_diamond_search_sadx4; $vp8_diamond_search_sad_msa=vp8_diamond_search_sadx4; +$vp8_diamond_search_sad_lsx=vp8_diamond_search_sadx4; # # Alt-ref Noise Reduction (ARNR) diff --git a/vp8/encoder/loongarch/dct_lsx.c b/vp8/encoder/loongarch/dct_lsx.c index e090d2360..a08d4d3f6 100644 --- a/vp8/encoder/loongarch/dct_lsx.c +++ b/vp8/encoder/loongarch/dct_lsx.c @@ -97,3 +97,65 @@ void vp8_short_fdct4x4_lsx(int16_t *input, int16_t *output, int32_t pitch) { __lsx_vst(in0, output, 0); __lsx_vst(in2, output, 16); } + +void vp8_short_fdct8x4_lsx(int16_t *input, int16_t *output, int32_t pitch) {/* 8x4 forward DCT (per the function name) written with LSX intrinsics. Loads four __m128i vectors from input at offsets 0, pitch, 2 * pitch and 3 * pitch (pitch is handed unscaled to __lsx_vldx), runs two transpose/butterfly passes, and stores four vectors to output at offsets 0, 16, 32 and 48. */ + __m128i in0, in1, in2, in3, temp0, temp1, tmp0, tmp1; + __m128i const0, const1, const2, vec0_w, vec1_w, vec2_w, vec3_w; + __m128i coeff = { 0x38a4eb1814e808a9, 0x659061a82ee01d4c };/* as 16-bit values, reading each literal from its least-significant halfword: 2217, 5352, -5352, 14500 and 7500, 12000, 25000, 26000 */ + __m128i zero = __lsx_vldi(0); + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + + in0 = __lsx_vld(input, 0); + DUP2_ARG2(__lsx_vldx, input, pitch, input, pitch2, in1, in2); + in3 = __lsx_vldx(input, pitch3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, temp1, in1, in3); + DUP4_ARG2(__lsx_vslli_h, temp0, 3, temp1, 3, in1, 3, in3, 3, temp0, temp1, + in1, in3);/* all four first-pass butterfly outputs are shifted left by 3 */ + in0 = __lsx_vadd_h(temp0, temp1); + in2 = __lsx_vsub_h(temp0, temp1); + SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2); + temp0 = __lsx_vreplvei_h(coeff, 3); + vec1_w = __lsx_vpackev_h(zero, temp0); + coeff = __lsx_vilvh_h(zero, coeff); + vec3_w = __lsx_vreplvei_w(coeff, 0); + tmp1 = __lsx_vilvl_h(in3, in1); + tmp0 = __lsx_vilvh_h(in3, in1); + vec0_w = vec1_w; + vec2_w = vec3_w; + DUP4_ARG3(__lsx_vdp2add_w_h, vec0_w, tmp1, const1, vec1_w, tmp0, const1, + vec2_w, tmp1, const2, vec3_w, tmp0, const2, vec0_w, vec1_w, vec2_w, + vec3_w); + DUP2_ARG3(__lsx_vsrani_h_w, vec1_w, vec0_w, 12, vec3_w, vec2_w, 12, in1, in3);/* first pass: the word accumulators are shifted right by 12 and narrowed back into in1 and in3 */ + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + 
LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, temp1, in1, in3); + in0 = __lsx_vadd_h(temp0, temp1); + in0 = __lsx_vaddi_hu(in0, 7); + in2 = __lsx_vsub_h(temp0, temp1); + in2 = __lsx_vaddi_hu(in2, 7); + in0 = __lsx_vsrai_h(in0, 4); + in2 = __lsx_vsrai_h(in2, 4);/* second pass: the sum and difference terms each get 7 added and are then shifted right by 4 */ + DUP2_ARG2(__lsx_vreplvei_w, coeff, 2, coeff, 3, vec3_w, vec1_w); + vec3_w = __lsx_vadd_w(vec3_w, vec1_w); + vec1_w = __lsx_vreplvei_w(coeff, 1); + const0 = RET_1_IF_NZERO_H(in3);/* presumably 1 in every halfword lane where in3 is nonzero; the macro is defined outside this hunk - TODO confirm */ + tmp1 = __lsx_vilvl_h(in3, in1); + tmp0 = __lsx_vilvh_h(in3, in1); + vec0_w = vec1_w; + vec2_w = vec3_w; + DUP4_ARG3(__lsx_vdp2add_w_h, vec0_w, tmp1, const1, vec1_w, tmp0, const1, + vec2_w, tmp1, const2, vec3_w, tmp0, const2, vec0_w, vec1_w, vec2_w, + vec3_w); + DUP2_ARG3(__lsx_vsrani_h_w, vec1_w, vec0_w, 16, vec3_w, vec2_w, 16, in1, in3); + in1 = __lsx_vadd_h(in1, const0);/* const0 is applied to in1 only, after the shift by 16 */ + DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, temp0, temp1); + __lsx_vst(temp0, output, 0); + __lsx_vst(temp1, output, 16);/* the __lsx_vpickev_d results go to output offsets 0 and 16; the __lsx_vpickod_d results below go to 32 and 48 */ + + DUP2_ARG2(__lsx_vpickod_d, in1, in0, in3, in2, in0, in2); + __lsx_vst(in0, output, 32); + __lsx_vst(in2, output, 48); +} diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index 769c2f558..ae092c66e 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -1129,7 +1129,7 @@ int vp8_diamond_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); } -#if HAVE_SSE2 || HAVE_MSA +#if HAVE_SSE2 || HAVE_MSA || HAVE_LSX int vp8_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int_mv *best_mv, int search_param, int sad_per_bit, int *num00, vp8_variance_fn_ptr_t *fn_ptr, @@ -1278,7 +1278,7 @@ int vp8_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); } -#endif // HAVE_SSE2 || HAVE_MSA +#endif // HAVE_SSE2 || HAVE_MSA || HAVE_LSX int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, 
int_mv *ref_mv, int sad_per_bit, int distance, diff --git a/vpx_dsp/loongarch/sad_lsx.c b/vpx_dsp/loongarch/sad_lsx.c index 4764acbf8..46ee557df 100644 --- a/vpx_dsp/loongarch/sad_lsx.c +++ b/vpx_dsp/loongarch/sad_lsx.c @@ -57,6 +57,34 @@ sum_m; \ }) +static uint32_t sad_8width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) {/* Sum of absolute differences between src and ref for an 8-pixel-wide block (per the function name). Rows are consumed four per iteration, so the loop runs height >> 2 times; rows left over when height is not a multiple of 4 are never visited. Returns the HADD_UH_U32 reduction of the accumulator. */ + int32_t ht_cnt; + __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, sad_tmp; + __m128i sad = __lsx_vldi(0); + + for (ht_cnt = (height >> 2); ht_cnt--;) {/* NOTE(review): every __lsx_vld below fills a whole __m128i per row, while only the doublewords selected by __lsx_vpickev_d are used afterwards - confirm that reading past the 8 pixels of a row is acceptable for all callers */ + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0); + src += src_stride; + ref += ref_stride; + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src1, ref1); + src += src_stride; + ref += ref_stride; + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src2, ref2); + src += src_stride; + ref += ref_stride; + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src3, ref3); + src += src_stride; + ref += ref_stride; + DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2, + src0, src1, ref0, ref1);/* rows 0/1 and rows 2/3 are merged pairwise, leaving two src vectors and two ref vectors for the SAD step */ + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp);/* the running total stays in halfword lanes (__lsx_vadd_h) until the final reduction */ + } + return HADD_UH_U32(sad); +} + static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride, const uint8_t *ref, int32_t ref_stride, int32_t height) { @@ -584,6 +612,12 @@ static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, return HADD_SW_S32(sad); } +#define VPX_SAD_8xHT_LSX(height)/* expands to vpx_sad8x<height>_lsx, a wrapper that forwards its arguments plus the fixed height to sad_8width_lsx */ \ + uint32_t vpx_sad8x##height##_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_8width_lsx(src, src_stride, ref, ref_stride, height); \ + } + #define VPX_SAD_16xHT_LSX(height) \ uint32_t vpx_sad16x##height##_lsx(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride) { \ @@ -662,7 +696,7 @@ SAD32 SAD16 -#define SAD8 VPX_SAD_8xHTx4D_LSX(8) +#define SAD8 VPX_SAD_8xHT_LSX(8) VPX_SAD_8xHTx4D_LSX(8) SAD8 diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 
23925a479..e82b487f1 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -763,7 +763,7 @@ add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, specialize qw/vpx_sad8x16 neon msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad8x8 neon msa sse2 vsx mmi/; +specialize qw/vpx_sad8x8 neon msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad8x4 neon msa sse2 vsx mmi/;