vp8[loongarch]: Optimize fdct8x4/diamond_search_sad

author yuanhecai <yuanhecai@loongson.cn>

Wed, 20 Apr 2022 03:16:55 +0000 (11:16 +0800)

committer yuanhecai <yuanhecai@loongson.cn>

Tue, 17 May 2022 12:53:25 +0000 (20:53 +0800)
author yuanhecai <yuanhecai@loongson.cn>
Wed, 20 Apr 2022 03:16:55 +0000 (11:16 +0800)
committer yuanhecai <yuanhecai@loongson.cn>
Tue, 17 May 2022 12:53:25 +0000 (20:53 +0800)
diff --git a/test/sad_test.cc b/test/sad_test.cc

index 7ce25343f63716908051d3e853a46c76d0040dbd..2506f1adbcd0040c435ad95b55deb36017c9643f 100644 (file)
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -1136,6 +1136,7 @@ const SadMxNParam lsx_tests[] = {
    SadMxNParam(64, 64, &vpx_sad64x64_lsx),
    SadMxNParam(32, 32, &vpx_sad32x32_lsx),
    SadMxNParam(16, 16, &vpx_sad16x16_lsx),
+  SadMxNParam(8, 8, &vpx_sad8x8_lsx),
  };
  INSTANTIATE_TEST_SUITE_P(LSX, SADTest, ::testing::ValuesIn(lsx_tests));
  
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl

index 4f45d2ab9a60fb05563fc77d4bdab1e814f55d66..7bc866faaabcef51e241242fdec53358817e4e32 100644 (file)
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -187,7 +187,7 @@ add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch";
  specialize qw/vp8_short_fdct4x4 sse2 neon msa mmi lsx/;
  
  add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct8x4 sse2 neon msa mmi/;
+specialize qw/vp8_short_fdct8x4 sse2 neon msa mmi lsx/;
  
  add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch";
  specialize qw/vp8_short_walsh4x4 sse2 neon msa mmi/;
@@ -222,9 +222,10 @@ $vp8_refining_search_sad_sse2=vp8_refining_search_sadx4;
  $vp8_refining_search_sad_msa=vp8_refining_search_sadx4;
  
  add_proto qw/int vp8_diamond_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
-specialize qw/vp8_diamond_search_sad sse2 msa/;
+specialize qw/vp8_diamond_search_sad sse2 msa lsx/;
  $vp8_diamond_search_sad_sse2=vp8_diamond_search_sadx4;
  $vp8_diamond_search_sad_msa=vp8_diamond_search_sadx4;
+$vp8_diamond_search_sad_lsx=vp8_diamond_search_sadx4;
  
  #
  # Alt-ref Noise Reduction (ARNR)
diff --git a/vp8/encoder/loongarch/dct_lsx.c b/vp8/encoder/loongarch/dct_lsx.c

index e090d2360f2761e24f31030c123884986636ba3c..a08d4d3f63b7723d2e4ba072febf62dd14c36824 100644 (file)
--- a/vp8/encoder/loongarch/dct_lsx.c
+++ b/vp8/encoder/loongarch/dct_lsx.c
@@ -97,3 +97,65 @@ void vp8_short_fdct4x4_lsx(int16_t *input, int16_t *output, int32_t pitch) {
    __lsx_vst(in0, output, 0);
    __lsx_vst(in2, output, 16);
  }
+// LSX fdct8x4: loads 4 rows of 8 int16 from input, stores 32 int16 to output.
+void vp8_short_fdct8x4_lsx(int16_t *input, int16_t *output, int32_t pitch) {
+  __m128i in0, in1, in2, in3, temp0, temp1, tmp0, tmp1;
+  __m128i const0, const1, const2, vec0_w, vec1_w, vec2_w, vec3_w;
+  __m128i coeff = { 0x38a4eb1814e808a9, 0x659061a82ee01d4c };
+  __m128i zero = __lsx_vldi(0);
+  int32_t pitch2 = pitch << 1;  // row offsets handed to __lsx_vldx below
+  int32_t pitch3 = pitch2 + pitch;
+  // coeff int16 lanes 0-7: 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000.
+  in0 = __lsx_vld(input, 0);
+  DUP2_ARG2(__lsx_vldx, input, pitch, input, pitch2, in1, in2);
+  in3 = __lsx_vldx(input, pitch3);
+  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+  // Pass 1: butterfly, << 3, add/sub for in0/in2, biased dot products >> 12.
+  LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, temp1, in1, in3);
+  DUP4_ARG2(__lsx_vslli_h, temp0, 3, temp1, 3, in1, 3, in3, 3, temp0, temp1,
+            in1, in3);
+  in0 = __lsx_vadd_h(temp0, temp1);
+  in2 = __lsx_vsub_h(temp0, temp1);
+  SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2);
+  temp0 = __lsx_vreplvei_h(coeff, 3);
+  vec1_w = __lsx_vpackev_h(zero, temp0);  // each 32-bit word = 14500
+  coeff = __lsx_vilvh_h(zero, coeff);  // widen lanes 4-7 to 32-bit words
+  vec3_w = __lsx_vreplvei_w(coeff, 0);  // each 32-bit word = 7500
+  tmp1 = __lsx_vilvl_h(in3, in1);
+  tmp0 = __lsx_vilvh_h(in3, in1);
+  vec0_w = vec1_w;  // accumulators start from the bias words set up above
+  vec2_w = vec3_w;
+  DUP4_ARG3(__lsx_vdp2add_w_h, vec0_w, tmp1, const1, vec1_w, tmp0, const1,
+            vec2_w, tmp1, const2, vec3_w, tmp0, const2, vec0_w, vec1_w, vec2_w,
+            vec3_w);
+  DUP2_ARG3(__lsx_vsrani_h_w, vec1_w, vec0_w, 12, vec3_w, vec2_w, 12, in1, in3);
+  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+  // Pass 2 on the transposed result: in0/in2 get +7 >> 4, in1/in3 use >> 16.
+  LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, temp1, in1, in3);
+  in0 = __lsx_vadd_h(temp0, temp1);
+  in0 = __lsx_vaddi_hu(in0, 7);
+  in2 = __lsx_vsub_h(temp0, temp1);
+  in2 = __lsx_vaddi_hu(in2, 7);
+  in0 = __lsx_vsrai_h(in0, 4);
+  in2 = __lsx_vsrai_h(in2, 4);
+  DUP2_ARG2(__lsx_vreplvei_w, coeff, 2, coeff, 3, vec3_w, vec1_w);
+  vec3_w = __lsx_vadd_w(vec3_w, vec1_w);  // each word = 25000 + 26000
+  vec1_w = __lsx_vreplvei_w(coeff, 1);  // each word = 12000
+  const0 = RET_1_IF_NZERO_H(in3);  // going by the macro name: 1 where in3 != 0
+  tmp1 = __lsx_vilvl_h(in3, in1);
+  tmp0 = __lsx_vilvh_h(in3, in1);
+  vec0_w = vec1_w;
+  vec2_w = vec3_w;
+  DUP4_ARG3(__lsx_vdp2add_w_h, vec0_w, tmp1, const1, vec1_w, tmp0, const1,
+            vec2_w, tmp1, const2, vec3_w, tmp0, const2, vec0_w, vec1_w, vec2_w,
+            vec3_w);
+  DUP2_ARG3(__lsx_vsrani_h_w, vec1_w, vec0_w, 16, vec3_w, vec2_w, 16, in1, in3);
+  in1 = __lsx_vadd_h(in1, const0);
+  DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, temp0, temp1);
+  __lsx_vst(temp0, output, 0);
+  __lsx_vst(temp1, output, 16);
+  // Low doublewords were stored above; high doublewords follow at offsets 32/48.
+  DUP2_ARG2(__lsx_vpickod_d, in1, in0, in3, in2, in0, in2);
+  __lsx_vst(in0, output, 32);
+  __lsx_vst(in2, output, 48);
+}
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c

index 769c2f55895a987d0e268fb19f1817e86dc7d83a..ae092c66e19c04c007659a043b11f95050cf83e8 100644 (file)
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -1129,7 +1129,7 @@ int vp8_diamond_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
           mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
  }
  
-#if HAVE_SSE2 || HAVE_MSA
+#if HAVE_SSE2 || HAVE_MSA || HAVE_LSX
  int vp8_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
                               int_mv *best_mv, int search_param, int sad_per_bit,
                               int *num00, vp8_variance_fn_ptr_t *fn_ptr,
@@ -1278,7 +1278,7 @@ int vp8_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
    return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) +
           mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
  }
-#endif  // HAVE_SSE2 || HAVE_MSA
+#endif  // HAVE_SSE2 || HAVE_MSA || HAVE_LSX
  
  int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
                          int sad_per_bit, int distance,
diff --git a/vpx_dsp/loongarch/sad_lsx.c b/vpx_dsp/loongarch/sad_lsx.c

index 4764acbf88e4769938a5f856122c2b967f527fa8..46ee557df5e7695e6595d1dc4cbac18bfa813b85 100644 (file)
--- a/vpx_dsp/loongarch/sad_lsx.c
+++ b/vpx_dsp/loongarch/sad_lsx.c
@@ -57,6 +57,34 @@
      sum_m;                                     \
    })
  
+static uint32_t sad_8width_lsx(const uint8_t *src, int32_t src_stride,
+                               const uint8_t *ref, int32_t ref_stride,
+                               int32_t height) {  // SAD of src vs ref, 8 wide
+  int32_t ht_cnt;  // counts groups of 4 rows; height % 4 rows are not visited
+  __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, sad_tmp;
+  __m128i sad = __lsx_vldi(0);  // accumulator, starts at zero
+  // Each __lsx_vld reads a full 16-byte vector; vpickev_d keeps the low 8 bytes.
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0);
+    src += src_stride;
+    ref += ref_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src1, ref1);
+    src += src_stride;
+    ref += ref_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src2, ref2);
+    src += src_stride;
+    ref += ref_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src3, ref3);
+    src += src_stride;
+    ref += ref_stride;
+    DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2,
+              src0, src1, ref0, ref1);  // pack two 8-byte rows per vector
+    sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad = __lsx_vadd_h(sad, sad_tmp);  // accumulate in 16-bit lanes
+  }
+  return HADD_UH_U32(sad);  // reduce lanes to one uint32_t (per macro name)
+}
+
  static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride,
                                  const uint8_t *ref, int32_t ref_stride,
                                  int32_t height) {
@@ -584,6 +612,12 @@ static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride,
    return HADD_SW_S32(sad);
  }
  
+#define VPX_SAD_8xHT_LSX(height) /* Defines vpx_sad8x<height>_lsx(). */      \
+  uint32_t vpx_sad8x##height##_lsx(const uint8_t *src, int32_t src_stride,   \
+                                   const uint8_t *ref, int32_t ref_stride) { \
+    return sad_8width_lsx(src, src_stride, ref, ref_stride, height);         \
+  }
+
  #define VPX_SAD_16xHT_LSX(height)                                             \
    uint32_t vpx_sad16x##height##_lsx(const uint8_t *src, int32_t src_stride,   \
                                      const uint8_t *ref, int32_t ref_stride) { \
@@ -662,7 +696,7 @@ SAD32
  
  SAD16
  
-#define SAD8 VPX_SAD_8xHTx4D_LSX(8)
+#define SAD8 VPX_SAD_8xHT_LSX(8) VPX_SAD_8xHTx4D_LSX(8)
  
  SAD8
  
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl

index 23925a4793e3299ba75fc329d0d340a81b9dc61e..e82b487f1361b165fdda37f80fe377d8580ec97e 100644 (file)
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -763,7 +763,7 @@ add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride,
  specialize qw/vpx_sad8x16 neon msa sse2 vsx mmi/;
  
  add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x8 neon msa sse2 vsx mmi/;
+specialize qw/vpx_sad8x8 neon msa sse2 vsx mmi lsx/;
  
  add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
  specialize qw/vpx_sad8x4 neon msa sse2 vsx mmi/;
author	yuanhecai <yuanhecai@loongson.cn>
	Wed, 20 Apr 2022 03:16:55 +0000 (11:16 +0800)
committer	yuanhecai <yuanhecai@loongson.cn>
	Tue, 17 May 2022 12:53:25 +0000 (20:53 +0800)
test/sad_test.cc		patch \| blob \| history
vp8/common/rtcd_defs.pl		patch \| blob \| history
vp8/encoder/loongarch/dct_lsx.c		patch \| blob \| history
vp8/encoder/mcomp.c		patch \| blob \| history
vpx_dsp/loongarch/sad_lsx.c		patch \| blob \| history
vpx_dsp/vpx_dsp_rtcd_defs.pl		patch \| blob \| history