From f6a002f2a67579cce9d53a4314f0676228517c12 Mon Sep 17 00:00:00 2001 From: sdeng Date: Tue, 30 Oct 2018 14:08:07 -0700 Subject: [PATCH] Add satd avx2 implementation Speed Test: C/SatdHighbdTest blocksize: 16 time: 138 us blocksize: 64 time: 315 us blocksize: 256 time: 1120 us blocksize: 1024 time: 3955 us AVX2/SatdHighbdTest blocksize: 16 time: 89 us blocksize: 64 time: 189 us blocksize: 256 time: 590 us blocksize: 1024 time: 1912 us Change-Id: I6357174462fccd589a475b13d8114b853cab5383 --- test/acm_random.h | 12 ++++ test/avg_test.cc | 118 ++++++++++++++++++++++++++++------ vp9/encoder/vp9_encoder.c | 5 +- vpx_dsp/avg.c | 13 ++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 3 + vpx_dsp/x86/avg_intrin_avx2.c | 23 +++++++ 6 files changed, 150 insertions(+), 24 deletions(-) diff --git a/test/acm_random.h b/test/acm_random.h index 5a4e6c392..ccfa20681 100644 --- a/test/acm_random.h +++ b/test/acm_random.h @@ -34,6 +34,18 @@ class ACMRandom { return (value >> 15) & 0xffff; } + int32_t Rand20Signed(void) { + // Use 20 bits: values between 524287 and -524288. + const uint32_t value = random_.Generate(1048576); + return static_cast(value) - 524288; + } + + int16_t Rand16Signed(void) { + // Use 16 bits: values between 32767 and -32768. + const uint32_t value = random_.Generate(65536); + return static_cast(value) - 32768; + } + int16_t Rand13Signed(void) { // Use 13 bits: values between 4095 and -4096. const uint32_t value = random_.Generate(8192); diff --git a/test/avg_test.cc b/test/avg_test.cc index 5455f05ed..f10d032bc 100644 --- a/test/avg_test.cc +++ b/test/avg_test.cc @@ -250,12 +250,7 @@ class SatdTest : public ::testing::Test, for (int i = 0; i < satd_size_; ++i) src_[i] = val; } - void FillRandom() { - for (int i = 0; i < satd_size_; ++i) { - const int16_t tmp = rnd_.Rand16(); - src_[i] = (tran_low_t)tmp; - } - } + virtual void FillRandom() = 0; void Check(const int expected) { int total; @@ -266,11 +261,21 @@ class SatdTest : public ::testing::Test, tran_low_t *GetCoeff() const { return src_; } int satd_size_; + ACMRandom rnd_; + tran_low_t *src_; private: - tran_low_t *src_; SatdFunc satd_func_; - ACMRandom rnd_; +}; + +class SatdLowbdTest : public SatdTest { + protected: + virtual void FillRandom() { + for (int i = 0; i < satd_size_; ++i) { + const int16_t tmp = rnd_.Rand16Signed(); + src_[i] = (tran_low_t)tmp; + } + } }; typedef int64_t (*BlockErrorFunc)(const tran_low_t *coeff, @@ -402,27 +407,82 @@ TEST_P(IntProColTest, Random) { RunComparison(); } -TEST_P(SatdTest, MinValue) { +TEST_P(SatdLowbdTest, MinValue) { const int kMin = -32640; const int expected = -kMin * satd_size_; FillConstant(kMin); Check(expected); } -TEST_P(SatdTest, MaxValue) { +TEST_P(SatdLowbdTest, MaxValue) { const int kMax = 32640; const int expected = kMax * satd_size_; FillConstant(kMax); Check(expected); } -TEST_P(SatdTest, Random) { +TEST_P(SatdLowbdTest, Random) { + int expected; + switch (satd_size_) { + case 16: expected = 263252; break; + case 64: expected = 1105420; break; + case 256: expected = 4252250; break; + case 1024: expected = 16876840; break; + default: + FAIL() << "Invalid satd size (" << satd_size_ + << ") valid: 16/64/256/1024"; + } + FillRandom(); + Check(expected); +} + +TEST_P(SatdLowbdTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 20000; + vpx_usec_timer timer; + const int blocksize = GET_PARAM(0); + FillRandom(); + tran_low_t *coeff = GetCoeff(); + + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + GET_PARAM(1)(coeff, blocksize); + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time); +} + +#if CONFIG_VP9_HIGHBITDEPTH +class SatdHighbdTest : public SatdTest { + protected: + virtual void FillRandom() { + for (int i = 0; i < satd_size_; ++i) { + src_[i] = rnd_.Rand20Signed(); + } + } +}; + +TEST_P(SatdHighbdTest, MinValue) { + const int kMin = -524280; + const int expected = -kMin * satd_size_; + FillConstant(kMin); + Check(expected); +} + +TEST_P(SatdHighbdTest, MaxValue) { + const int kMax = 524280; + const int expected = kMax * satd_size_; + FillConstant(kMax); + Check(expected); +} + +TEST_P(SatdHighbdTest, Random) { int expected; switch (satd_size_) { - case 16: expected = 205298; break; - case 64: expected = 1113950; break; - case 256: expected = 4268415; break; - case 1024: expected = 16954082; break; + case 16: expected = 5249712; break; + case 64: expected = 18362120; break; + case 256: expected = 66100520; break; + case 1024: expected = 266094734; break; default: FAIL() << "Invalid satd size (" << satd_size_ << ") valid: 16/64/256/1024"; @@ -431,7 +491,7 @@ TEST_P(SatdTest, Random) { Check(expected); } -TEST_P(SatdTest, DISABLED_Speed) { +TEST_P(SatdHighbdTest, DISABLED_Speed) { const int kCountSpeedTestBlock = 20000; vpx_usec_timer timer; const int blocksize = GET_PARAM(0); @@ -446,6 +506,7 @@ TEST_P(SatdTest, DISABLED_Speed) { const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time); } +#endif // CONFIG_VP9_HIGHBITDEPTH TEST_P(BlockErrorTestFP, MinValue) { const int64_t kMin = -32640; @@ -512,9 +573,15 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_highbd_avg_8x8_sse2), make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_sse2))); #endif // HAVE_SSE2 + +INSTANTIATE_TEST_CASE_P(C, SatdHighbdTest, + ::testing::Values(make_tuple(16, &vpx_satd_c), + make_tuple(64, &vpx_satd_c), + make_tuple(256, &vpx_satd_c), + make_tuple(1024, &vpx_satd_c))); #endif // CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P(C, SatdTest, +INSTANTIATE_TEST_CASE_P(C, SatdLowbdTest, ::testing::Values(make_tuple(16, &vpx_satd_c), make_tuple(64, &vpx_satd_c), make_tuple(256, &vpx_satd_c), @@ -551,7 +618,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(64, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c))); -INSTANTIATE_TEST_CASE_P(SSE2, SatdTest, +INSTANTIATE_TEST_CASE_P(SSE2, SatdLowbdTest, ::testing::Values(make_tuple(16, &vpx_satd_sse2), make_tuple(64, &vpx_satd_sse2), make_tuple(256, &vpx_satd_sse2), @@ -566,12 +633,21 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSE2 #if HAVE_AVX2 -INSTANTIATE_TEST_CASE_P(AVX2, SatdTest, +INSTANTIATE_TEST_CASE_P(AVX2, SatdLowbdTest, ::testing::Values(make_tuple(16, &vpx_satd_avx2), make_tuple(64, &vpx_satd_avx2), make_tuple(256, &vpx_satd_avx2), make_tuple(1024, &vpx_satd_avx2))); +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + AVX2, SatdHighbdTest, + ::testing::Values(make_tuple(16, &vpx_highbd_satd_avx2), + make_tuple(64, &vpx_highbd_satd_avx2), + make_tuple(256, &vpx_highbd_satd_avx2), + make_tuple(1024, &vpx_highbd_satd_avx2))); +#endif // CONFIG_VP9_HIGHBITDEPTH + INSTANTIATE_TEST_CASE_P( AVX2, BlockErrorTestFP, ::testing::Values(make_tuple(16, &vp9_block_error_fp_avx2), @@ -604,7 +680,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(64, &vpx_int_pro_col_neon, &vpx_int_pro_col_c))); -INSTANTIATE_TEST_CASE_P(NEON, SatdTest, +INSTANTIATE_TEST_CASE_P(NEON, SatdLowbdTest, ::testing::Values(make_tuple(16, &vpx_satd_neon), make_tuple(64, &vpx_satd_neon), make_tuple(256, &vpx_satd_neon), @@ -649,7 +725,7 @@ INSTANTIATE_TEST_CASE_P( // TODO(jingning): Remove the highbitdepth flag once the SIMD functions are // in place. #if !CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P(MSA, SatdTest, +INSTANTIATE_TEST_CASE_P(MSA, SatdLowbdTest, ::testing::Values(make_tuple(16, &vpx_satd_msa), make_tuple(64, &vpx_satd_msa), make_tuple(256, &vpx_satd_msa), diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index a73185623..a7fe1279f 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -5973,8 +5973,7 @@ void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, vpx_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, dst_stride, xd->bd); highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); - // TODO(sdeng): Implement SIMD based high bit-depth satd. - intra_cost = vpx_satd_c(coeff, pix_num); + intra_cost = vpx_highbd_satd(coeff, pix_num); } else { vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, dst_stride); @@ -6020,7 +6019,7 @@ void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset, xd->cur_buf->y_stride, &predictor[0], bw, xd->bd); highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); - inter_cost = vpx_satd_c(coeff, pix_num); + inter_cost = vpx_highbd_satd(coeff, pix_num); } else { vp9_build_inter_predictor( ref_frame[rf_idx]->y_buffer + mb_y_offset, diff --git a/vpx_dsp/avg.c b/vpx_dsp/avg.c index 41a6fd828..1c45e8a73 100644 --- a/vpx_dsp/avg.c +++ b/vpx_dsp/avg.c @@ -314,6 +314,19 @@ void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, } } +#if CONFIG_VP9_HIGHBITDEPTH +// coeff: dynamic range 20 bit. +// length: value range {16, 64, 256, 1024}. +int vpx_highbd_satd_c(const tran_low_t *coeff, int length) { + int i; + int satd = 0; + for (i = 0; i < length; ++i) satd += abs(coeff[i]); + + // satd: 30 bits + return satd; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + // coeff: 16 bits, dynamic range [-32640, 32640]. // length: value range {16, 64, 256, 1024}. int vpx_satd_c(const tran_low_t *coeff, int length) { diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 5dc682382..ab478f88e 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -796,6 +796,9 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length"; specialize qw/vpx_satd avx2 sse2 neon/; + + add_proto qw/int vpx_highbd_satd/, "const tran_low_t *coeff, int length"; + specialize qw/vpx_highbd_satd avx2/; } else { add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64"; diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c index 7d74705ea..b30cc657f 100644 --- a/vpx_dsp/x86/avg_intrin_avx2.c +++ b/vpx_dsp/x86/avg_intrin_avx2.c @@ -375,3 +375,26 @@ int vpx_satd_avx2(const tran_low_t *coeff, int length) { return _mm_cvtsi128_si32(accum_128); } } + +#if CONFIG_VP9_HIGHBITDEPTH +int vpx_highbd_satd_avx2(const tran_low_t *coeff, int length) { + __m256i accum = _mm256_setzero_si256(); + int i; + + for (i = 0; i < length; i += 8, coeff += 8) { + const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff); + const __m256i abs = _mm256_abs_epi32(src_line); + accum = _mm256_add_epi32(accum, abs); + } + + { // 32 bit horizontal add + const __m256i a = _mm256_srli_si256(accum, 8); + const __m256i b = _mm256_add_epi32(accum, a); + const __m256i c = _mm256_srli_epi64(b, 32); + const __m256i d = _mm256_add_epi32(b, c); + const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d), + _mm256_extractf128_si256(d, 1)); + return _mm_cvtsi128_si32(accum_128); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH -- 2.40.0