From: Scott LaVarnway Date: Fri, 10 Nov 2017 18:19:52 +0000 (-0800) Subject: vpx: [x86] add vpx_satd_avx2() X-Git-Tag: v1.7.0~73 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=8e6022844fdf3e97cfe10659f386299d716736ab;p=libvpx vpx: [x86] add vpx_satd_avx2() SSE2 instrinsic vs AVX2 intrinsic speed gains: blocksize 16: ~1.33 blocksize 64: ~1.51 blocksize 256: ~3.03 blocksize 1024: ~3.71 Change-Id: I79b28cba82d21f9dd765e79881aa16d24fd0cb58 --- diff --git a/test/avg_test.cc b/test/avg_test.cc index e0f2d7410..ad21198e4 100644 --- a/test/avg_test.cc +++ b/test/avg_test.cc @@ -368,6 +368,21 @@ TEST_P(SatdTest, Random) { Check(expected); } +TEST_P(SatdTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 20000; + vpx_usec_timer timer; + DECLARE_ALIGNED(16, tran_low_t, coeff[1024]); + const int blocksize = GET_PARAM(0); + + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + GET_PARAM(1)(coeff, blocksize); + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time); +} + TEST_P(BlockErrorTestFP, MinValue) { const int64_t kMin = -32640; const int64_t expected = kMin * kMin * txfm_size_; @@ -472,13 +487,19 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSE2 #if HAVE_AVX2 +INSTANTIATE_TEST_CASE_P(AVX2, SatdTest, + ::testing::Values(make_tuple(16, &vpx_satd_avx2), + make_tuple(64, &vpx_satd_avx2), + make_tuple(256, &vpx_satd_avx2), + make_tuple(1024, &vpx_satd_avx2))); + INSTANTIATE_TEST_CASE_P( AVX2, BlockErrorTestFP, ::testing::Values(make_tuple(16, &vp9_block_error_fp_avx2), make_tuple(64, &vp9_block_error_fp_avx2), make_tuple(256, &vp9_block_error_fp_avx2), make_tuple(1024, &vp9_block_error_fp_avx2))); -#endif // HAVE_AVX2 +#endif #if HAVE_NEON INSTANTIATE_TEST_CASE_P( diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 8ae847c3d..e117b9d15 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -773,7 +773,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx/; add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length"; - specialize qw/vpx_satd sse2 neon/; + specialize qw/vpx_satd avx2 sse2 neon/; } else { add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64"; @@ -782,7 +782,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx/; add_proto qw/int vpx_satd/, "const int16_t *coeff, int length"; - specialize qw/vpx_satd sse2 neon msa/; + specialize qw/vpx_satd avx2 sse2 neon msa/; } add_proto qw/void vpx_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height"; diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c index 4dc759bb5..ff19ea647 100644 --- a/vpx_dsp/x86/avg_intrin_avx2.c +++ b/vpx_dsp/x86/avg_intrin_avx2.c @@ -171,3 +171,27 @@ void vpx_hadamard_16x16_avx2(int16_t const *src_diff, ptrdiff_t src_stride, t_coeff += 16; } } + +int vpx_satd_avx2(const tran_low_t *coeff, int length) { + const __m256i one = _mm256_set1_epi16(1); + __m256i accum = _mm256_setzero_si256(); + int i; + + for (i = 0; i < length; i += 16) { + const __m256i src_line = load_tran_low(coeff); + const __m256i abs = _mm256_abs_epi16(src_line); + const __m256i sum = _mm256_madd_epi16(abs, one); + accum = _mm256_add_epi32(accum, sum); + coeff += 16; + } + + { // 32 bit horizontal add + const __m256i a = _mm256_srli_si256(accum, 8); + const __m256i b = _mm256_add_epi32(accum, a); + const __m256i c = _mm256_srli_epi64(b, 32); + const __m256i d = _mm256_add_epi32(b, c); + const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d), + _mm256_extractf128_si256(d, 1)); + return _mm_cvtsi128_si32(accum_128); + } +}