From: Scott LaVarnway
Date: Thu, 9 Nov 2017 00:06:29 +0000 (-0800)
Subject: vpx: [x86] add vp9_block_error_fp_avx2()
X-Git-Tag: v1.7.0~74^2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=62ab5e99c1aa13704716ca056b8c806b22544a6b;p=libvpx

vpx: [x86] add vp9_block_error_fp_avx2()

SSE2 asm vs AVX2 intrinsics speed gains:
blocksize 16: ~1.00
blocksize 64: ~1.17
blocksize 256: ~1.67
blocksize 1024: ~1.81

Change-Id: I2a86db239cf57e3ff617890ccb2d236aba83ad5e
---

diff --git a/test/avg_test.cc b/test/avg_test.cc
index c570bbc22..e0f2d7410 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -23,6 +23,7 @@
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_timer.h"
 
 using libvpx_test::ACMRandom;
 
@@ -396,6 +397,22 @@ TEST_P(BlockErrorTestFP, Random) {
   Check(expected);
 }
 
+TEST_P(BlockErrorTestFP, DISABLED_Speed) {
+  const int kCountSpeedTestBlock = 20000;
+  vpx_usec_timer timer;
+  DECLARE_ALIGNED(16, tran_low_t, coeff[1024]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[1024]);
+  const int blocksize = GET_PARAM(0);
+
+  vpx_usec_timer_start(&timer);
+  for (int i = 0; i < kCountSpeedTestBlock; ++i) {
+    GET_PARAM(1)(coeff, dqcoeff, blocksize);
+  }
+  vpx_usec_timer_mark(&timer);
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time);
+}
+
 using std::tr1::make_tuple;
 
 INSTANTIATE_TEST_CASE_P(
@@ -454,6 +471,15 @@ INSTANTIATE_TEST_CASE_P(
     make_tuple(1024, &vp9_block_error_fp_sse2)));
 #endif  // HAVE_SSE2
 
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+    AVX2, BlockErrorTestFP,
+    ::testing::Values(make_tuple(16, &vp9_block_error_fp_avx2),
+                      make_tuple(64, &vp9_block_error_fp_avx2),
+                      make_tuple(256, &vp9_block_error_fp_avx2),
+                      make_tuple(1024, &vp9_block_error_fp_avx2)));
+#endif  // HAVE_AVX2
+
 #if HAVE_NEON
 INSTANTIATE_TEST_CASE_P(
     NEON, AverageTest,
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 28ae15a8b..f613fff87 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -122,7 +122,7 @@ add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_lo
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vp9_block_error avx2 sse2/;
 
-  specialize qw/vp9_block_error_fp sse2/;
+  specialize qw/vp9_block_error_fp avx2 sse2/;
 
   specialize qw/vp9_fdct8x8_quant neon ssse3/;
 
@@ -131,7 +131,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 } else {
   specialize qw/vp9_block_error avx2 msa sse2/;
 
-  specialize qw/vp9_block_error_fp neon sse2/;
+  specialize qw/vp9_block_error_fp neon avx2 sse2/;
 
   specialize qw/vp9_fdct8x8_quant sse2 ssse3 neon/;
 }
diff --git a/vp9/encoder/x86/vp9_error_avx2.c b/vp9/encoder/x86/vp9_error_avx2.c
index e228bd8b7..be414359a 100644
--- a/vp9/encoder/x86/vp9_error_avx2.c
+++ b/vp9/encoder/x86/vp9_error_avx2.c
@@ -105,3 +105,57 @@ int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff,
   _mm_storel_epi64((__m128i *)(ssz), ssz_128);
   return sse;
 }
+
+int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff,
+                                const tran_low_t *dqcoeff, int block_size) {
+  int i;
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i sse_256 = zero;
+  __m256i sse_hi;
+  __m128i sse_128;
+  int64_t sse;
+
+  if (block_size == 16) {
+    // Load 16 elements for coeff and dqcoeff.
+    const __m256i _coeff = load_tran_low(coeff);
+    const __m256i _dqcoeff = load_tran_low(dqcoeff);
+    // dqcoeff - coeff
+    const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff);
+    // madd (dqcoeff - coeff)
+    const __m256i error_lo = _mm256_madd_epi16(diff, diff);
+    // Save the higher 64 bit of each 128 bit lane.
+    const __m256i error_hi = _mm256_srli_si256(error_lo, 8);
+    // Add the higher 64 bit to the low 64 bit.
+    const __m256i error = _mm256_add_epi32(error_lo, error_hi);
+    // Expand each double word in the lower 64 bits to quad word.
+    sse_256 = _mm256_unpacklo_epi32(error, zero);
+  } else {
+    for (i = 0; i < block_size; i += 16) {
+      // Load 16 elements for coeff and dqcoeff.
+      const __m256i _coeff = load_tran_low(coeff);
+      const __m256i _dqcoeff = load_tran_low(dqcoeff);
+      const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff);
+      const __m256i error = _mm256_madd_epi16(diff, diff);
+      // Expand each double word of madd (dqcoeff - coeff) to quad word.
+      const __m256i exp_error_lo = _mm256_unpacklo_epi32(error, zero);
+      const __m256i exp_error_hi = _mm256_unpackhi_epi32(error, zero);
+      // Add each quad word of madd (dqcoeff - coeff).
+      sse_256 = _mm256_add_epi64(sse_256, exp_error_lo);
+      sse_256 = _mm256_add_epi64(sse_256, exp_error_hi);
+      coeff += 16;
+      dqcoeff += 16;
+    }
+  }
+  // Save the higher 64 bit of each 128 bit lane.
+  sse_hi = _mm256_srli_si256(sse_256, 8);
+  // Add the higher 64 bit to the low 64 bit.
+  sse_256 = _mm256_add_epi64(sse_256, sse_hi);
+
+  // Add each 64 bit from each of the 128 bit lane of the 256 bit.
+  sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256),
+                          _mm256_extractf128_si256(sse_256, 1));
+
+  // Store the results.
+  _mm_storel_epi64((__m128i *)&sse, sse_128);
+  return sse;
+}
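
For reference (not code from the change itself): the sketch below spells out the scalar sum of squared differences that vp9_block_error_fp computes; the AVX2 kernel above produces the same total by forming the 16-bit differences with _mm256_sub_epi16, squaring and pair-summing them with _mm256_madd_epi16, and then widening and reducing the 32-bit partial sums to a single 64-bit value. The helper name block_error_fp_ref, the local tran_low_t typedef (int16_t, assuming a non-high-bitdepth build), and the tiny main() are illustrative assumptions, not libvpx code.

#include <stdint.h>
#include <stdio.h>

// Stand-in for libvpx's tran_low_t in a non-high-bitdepth build (assumption
// made so this sketch compiles on its own).
typedef int16_t tran_low_t;

// Scalar model of the kernel: accumulate (dqcoeff[i] - coeff[i])^2 over the
// whole block and return the 64-bit total.
static int64_t block_error_fp_ref(const tran_low_t *coeff,
                                  const tran_low_t *dqcoeff, int block_size) {
  int64_t error = 0;
  int i;
  for (i = 0; i < block_size; ++i) {
    const int diff = dqcoeff[i] - coeff[i];
    error += (int64_t)diff * diff;
  }
  return error;
}

int main(void) {
  tran_low_t coeff[16] = { 0 };
  tran_low_t dqcoeff[16] = { 0 };
  coeff[0] = 100;
  dqcoeff[0] = 98;  // one coefficient off by 2 -> expected error of 4
  printf("error: %lld\n", (long long)block_error_fp_ref(coeff, dqcoeff, 16));
  return 0;
}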