From: Johann Date: Thu, 23 Mar 2017 21:54:48 +0000 (-0700) Subject: vpx_comp_avg_pred: sse2 optimization X-Git-Tag: v1.7.0~564^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=28a8622143e03359cda6b9f2ae603015c501ab87;p=libvpx vpx_comp_avg_pred: sse2 optimization Provides over 15x speedup for width > 8. Due to smaller loads and shifting for width == 8 it gets about 8x speedup. For width == 4 it's only about 4x speedup because there is a lot of shuffling and shifting to get the data properly situated. BUG=webm:1390 Change-Id: Ice0b3dbbf007be3d9509786a61e7f35e94bdffa8 --- diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc index bdc0e70a2..2aed7cba1 100644 --- a/test/comp_avg_pred_test.cc +++ b/test/comp_avg_pred_test.cc @@ -60,25 +60,30 @@ TEST_P(AvgPredTest, SizeCombinations) { // Don't test 4x2 or 64x128 if (height_pow == 1 || height_pow == 7) continue; - const int width = 1 << width_pow; - const int height = 1 << height_pow; - Buffer pred = Buffer(width, height, 0); - // Only the reference buffer may have a stride not equal to width. - Buffer ref = Buffer(width, height, 8); - Buffer avg_ref = Buffer(width, height, 0); - Buffer avg_chk = Buffer(width, height, 0); - - pred.Set(&rnd_, &ACMRandom::Rand8); - ref.Set(&rnd_, &ACMRandom::Rand8); - - reference_pred(pred, ref, width, height, &avg_ref); - ASM_REGISTER_STATE_CHECK( - avg_pred_func_(avg_chk.TopLeftPixel(), pred.TopLeftPixel(), width, - height, ref.TopLeftPixel(), ref.stride())); - EXPECT_TRUE(avg_chk.CheckValues(avg_ref)); - if (HasFailure()) { - avg_chk.PrintDifference(avg_ref); - return; + // The sse2 special-cases when ref width == stride, so make sure to test + // it. + for (int ref_padding = 0; ref_padding < 2; ref_padding++) { + const int width = 1 << width_pow; + const int height = 1 << height_pow; + Buffer pred = Buffer(width, height, 0); + // Only the reference buffer may have a stride not equal to width. + Buffer ref = + Buffer(width, height, ref_padding ? 
8 : 0); + Buffer avg_ref = Buffer(width, height, 0); + Buffer avg_chk = Buffer(width, height, 0); + + pred.Set(&rnd_, &ACMRandom::Rand8); + ref.Set(&rnd_, &ACMRandom::Rand8); + + reference_pred(pred, ref, width, height, &avg_ref); + ASM_REGISTER_STATE_CHECK( + avg_pred_func_(avg_chk.TopLeftPixel(), pred.TopLeftPixel(), width, + height, ref.TopLeftPixel(), ref.stride())); + EXPECT_TRUE(avg_chk.CheckValues(avg_ref)); + if (HasFailure()) { + avg_chk.PrintDifference(avg_ref); + return; + } } } } @@ -115,26 +120,30 @@ TEST_P(AvgPredTest, DISABLED_Speed) { // Don't test 4x2 or 64x128 if (height_pow == 1 || height_pow == 7) continue; - const int width = 1 << width_pow; - const int height = 1 << height_pow; - Buffer pred = Buffer(width, height, 0); - Buffer ref = Buffer(width, height, 8); - Buffer avg = Buffer(width, height, 0); - - pred.Set(&rnd_, &ACMRandom::Rand8); - ref.Set(&rnd_, &ACMRandom::Rand8); - - vpx_usec_timer timer; - vpx_usec_timer_start(&timer); - for (int i = 0; i < 100000; ++i) { - avg_pred_func_(avg.TopLeftPixel(), pred.TopLeftPixel(), width, height, - ref.TopLeftPixel(), ref.stride()); + for (int ref_padding = 0; ref_padding < 2; ref_padding++) { + const int width = 1 << width_pow; + const int height = 1 << height_pow; + Buffer pred = Buffer(width, height, 0); + Buffer ref = + Buffer(width, height, ref_padding ? 
8 : 0); + Buffer avg = Buffer(width, height, 0); + + pred.Set(&rnd_, &ACMRandom::Rand8); + ref.Set(&rnd_, &ACMRandom::Rand8); + + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < 10000000 / (width * height); ++i) { + avg_pred_func_(avg.TopLeftPixel(), pred.TopLeftPixel(), width, height, + ref.TopLeftPixel(), ref.stride()); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = + static_cast<int>(vpx_usec_timer_elapsed(&timer)); + printf("Average Test (ref_padding: %d) %dx%d time: %5d us\n", + ref_padding, width, height, elapsed_time); } - vpx_usec_timer_mark(&timer); - - const int elapsed_time = - static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000); - printf("Average Test %dx%d time: %5d ms\n", width, height, elapsed_time); } } } @@ -142,10 +151,8 @@ TEST_P(AvgPredTest, DISABLED_Speed) { INSTANTIATE_TEST_CASE_P(C, AvgPredTest, ::testing::Values(&vpx_comp_avg_pred_c)); -/* TODO(johannkoenig): https://bugs.chromium.org/p/webm/issues/detail?id=1390 #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P(SSE2, AvgPredTest, ::testing::Values(&vpx_comp_avg_pred_sse2)); #endif // HAVE_SSE2 -*/ } // namespace diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c index 421415025..b1744047a 100644 --- a/vpx_dsp/variance.c +++ b/vpx_dsp/variance.c @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <assert.h> + #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" @@ -224,6 +226,9 @@ MSE(8, 8) void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride) { int i, j; + /* comp_pred and pred must be 16 byte aligned. 
*/ + assert(((intptr_t)comp_pred & 0xf) == 0); + assert(((intptr_t)pred & 0xf) == 0); for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 73c50fd3d..7604f7da2 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -320,6 +320,7 @@ DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c DSP_SRCS-$(HAVE_SSE) += x86/variance_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/avg_pred_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3 DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 5c2ba1cc5..201af15bd 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1303,6 +1303,7 @@ add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int specialize qw/vpx_get4x4sse_cs neon msa/; add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; + specialize qw/vpx_comp_avg_pred sse2/; # # Subpixel Variance diff --git a/vpx_dsp/x86/avg_pred_sse2.c b/vpx_dsp/x86/avg_pred_sse2.c new file mode 100644 index 000000000..f83b26490 --- /dev/null +++ b/vpx_dsp/x86/avg_pred_sse2.c @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ +#include <assert.h> +#include <emmintrin.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +void vpx_comp_avg_pred_sse2(uint8_t *comp, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + /* comp and pred must be 16 byte aligned. */ + assert(((intptr_t)comp & 0xf) == 0); + assert(((intptr_t)pred & 0xf) == 0); + if (width > 8) { + int x, y; + for (y = 0; y < height; ++y) { + for (x = 0; x < width; x += 16) { + const __m128i p = _mm_load_si128((const __m128i *)(pred + x)); + const __m128i r = _mm_loadu_si128((const __m128i *)(ref + x)); + const __m128i avg = _mm_avg_epu8(p, r); + _mm_store_si128((__m128i *)(comp + x), avg); + } + comp += width; + pred += width; + ref += ref_stride; + } + } else { // width must be 4 or 8. + int i; + // Process 16 elements at a time. comp and pred have width == stride and + // therefore live in contiguous memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are all + // divisible by 16 so just ref needs to be massaged when loading. + for (i = 0; i < width * height; i += 16) { + const __m128i p = _mm_load_si128((const __m128i *)pred); + __m128i r; + __m128i avg; + if (width == ref_stride) { + r = _mm_loadu_si128((const __m128i *)ref); + ref += 16; + } else if (width == 4) { + r = _mm_set_epi32(*(const uint32_t *)(ref + 3 * ref_stride), + *(const uint32_t *)(ref + 2 * ref_stride), + *(const uint32_t *)(ref + ref_stride), + *(const uint32_t *)(ref)); + + ref += 4 * ref_stride; + } else { + const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref); + assert(width == 8); + r = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(r_0), + (const __m64 *)(ref + ref_stride))); + + ref += 2 * ref_stride; + } + avg = _mm_avg_epu8(p, r); + _mm_store_si128((__m128i *)comp, avg); + + pred += 16; + comp += 16; + } + } +}