From e022d5b71ffca486b5bc174702a9fe0e35038c75 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Tue, 20 Dec 2022 15:43:44 -0800 Subject: [PATCH] [x86]: Add vpx_highbd_comp_avg_pred_sse2(). C vs SSE2 4x4: 3.38x 8x8: 3.45x 16x16: 2.06x 32x32: 2.19x 64x64: 1.39x Change-Id: I46638fe187b49a78fee554114fac51c485d74474 --- test/comp_avg_pred_test.cc | 8 ++++- vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/highbd_variance_sse2.c | 47 ++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc index 66dc4eb4e..70aeab8d7 100644 --- a/test/comp_avg_pred_test.cc +++ b/test/comp_avg_pred_test.cc @@ -185,7 +185,7 @@ void AvgPredTest::TestSpeed() { vpx_usec_timer timer; vpx_usec_timer_start(&timer); - for (int i = 0; i < 10000000 / (width * height); ++i) { + for (int i = 0; i < 100000000 / (width * height); ++i) { avg_pred_func_((uint8_t *)avg.TopLeftPixel(), (uint8_t *)pred.TopLeftPixel(), width, height, (uint8_t *)ref.TopLeftPixel(), ref.stride()); @@ -254,5 +254,11 @@ INSTANTIATE_TEST_SUITE_P( C, AvgPredTestHBD, ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_c>)); +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, AvgPredTestHBD, + ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_sse2>)); +#endif // HAVE_SSE2 + #endif // CONFIG_VP9_HIGHBITDEPTH } // namespace diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index b6d656820..8725821b6 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1400,7 +1400,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_12_mse8x8 sse2 neon/; add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride"; - specialize qw/vpx_highbd_comp_avg_pred neon/; + specialize qw/vpx_highbd_comp_avg_pred neon sse2/; # # Subpixel Variance diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c index 
7c8d79b09..381e0ad19 100644 --- a/vpx_dsp/x86/highbd_variance_sse2.c +++ b/vpx_dsp/x86/highbd_variance_sse2.c @@ -7,6 +7,7 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#include <emmintrin.h> // SSE2 #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" @@ -559,3 +560,49 @@ FNS(sse2) #undef FNS #undef FN + +void vpx_highbd_comp_avg_pred_sse2(uint16_t *comp_pred, const uint16_t *pred, + int width, int height, const uint16_t *ref, + int ref_stride) { + int i, j; + if (width > 8) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 16) { + const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[j]); + const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[j + 8]); + const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[j]); + const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[j + 8]); + _mm_storeu_si128((__m128i *)&comp_pred[j], _mm_avg_epu16(p0, r0)); + _mm_storeu_si128((__m128i *)&comp_pred[j + 8], _mm_avg_epu16(p1, r1)); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } else if (width == 8) { + for (i = 0; i < height; i += 2) { + const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[0]); + const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[8]); + const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[0]); + const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[ref_stride]); + _mm_storeu_si128((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0)); + _mm_storeu_si128((__m128i *)&comp_pred[8], _mm_avg_epu16(p1, r1)); + comp_pred += 8 << 1; + pred += 8 << 1; + ref += ref_stride << 1; + } + } else { + assert(width == 4); + for (i = 0; i < height; i += 2) { + const __m128i p0 = _mm_loadl_epi64((const __m128i *)&pred[0]); + const __m128i p1 = _mm_loadl_epi64((const __m128i *)&pred[4]); + const __m128i r0 = _mm_loadl_epi64((const __m128i *)&ref[0]); + const __m128i r1 = _mm_loadl_epi64((const __m128i *)&ref[ref_stride]); + 
_mm_storel_epi64((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0)); + _mm_storel_epi64((__m128i *)&comp_pred[4], _mm_avg_epu16(p1, r1)); + comp_pred += 4 << 1; + pred += 4 << 1; + ref += ref_stride << 1; + } + } +} -- 2.40.0