granicus.if.org Git - libvpx/commitdiff
[x86]: Add vpx_highbd_comp_avg_pred_sse2().
author Scott LaVarnway <slavarnway@google.com>
Tue, 20 Dec 2022 23:43:44 +0000 (15:43 -0800)
committer Scott LaVarnway <slavarnway@google.com>
Tue, 20 Dec 2022 23:59:20 +0000 (15:59 -0800)
C vs SSE2

4x4: 3.38x
8x8: 3.45x
16x16: 2.06x
32x32: 2.19x
64x64: 1.39x

Change-Id: I46638fe187b49a78fee554114fac51c485d74474

test/comp_avg_pred_test.cc
vpx_dsp/vpx_dsp_rtcd_defs.pl
vpx_dsp/x86/highbd_variance_sse2.c

index 66dc4eb4e1ba485f6e54b2329f24ee65a820dc47..70aeab8d7e35c59ecbb89a30a107535bba7d47f0 100644 (file)
@@ -185,7 +185,7 @@ void AvgPredTest<bitdepth, Pixel>::TestSpeed() {
 
         vpx_usec_timer timer;
         vpx_usec_timer_start(&timer);
-        for (int i = 0; i < 10000000 / (width * height); ++i) {
+        for (int i = 0; i < 100000000 / (width * height); ++i) {
           avg_pred_func_((uint8_t *)avg.TopLeftPixel(),
                          (uint8_t *)pred.TopLeftPixel(), width, height,
                          (uint8_t *)ref.TopLeftPixel(), ref.stride());
@@ -254,5 +254,11 @@ INSTANTIATE_TEST_SUITE_P(
     C, AvgPredTestHBD,
     ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_c>));
 
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, AvgPredTestHBD,
+    ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_sse2>));
+#endif  // HAVE_SSE2
+
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
index b6d656820faeeeb9cb520a0a7ec793e6aaa707f0..8725821b67b76c42cded87281a5109d52d4feabc 100644 (file)
@@ -1400,7 +1400,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_highbd_12_mse8x8 sse2 neon/;
 
   add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride";
-  specialize qw/vpx_highbd_comp_avg_pred neon/;
+  specialize qw/vpx_highbd_comp_avg_pred neon sse2/;
 
   #
   # Subpixel Variance
index 7c8d79b09efb61f439fdaba23639da3c5aa9f440..381e0ad19337b64d4c281ff6a7335e0dd3597665 100644 (file)
@@ -7,6 +7,7 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include <emmintrin.h>  // SSE2
 
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
@@ -559,3 +560,49 @@ FNS(sse2)
 
 #undef FNS
 #undef FN
+
+void vpx_highbd_comp_avg_pred_sse2(uint16_t *comp_pred, const uint16_t *pred,
+                                   int width, int height, const uint16_t *ref,
+                                   int ref_stride) {  // comp_pred = (pred + ref + 1) >> 1 per 16-bit pixel (_mm_avg_epu16)
+  int i, j;  // i: row counter; j: column offset (used by the wide path only)
+  if (width > 8) {  // wide rows: 16 pixels (two 8-lane vectors) per inner step
+    for (i = 0; i < height; ++i) {
+      for (j = 0; j < width; j += 16) {  // assumes width is a multiple of 16 -- TODO confirm
+        const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[j]);
+        const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[j + 8]);
+        const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[j]);
+        const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[j + 8]);
+        _mm_storeu_si128((__m128i *)&comp_pred[j], _mm_avg_epu16(p0, r0));
+        _mm_storeu_si128((__m128i *)&comp_pred[j + 8], _mm_avg_epu16(p1, r1));
+      }
+      comp_pred += width;  // comp_pred and pred are packed: row pitch == width
+      pred += width;
+      ref += ref_stride;  // ref is stepped by its own row pitch
+    }
+  } else if (width == 8) {  // one 8-lane vector per row, two rows per iteration
+    for (i = 0; i < height; i += 2) {  // assumes height is even -- TODO confirm
+      const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[0]);
+      const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[8]);  // second pred row (pitch 8)
+      const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[0]);
+      const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[ref_stride]);  // second ref row
+      _mm_storeu_si128((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0));
+      _mm_storeu_si128((__m128i *)&comp_pred[8], _mm_avg_epu16(p1, r1));
+      comp_pred += 8 << 1;  // advance two packed 8-pixel rows
+      pred += 8 << 1;
+      ref += ref_stride << 1;  // advance two ref rows
+    }
+  } else {  // 4-wide path: each 64-bit load/store moves one 4-pixel row
+    assert(width == 4);  // NOTE(review): needs <assert.h>; no such include is visible in this hunk -- confirm
+    for (i = 0; i < height; i += 2) {  // assumes height is even -- TODO confirm
+      const __m128i p0 = _mm_loadl_epi64((const __m128i *)&pred[0]);
+      const __m128i p1 = _mm_loadl_epi64((const __m128i *)&pred[4]);  // second pred row (pitch 4)
+      const __m128i r0 = _mm_loadl_epi64((const __m128i *)&ref[0]);
+      const __m128i r1 = _mm_loadl_epi64((const __m128i *)&ref[ref_stride]);  // second ref row
+      _mm_storel_epi64((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0));  // only the low 4 lanes are written
+      _mm_storel_epi64((__m128i *)&comp_pred[4], _mm_avg_epu16(p1, r1));
+      comp_pred += 4 << 1;  // advance two packed 4-pixel rows
+      pred += 4 << 1;
+      ref += ref_stride << 1;  // advance two ref rows
+    }
+  }
+}