From: Yi Luo
Date: Tue, 28 Mar 2017 22:30:07 +0000 (-0700)
Subject: Add AVX2 optimization to copy/avg functions
X-Git-Tag: v1.7.0~560^2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=aa5a9419926abe3034d1791ed20cca853e5281db;p=libvpx

Add AVX2 optimization to copy/avg functions

Change-Id: Ibcef70e4fead74e2c2909330a7044a29381a8074
---

diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 0056c602b..8b339fadf 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -25,6 +25,7 @@
 #include "vpx_dsp/vpx_filter.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
+#include "vpx_ports/vpx_timer.h"
 
 namespace {
 
@@ -539,6 +540,46 @@ uint16_t *ConvolveTest::output16_ref_ = NULL;
 
 TEST_P(ConvolveTest, GuardBlocks) { CheckGuardBlocks(); }
 
+TEST_P(ConvolveTest, DISABLED_Copy_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->copy_[0](in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0,
+                   width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve_copy_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest, DISABLED_Avg_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->copy_[1](in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0,
+                   width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve_avg_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
 TEST_P(ConvolveTest, Copy) {
   uint8_t *const in = input();
   uint8_t *const out = output();
@@ -912,6 +953,17 @@ WRAP(convolve8_sse2, 12)
 WRAP(convolve8_avg_sse2, 12)
 #endif  // HAVE_SSE2 && ARCH_X86_64
 
+#if HAVE_AVX2
+WRAP(convolve_copy_avx2, 8)
+WRAP(convolve_avg_avx2, 8)
+
+WRAP(convolve_copy_avx2, 10)
+WRAP(convolve_avg_avx2, 10)
+
+WRAP(convolve_copy_avx2, 12)
+WRAP(convolve_avg_avx2, 12)
+#endif  // HAVE_AVX2
+
 #if HAVE_NEON
 WRAP(convolve_copy_neon, 8)
 WRAP(convolve_avg_neon, 8)
@@ -1057,18 +1109,48 @@ INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest,
                         ::testing::ValuesIn(kArrayConvolve8_ssse3));
 #endif
 
-#if HAVE_AVX2 && HAVE_SSSE3
+#if HAVE_AVX2
+#if CONFIG_VP9_HIGHBITDEPTH
+const ConvolveFunctions convolve8_avx2(
+    wrap_convolve_copy_avx2_8, wrap_convolve_avg_avx2_8,
+    wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8,
+    wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8,
+    wrap_convolve8_avg_c_8, wrap_convolve8_horiz_c_8,
+    wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_c_8,
+    wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8);
+const ConvolveFunctions convolve10_avx2(
+    wrap_convolve_copy_avx2_10, wrap_convolve_avg_avx2_10,
+    wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10,
+    wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10,
+    wrap_convolve8_avg_c_10, wrap_convolve8_horiz_c_10,
+    wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_c_10,
+    wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, wrap_convolve8_avg_c_10,
+    10);
+const ConvolveFunctions convolve12_avx2(
+    wrap_convolve_copy_avx2_12, wrap_convolve_avg_avx2_12,
+    wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12,
+    wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12,
+    wrap_convolve8_avg_c_12, wrap_convolve8_horiz_c_12,
+    wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12,
+    wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, wrap_convolve8_avg_c_12,
+    12);
+const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2),
+                                               ALL_SIZES(convolve10_avx2),
+                                               ALL_SIZES(convolve12_avx2) };
+INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest,
+                        ::testing::ValuesIn(kArrayConvolve8_avx2));
+#else  // !CONFIG_VP9_HIGHBITDEPTH
 const ConvolveFunctions convolve8_avx2(
     vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_avx2,
     vpx_convolve8_avg_horiz_ssse3, vpx_convolve8_vert_avx2,
     vpx_convolve8_avg_vert_ssse3, vpx_convolve8_avx2, vpx_convolve8_avg_ssse3,
     vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c,
     vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
-
 const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2) };
 INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest,
                         ::testing::ValuesIn(kArrayConvolve8_avx2));
-#endif  // HAVE_AVX2 && HAVE_SSSE3
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_AVX2
 
 #if HAVE_NEON
 #if CONFIG_VP9_HIGHBITDEPTH
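A note on the test additions above. The DISABLED_ prefix keeps the two speed tests out of normal gtest runs; they can be exercised on demand with gtest's standard runtime flags, e.g. (assuming the usual test_libvpx binary):

    ./test_libvpx --gtest_also_run_disabled_tests \
        --gtest_filter='*Copy_Speed*:*Avg_Speed*'

The WRAP macro used for the new AVX2 entries is the harness's existing adapter that binds a fixed bit depth onto the extra bd argument of the vpx_highbd_* signature. Roughly (a paraphrase of the existing macro in convolve_test.cc, not part of this patch):

    #define WRAP(func, bd)                                                  \
      void wrap_##func##_##bd(                                              \
          const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,           \
          ptrdiff_t dst_stride, const int16_t *filter_x,                    \
          int filter_x_stride, const int16_t *filter_y,                     \
          int filter_y_stride, int w, int h) {                              \
        /* Forward to the high-bitdepth kernel with bd baked in. */         \
        vpx_highbd_##func(src, src_stride, dst, dst_stride, filter_x,       \
                          filter_x_stride, filter_y, filter_y_stride, w, h, \
                          bd);                                              \
      }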
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 73c50fd3d..8475eca49 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -95,6 +95,7 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_intrin_ssse3.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_8t_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_bilinear_sse2.asm
+DSP_SRCS-$(HAVE_AVX2) += x86/highbd_convolve_avx2.c
 DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_copy_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_avg_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve8_neon.c
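The new source is compiled only when both CONFIG_VP9_HIGHBITDEPTH and HAVE_AVX2 hold. A build that exercises this path, assuming a standard x86-64 toolchain where configure enables AVX2 by default, might look like:

    ./configure --enable-vp9-highbitdepth
    make
    make test   # builds and runs test_libvpx, including the AVX2 ConvolveTest cases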
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 5c2ba1cc5..2f7e1e430 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -373,10 +373,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   # Sub Pixel Filters
   #
   add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vpx_highbd_convolve_copy sse2 neon/;
+  specialize qw/vpx_highbd_convolve_copy sse2 avx2 neon/;
 
   add_proto qw/void vpx_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vpx_highbd_convolve_avg sse2 neon/;
+  specialize qw/vpx_highbd_convolve_avg sse2 avx2 neon/;
 
   add_proto qw/void vpx_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
   specialize qw/vpx_highbd_convolve8 neon/, "$sse2_x86_64";
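Adding avx2 to the specialize lists is what hooks the new kernels into libvpx's run-time CPU dispatch: the generated vpx_dsp_rtcd.h initializes each function pointer to the best variant the CPU supports. A minimal sketch of the generated setup code (illustrative only; the real header is emitted by the rtcd scripts from this .pl file):

    static void setup_rtcd_internal(void) {
      int flags = x86_simd_caps();  /* from vpx_ports/x86.h */
      vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_c;
      if (flags & HAS_SSE2)
        vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_sse2;
      if (flags & HAS_AVX2)
        vpx_highbd_convolve_copy = vpx_highbd_convolve_copy_avx2;
      /* vpx_highbd_convolve_avg is wired up the same way. */
    }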
diff --git a/vpx_dsp/x86/highbd_convolve_avx2.c b/vpx_dsp/x86/highbd_convolve_avx2.c
new file mode 100644
index 000000000..75589d32a
--- /dev/null
+++ b/vpx_dsp/x86/highbd_convolve_avx2.c
@@ -0,0 +1,192 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/convolve.h"
+
+// -----------------------------------------------------------------------------
+// Copy and average
+
+void vpx_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride,
+                                   uint8_t *dst8, ptrdiff_t dst_stride,
+                                   const int16_t *filter_x,
+                                   int filter_x_stride,
+                                   const int16_t *filter_y,
+                                   int filter_y_stride, int width, int h,
+                                   int bd) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  (void)filter_x;
+  (void)filter_y;
+  (void)filter_x_stride;
+  (void)filter_y_stride;
+  (void)bd;
+
+  assert(width % 4 == 0);
+  if (width > 32) {  // width = 64
+    do {
+      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
+      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+      const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
+      const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
+      src += src_stride;
+      _mm256_storeu_si256((__m256i *)dst, p0);
+      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
+      _mm256_storeu_si256((__m256i *)(dst + 32), p2);
+      _mm256_storeu_si256((__m256i *)(dst + 48), p3);
+      dst += dst_stride;
+      h--;
+    } while (h > 0);
+  } else if (width > 16) {  // width = 32
+    do {
+      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
+      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+      src += src_stride;
+      _mm256_storeu_si256((__m256i *)dst, p0);
+      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
+      dst += dst_stride;
+      h--;
+    } while (h > 0);
+  } else if (width > 8) {  // width = 16
+    __m256i p0, p1;
+    do {
+      p0 = _mm256_loadu_si256((const __m256i *)src);
+      src += src_stride;
+      p1 = _mm256_loadu_si256((const __m256i *)src);
+      src += src_stride;
+
+      _mm256_storeu_si256((__m256i *)dst, p0);
+      dst += dst_stride;
+      _mm256_storeu_si256((__m256i *)dst, p1);
+      dst += dst_stride;
+      h -= 2;
+    } while (h > 0);
+  } else if (width > 4) {  // width = 8
+    __m128i p0, p1;
+    do {
+      p0 = _mm_loadu_si128((const __m128i *)src);
+      src += src_stride;
+      p1 = _mm_loadu_si128((const __m128i *)src);
+      src += src_stride;
+
+      _mm_storeu_si128((__m128i *)dst, p0);
+      dst += dst_stride;
+      _mm_storeu_si128((__m128i *)dst, p1);
+      dst += dst_stride;
+      h -= 2;
+    } while (h > 0);
+  } else {  // width = 4
+    __m128i p0, p1;
+    do {
+      p0 = _mm_loadl_epi64((const __m128i *)src);
+      src += src_stride;
+      p1 = _mm_loadl_epi64((const __m128i *)src);
+      src += src_stride;
+
+      _mm_storel_epi64((__m128i *)dst, p0);
+      dst += dst_stride;
+      _mm_storel_epi64((__m128i *)dst, p1);
+      dst += dst_stride;
+      h -= 2;
+    } while (h > 0);
+  }
+}
+
+void vpx_highbd_convolve_avg_avx2(const uint8_t *src8, ptrdiff_t src_stride,
+                                  uint8_t *dst8, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x,
+                                  int filter_x_stride,
+                                  const int16_t *filter_y,
+                                  int filter_y_stride, int width, int h,
+                                  int bd) {
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  (void)filter_x;
+  (void)filter_y;
+  (void)filter_x_stride;
+  (void)filter_y_stride;
+  (void)bd;
+
+  assert(width % 4 == 0);
+  if (width > 32) {  // width = 64
+    __m256i p0, p1, p2, p3, u0, u1, u2, u3;
+    do {
+      p0 = _mm256_loadu_si256((const __m256i *)src);
+      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+      p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
+      p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
+      src += src_stride;
+      u0 = _mm256_loadu_si256((const __m256i *)dst);
+      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
+      u2 = _mm256_loadu_si256((const __m256i *)(dst + 32));
+      u3 = _mm256_loadu_si256((const __m256i *)(dst + 48));
+      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
+      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
+      _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2));
+      _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3));
+      dst += dst_stride;
+      h--;
+    } while (h > 0);
+  } else if (width > 16) {  // width = 32
+    __m256i p0, p1, u0, u1;
+    do {
+      p0 = _mm256_loadu_si256((const __m256i *)src);
+      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+      src += src_stride;
+      u0 = _mm256_loadu_si256((const __m256i *)dst);
+      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
+      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
+      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
+      dst += dst_stride;
+      h--;
+    } while (h > 0);
+  } else if (width > 8) {  // width = 16
+    __m256i p0, p1, u0, u1;
+    do {
+      p0 = _mm256_loadu_si256((const __m256i *)src);
+      p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride));
+      src += src_stride << 1;
+      u0 = _mm256_loadu_si256((const __m256i *)dst);
+      u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride));
+
+      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
+      _mm256_storeu_si256((__m256i *)(dst + dst_stride),
+                          _mm256_avg_epu16(p1, u1));
+      dst += dst_stride << 1;
+      h -= 2;
+    } while (h > 0);
+  } else if (width > 4) {  // width = 8
+    __m128i p0, p1, u0, u1;
+    do {
+      p0 = _mm_loadu_si128((const __m128i *)src);
+      p1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
+      src += src_stride << 1;
+      u0 = _mm_loadu_si128((const __m128i *)dst);
+      u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride));
+
+      _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0));
+      _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1));
+      dst += dst_stride << 1;
+      h -= 2;
+    } while (h > 0);
+  } else {  // width = 4
+    __m128i p0, p1, u0, u1;
+    do {
+      p0 = _mm_loadl_epi64((const __m128i *)src);
+      p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride));
+      src += src_stride << 1;
+      u0 = _mm_loadl_epi64((const __m128i *)dst);
+      u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride));
+
+      _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0));
+      _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1));
+      dst += dst_stride << 1;
+      h -= 2;
+    } while (h > 0);
+  }
+}
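For callers, the new kernels keep the common vpx convolve signature; high-bitdepth pixel buffers are uint16_t arrays passed through the CONVERT_TO_BYTEPTR pointer-punning macro from vpx_ports/mem.h, and copy/avg ignore the filter arguments, so they may be NULL. A minimal usage sketch with hypothetical buffers and strides (not part of the patch):

    #include "./vpx_dsp_rtcd.h"
    #include "vpx_ports/mem.h"  /* CONVERT_TO_BYTEPTR, DECLARE_ALIGNED */

    DECLARE_ALIGNED(32, uint16_t, src[64 * 64]);
    DECLARE_ALIGNED(32, uint16_t, dst[64 * 64]);

    /* 10-bit 64x64 block: averages src with the current contents of dst
     * and writes the result back to dst (rounding average per lane). */
    vpx_highbd_convolve_avg_avx2(CONVERT_TO_BYTEPTR(src), 64,
                                 CONVERT_TO_BYTEPTR(dst), 64,
                                 NULL, 0, NULL, 0, 64, 64, 10);

The loads and stores are unaligned variants, so the DECLARE_ALIGNED above is conventional rather than required.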