Optimize convolve8 SSSE3 and AVX2 intrinsics

author Kyle Siefring <kylesiefring@gmail.com>

Sun, 22 Oct 2017 23:34:19 +0000 (19:34 -0400)

committer Kyle Siefring <kylesiefring@gmail.com>

Tue, 24 Oct 2017 14:39:48 +0000 (10:39 -0400)
author Kyle Siefring <kylesiefring@gmail.com>
Sun, 22 Oct 2017 23:34:19 +0000 (19:34 -0400)
committer Kyle Siefring <kylesiefring@gmail.com>
Tue, 24 Oct 2017 14:39:48 +0000 (10:39 -0400)
diff --git a/test/convolve_test.cc b/test/convolve_test.cc

index 08ef5722445363c61fe1f7ae3ee709124c33c741..c4e6f8c54000662a4074ef8df61f8737b0857b98 100644 (file)
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -603,6 +603,75 @@ TEST_P(ConvolveTest, DISABLED_Scale_Speed) {
           UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
  }
  
+TEST_P(ConvolveTest, DISABLED_8Tap_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP];
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  SetConstantInput(127);
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->hv8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
+                  width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve8_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest, DISABLED_8Tap_Horiz_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP];
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  SetConstantInput(127);
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->h8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
+                 width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve8_horiz_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest, DISABLED_8Tap_Vert_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP];
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  SetConstantInput(127);
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->v8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
+                 width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve8_vert_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
  TEST_P(ConvolveTest, DISABLED_8Tap_Avg_Speed) {
    const uint8_t *const in = input();
    uint8_t *const out = output();
diff --git a/vpx_dsp/x86/convolve_avx2.h b/vpx_dsp/x86/convolve_avx2.h

index c2e83b53f2d0784eb978153fe7e754c1d87e4c46..bc96b738f465708cb46250c650c13e1039452c39 100644 (file)
--- a/vpx_dsp/x86/convolve_avx2.h
+++ b/vpx_dsp/x86/convolve_avx2.h
@@ -58,16 +58,19 @@ static INLINE __m256i convolve8_16_avx2(const __m256i *const s,
    const __m256i x1 = _mm256_maddubs_epi16(s[1], f[1]);
    const __m256i x2 = _mm256_maddubs_epi16(s[2], f[2]);
    const __m256i x3 = _mm256_maddubs_epi16(s[3], f[3]);
-  // add and saturate the results together
-  const __m256i min_x2x1 = _mm256_min_epi16(x2, x1);
-  const __m256i max_x2x1 = _mm256_max_epi16(x2, x1);
-  __m256i temp = _mm256_adds_epi16(x0, x3);
-  temp = _mm256_adds_epi16(temp, min_x2x1);
-  temp = _mm256_adds_epi16(temp, max_x2x1);
+  __m256i sum1, sum2;
+
+  // sum the results together, saturating only on the final step
+  // adding x0 with x2 and x1 with x3 is the only order that prevents
+  // outranges for all filters
+  sum1 = _mm256_add_epi16(x0, x2);
+  sum2 = _mm256_add_epi16(x1, x3);
+  // add the rounding offset early to avoid another saturated add
+  sum1 = _mm256_add_epi16(sum1, k_64);
+  sum1 = _mm256_adds_epi16(sum1, sum2);
    // round and shift by 7 bit each 16 bit
-  temp = _mm256_adds_epi16(temp, k_64);
-  temp = _mm256_srai_epi16(temp, 7);
-  return temp;
+  sum1 = _mm256_srai_epi16(sum1, 7);
+  return sum1;
  }
  
  static INLINE __m128i convolve8_8_avx2(const __m256i *const s,
@@ -82,16 +85,19 @@ static INLINE __m128i convolve8_8_avx2(const __m256i *const s,
                                         _mm256_castsi256_si128(f[2]));
    const __m128i x3 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[3]),
                                         _mm256_castsi256_si128(f[3]));
-  // add and saturate the results together
-  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
-  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
-  __m128i temp = _mm_adds_epi16(x0, x3);
-  temp = _mm_adds_epi16(temp, min_x2x1);
-  temp = _mm_adds_epi16(temp, max_x2x1);
-  // round and shift by 7 bit each 16 bit
-  temp = _mm_adds_epi16(temp, k_64);
-  temp = _mm_srai_epi16(temp, 7);
-  return temp;
+  __m128i sum1, sum2;
+
+  // sum the results together, saturating only on the final step
+  // adding x0 with x2 and x1 with x3 is the only order that prevents
+  // outranges for all filters
+  sum1 = _mm_add_epi16(x0, x2);
+  sum2 = _mm_add_epi16(x1, x3);
+  // add the rounding offset early to avoid another saturated add
+  sum1 = _mm_add_epi16(sum1, k_64);
+  sum1 = _mm_adds_epi16(sum1, sum2);
+  // shift by 7 bit each 16 bit
+  sum1 = _mm_srai_epi16(sum1, 7);
+  return sum1;
  }
  
  #undef MM256_BROADCASTSI128_SI256
diff --git a/vpx_dsp/x86/convolve_ssse3.h b/vpx_dsp/x86/convolve_ssse3.h

index 8da28f0b29731d2bcd0942bfe9d6acb5417093d4..e5d452f99ea4e32645073a8c5bc9b5042282af85 100644 (file)
--- a/vpx_dsp/x86/convolve_ssse3.h
+++ b/vpx_dsp/x86/convolve_ssse3.h
@@ -48,16 +48,19 @@ static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
    const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
    const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
    const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
-  // add and saturate the results together
-  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
-  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
-  __m128i temp = _mm_adds_epi16(x0, x3);
-  temp = _mm_adds_epi16(temp, min_x2x1);
-  temp = _mm_adds_epi16(temp, max_x2x1);
-  // round and shift by 7 bit each 16 bit
-  temp = _mm_adds_epi16(temp, k_64);
-  temp = _mm_srai_epi16(temp, 7);
-  return temp;
+  __m128i sum1, sum2;
+
+  // sum the results together, saturating only on the final step
+  // adding x0 with x2 and x1 with x3 is the only order that prevents
+  // outranges for all filters
+  sum1 = _mm_add_epi16(x0, x2);
+  sum2 = _mm_add_epi16(x1, x3);
+  // add the rounding offset early to avoid another saturated add
+  sum1 = _mm_add_epi16(sum1, k_64);
+  sum1 = _mm_adds_epi16(sum1, sum2);
+  // shift by 7 bit each 16 bit
+  sum1 = _mm_srai_epi16(sum1, 7);
+  return sum1;
  }
  
  static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s,
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c

index 5a94c69b57dc363e5fa6e1c4a0de048ccc2d0f20..5b16022d4c53f7b6c4e0c278145344358d3db064 100644 (file)
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -38,8 +38,8 @@ void vpx_filter_block1d4_h8_intrin_ssse3(
      const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
      ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
    __m128i firstFilters, secondFilters, shuffle1, shuffle2;
-  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
-  __m128i addFilterReg64, filtersReg, srcReg, minReg;
+  __m128i srcRegFilt1, srcRegFilt2;
+  __m128i addFilterReg64, filtersReg, srcReg;
    unsigned int i;
  
    // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
@@ -75,18 +75,16 @@ void vpx_filter_block1d4_h8_intrin_ssse3(
      srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
      srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
  
-    // extract the higher half of the lane
-    srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
-    srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
+    // sum the results together, saturating only on the final step
+    // the specific order of the additions prevents outranges
+    srcRegFilt1 = _mm_add_epi16(srcRegFilt1, srcRegFilt2);
  
-    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
+    // extract the higher half of the register
+    srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8);
  
-    // add and saturate all the results together
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+    // add the rounding offset early to avoid another saturated add
+    srcRegFilt1 = _mm_add_epi16(srcRegFilt1, addFilterReg64);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
  
      // shift by 7 bit each 16 bits
      srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
author	Kyle Siefring <kylesiefring@gmail.com>
	Sun, 22 Oct 2017 23:34:19 +0000 (19:34 -0400)
committer	Kyle Siefring <kylesiefring@gmail.com>
	Tue, 24 Oct 2017 14:39:48 +0000 (10:39 -0400)
test/convolve_test.cc		patch \| blob \| history
vpx_dsp/x86/convolve_avx2.h		patch \| blob \| history
vpx_dsp/x86/convolve_ssse3.h		patch \| blob \| history
vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c		patch \| blob \| history