Check(expected);
}
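+// Speed check for the SATD implementations; disabled by default. Run with
+// --gtest_also_run_disabled_tests to print the per-blocksize timings.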
+TEST_P(SatdTest, DISABLED_Speed) {
+  const int kCountSpeedTestBlock = 20000;
+  vpx_usec_timer timer;
+  DECLARE_ALIGNED(16, tran_low_t, coeff[1024]);
+  const int blocksize = GET_PARAM(0);
+  // Fill the buffer so the timed calls do not read uninitialized memory.
+  for (int i = 0; i < blocksize; ++i) coeff[i] = static_cast<tran_low_t>(i & 0xff);
+
+  vpx_usec_timer_start(&timer);
+  for (int i = 0; i < kCountSpeedTestBlock; ++i) {
+    GET_PARAM(1)(coeff, blocksize);
+  }
+  vpx_usec_timer_mark(&timer);
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time);
+}
+
TEST_P(BlockErrorTestFP, MinValue) {
const int64_t kMin = -32640;
const int64_t expected = kMin * kMin * txfm_size_;
#endif // HAVE_SSE2
#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2, SatdTest,
+                        ::testing::Values(make_tuple(16, &vpx_satd_avx2),
+                                          make_tuple(64, &vpx_satd_avx2),
+                                          make_tuple(256, &vpx_satd_avx2),
+                                          make_tuple(1024, &vpx_satd_avx2)));
+
INSTANTIATE_TEST_CASE_P(
    AVX2, BlockErrorTestFP,
    ::testing::Values(make_tuple(16, &vp9_block_error_fp_avx2),
                      make_tuple(64, &vp9_block_error_fp_avx2),
                      make_tuple(256, &vp9_block_error_fp_avx2),
                      make_tuple(1024, &vp9_block_error_fp_avx2)));
#endif  // HAVE_AVX2
#if HAVE_NEON
INSTANTIATE_TEST_CASE_P(
specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx/;
add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
- specialize qw/vpx_satd sse2 neon/;
+ specialize qw/vpx_satd avx2 sse2 neon/;
} else {
add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64";
specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx/;
add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
- specialize qw/vpx_satd sse2 neon msa/;
+ specialize qw/vpx_satd avx2 sse2 neon msa/;
}
add_proto qw/void vpx_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height";
t_coeff += 16;
}
}
+
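+// Sums the absolute values of the |length| coefficients in |coeff|.
+// |length| is at most 1024 (a 32x32 block) for the sizes exercised here, so
+// the 32-bit per-lane accumulation below cannot overflow.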
+int vpx_satd_avx2(const tran_low_t *coeff, int length) {
+  const __m256i one = _mm256_set1_epi16(1);
+  __m256i accum = _mm256_setzero_si256();
+  int i;
+
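+  // Each iteration loads 16 coefficients and takes their absolute values;
+  // madd against a vector of ones then pairwise-adds them into the eight
+  // 32-bit running sums in |accum|.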
+  for (i = 0; i < length; i += 16) {
+    const __m256i src_line = load_tran_low(coeff);
+    const __m256i abs = _mm256_abs_epi16(src_line);
+    const __m256i sum = _mm256_madd_epi16(abs, one);
+    accum = _mm256_add_epi32(accum, sum);
+    coeff += 16;
+  }
+
+  {  // 32 bit horizontal add
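+    // Fold within each 128-bit lane (add the upper 64 bits onto the lower,
+    // then the upper 32 bits), then add the two lane totals and return the
+    // low 32 bits.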
+    const __m256i a = _mm256_srli_si256(accum, 8);
+    const __m256i b = _mm256_add_epi32(accum, a);
+    const __m256i c = _mm256_srli_epi64(b, 32);
+    const __m256i d = _mm256_add_epi32(b, c);
+    const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
+                                            _mm256_extractf128_si256(d, 1));
+    return _mm_cvtsi128_si32(accum_128);
+  }
+}