Add high bitdepth Hadamard 16x16 avx2 implementation
author    sdeng <sdeng@google.com>
Wed, 7 Nov 2018 00:20:41 +0000 (16:20 -0800)
committer sdeng <sdeng@google.com>
Thu, 6 Dec 2018 00:40:24 +0000 (16:40 -0800)
Speed test:
[ RUN      ] C/HadamardHighbdTest.DISABLED_Speed/1
Hadamard16x16[          10 runs]: 2 us
Hadamard16x16[       10000 runs]: 1836 us
Hadamard16x16[    10000000 runs]: 1829451 us

[ RUN      ] AVX2/HadamardHighbdTest.DISABLED_Speed/1
Hadamard16x16[          10 runs]: 1 us
Hadamard16x16[       10000 runs]: 1009 us
Hadamard16x16[    10000000 runs]: 984856 us

Change-Id: I89b9cdbe19350815576d66e627df87e5025ed0a4
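
At the 10,000,000-run measurement the AVX2 path takes 984856 us versus 1829451 us for the C code, roughly a 1.86x speedup. Structurally, the 16x16 high-bitdepth Hadamard is four 8x8 Hadamard transforms of the block's quadrants followed by one extra butterfly stage across the quadrants, with a >> 1 after the first add/sub pair so the 32-bit coefficients do not grow an extra bit per stage; the AVX2 function below follows exactly this shape. A minimal scalar sketch of that structure, for reference only (the tran_low_t typedef and the vpx_highbd_hadamard_8x8_c prototype are restated here as assumptions rather than taken from the libvpx headers, and the sketch function name is illustrative):

    #include <stddef.h>
    #include <stdint.h>

    typedef int32_t tran_low_t; /* 32-bit coefficients in high-bitdepth builds */

    /* Assumed scalar 8x8 transform with the same prototype as the functions
     * touched by this commit; not defined here. */
    void vpx_highbd_hadamard_8x8_c(const int16_t *src_diff,
                                   ptrdiff_t src_stride, tran_low_t *coeff);

    void highbd_hadamard_16x16_sketch(const int16_t *src_diff,
                                      ptrdiff_t src_stride, tran_low_t *coeff) {
      int idx;
      /* 8x8 Hadamard of each quadrant, each written to its own 64-coefficient
       * slot of the output. */
      for (idx = 0; idx < 4; ++idx) {
        const int16_t *src_ptr =
            src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
        vpx_highbd_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
      }

      /* Butterfly across the four quadrants, one coefficient position at a
       * time; the AVX2 version processes eight positions per iteration. */
      for (idx = 0; idx < 64; ++idx) {
        const tran_low_t a0 = coeff[0];
        const tran_low_t a1 = coeff[64];
        const tran_low_t a2 = coeff[128];
        const tran_low_t a3 = coeff[192];

        const tran_low_t b0 = (a0 + a1) >> 1; /* cf. _mm256_srai_epi32(b0, 1) */
        const tran_low_t b1 = (a0 - a1) >> 1;
        const tran_low_t b2 = (a2 + a3) >> 1;
        const tran_low_t b3 = (a2 - a3) >> 1;

        coeff[0] = b0 + b2;
        coeff[64] = b1 + b3;
        coeff[128] = b0 - b2;
        coeff[192] = b1 - b3;
        ++coeff;
      }
    }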

test/hadamard_test.cc
vpx_dsp/vpx_dsp_rtcd_defs.pl
vpx_dsp/x86/avg_intrin_avx2.c

diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
index b2b2d5fcd7c97223884c7e570807aaf07ed88995..a0f463ce98fcb3a35c391e2fcc6b42ad89bcc258 100644
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -310,7 +310,9 @@ INSTANTIATE_TEST_CASE_P(
 #if HAVE_AVX2
 INSTANTIATE_TEST_CASE_P(
     AVX2, HadamardHighbdTest,
-    ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_avx2, 8)));
+    ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_avx2, 8),
+                      HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_avx2,
+                                           16)));
 #endif  // HAVE_AVX2
 
 #endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 5dc6823820db1bfc61992c299478f2c3a8e684ac..9992c09d73f6d1dc9066f52c64cb95c61b4c752d 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -789,7 +789,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
     specialize qw/vpx_highbd_hadamard_8x8 avx2/;
 
     add_proto qw/void vpx_highbd_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
-    specialize qw/vpx_highbd_hadamard_16x16/;
+    specialize qw/vpx_highbd_hadamard_16x16 avx2/;
 
     add_proto qw/void vpx_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
     specialize qw/vpx_highbd_hadamard_32x32/;
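
Adding avx2 to the specialize line makes the RTCD generator emit dispatch for the new function, so existing callers of the generic vpx_highbd_hadamard_16x16 symbol pick up the AVX2 version automatically on capable CPUs. A hedged caller sketch, assuming the usual generated vpx_dsp_rtcd.h header and that RTCD setup has already run (the wrapper name below is illustrative):

    #include <stddef.h>
    #include <stdint.h>

    #include "./vpx_dsp_rtcd.h" /* generated from vpx_dsp_rtcd_defs.pl */

    /* Illustrative wrapper: the generic name resolves to
     * vpx_highbd_hadamard_16x16_avx2 on AVX2-capable machines once RTCD has
     * been initialized, and to the C implementation otherwise. */
    void hadamard_16x16_dispatch(const int16_t *src_diff, ptrdiff_t src_stride,
                                 tran_low_t *coeff) {
      vpx_highbd_hadamard_16x16(src_diff, src_stride, coeff);
    }
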
diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c
index 7d74705ea882c906417a22f060db85a4da00612e..a3ebacd4e630adfcdb761ae61b6f5b5ca160db4f 100644
--- a/vpx_dsp/x86/avg_intrin_avx2.c
+++ b/vpx_dsp/x86/avg_intrin_avx2.c
@@ -134,6 +134,47 @@ void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
   coeff += 8;
   _mm256_storeu_si256((__m256i *)coeff, src32[7]);
 }
+
+void vpx_highbd_hadamard_16x16_avx2(const int16_t *src_diff,
+                                    ptrdiff_t src_stride, tran_low_t *coeff) {
+  int idx;
+  tran_low_t *t_coeff = coeff;
+  for (idx = 0; idx < 4; ++idx) {
+    const int16_t *src_ptr =
+        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+    vpx_highbd_hadamard_8x8_avx2(src_ptr, src_stride, t_coeff + idx * 64);
+  }
+
+  for (idx = 0; idx < 64; idx += 8) {
+    __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+    __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
+    __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
+    __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));
+
+    __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
+    __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
+    __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
+    __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);
+
+    b0 = _mm256_srai_epi32(b0, 1);
+    b1 = _mm256_srai_epi32(b1, 1);
+    b2 = _mm256_srai_epi32(b2, 1);
+    b3 = _mm256_srai_epi32(b3, 1);
+
+    coeff0 = _mm256_add_epi32(b0, b2);
+    coeff1 = _mm256_add_epi32(b1, b3);
+    coeff2 = _mm256_sub_epi32(b0, b2);
+    coeff3 = _mm256_sub_epi32(b1, b3);
+
+    _mm256_storeu_si256((__m256i *)coeff, coeff0);
+    _mm256_storeu_si256((__m256i *)(coeff + 64), coeff1);
+    _mm256_storeu_si256((__m256i *)(coeff + 128), coeff2);
+    _mm256_storeu_si256((__m256i *)(coeff + 192), coeff3);
+
+    coeff += 8;
+    t_coeff += 8;
+  }
+}
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 static void hadamard_col8x2_avx2(__m256i *in, int iter) {