Add high bit Hadamard 32x32 avx2 implementation
author    sdeng <sdeng@google.com>
Wed, 7 Nov 2018 00:20:41 +0000 (16:20 -0800)
committer sdeng <sdeng@google.com>
Fri, 7 Dec 2018 17:05:06 +0000 (09:05 -0800)
Speed test:
[ RUN      ] C/HadamardHighbdTest.DISABLED_Speed/2
Hadamard32x32[          10 runs]: 9 us
Hadamard32x32[       10000 runs]: 8914 us
Hadamard32x32[    10000000 runs]: 8991776 us

[ RUN      ] AVX2/HadamardHighbdTest.DISABLED_Speed/2
Hadamard32x32[          10 runs]: 5 us
Hadamard32x32[       10000 runs]: 4582 us
Hadamard32x32[    10000000 runs]: 4548203 us

Change-Id: Ied1b38b510bd033299f05869216d394e3b7f70f1

test/hadamard_test.cc
vpx_dsp/vpx_dsp_rtcd_defs.pl
vpx_dsp/x86/avg_intrin_avx2.c

diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
index a0f463ce98fcb3a35c391e2fcc6b42ad89bcc258..b194ace6747582e6c7a3255b5f59cdc41990708b 100644
@@ -311,8 +311,9 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     AVX2, HadamardHighbdTest,
     ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_avx2, 8),
-                      HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_avx2,
-                                           16)));
+                      HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_avx2, 16),
+                      HadamardFuncWithSize(&vpx_highbd_hadamard_32x32_avx2,
+                                           32)));
 #endif  // HAVE_AVX2
 
 #endif  // CONFIG_VP9_HIGHBITDEPTH
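
Outside the gtest harness above, the calling convention of the new kernel can be exercised with a small standalone check against the C reference. This is an illustrative sketch only, not part of the patch: it assumes tran_low_t is int32_t (the CONFIG_VP9_HIGHBITDEPTH case) and links against a libvpx build that provides both functions.

/* Illustrative sketch (not from the patch): compare the new AVX2 kernel with
 * the C reference on one random 32x32 block of residuals. tran_low_t is
 * assumed to be int32_t, as it is when CONFIG_VP9_HIGHBITDEPTH is enabled. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef int32_t tran_low_t;

void vpx_highbd_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride,
                                 tran_low_t *coeff);
void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff,
                                    ptrdiff_t src_stride, tran_low_t *coeff);

int main(void) {
  int16_t src_diff[32 * 32];
  tran_low_t ref[32 * 32], avx2[32 * 32];
  int i;
  for (i = 0; i < 32 * 32; ++i) src_diff[i] = (int16_t)(rand() % 2048 - 1024);
  vpx_highbd_hadamard_32x32_c(src_diff, 32, ref);
  vpx_highbd_hadamard_32x32_avx2(src_diff, 32, avx2);
  printf("%s\n", memcmp(ref, avx2, sizeof(ref)) ? "MISMATCH" : "match");
  return 0;
}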
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 9992c09d73f6d1dc9066f52c64cb95c61b4c752d..6dc317630248e763e13dc563e3d7f2af7c2e096e 100644
@@ -792,7 +792,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
     specialize qw/vpx_highbd_hadamard_16x16 avx2/;
 
     add_proto qw/void vpx_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
-    specialize qw/vpx_highbd_hadamard_32x32/;
+    specialize qw/vpx_highbd_hadamard_32x32 avx2/;
 
     add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
     specialize qw/vpx_satd avx2 sse2 neon/;
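
For context, the added avx2 token in the specialize line is what makes the new kernel reachable at run time: rtcd.pl emits a dispatch entry in the generated vpx_dsp_rtcd.h, so existing callers of vpx_highbd_hadamard_32x32() pick up the AVX2 version on capable CPUs. Roughly (sketched from libvpx's usual RTCD pattern, not copied from this patch), the generated x86 header behaves like:

/* Sketch of the generated x86 dispatch for this symbol; RTCD_EXTERN,
 * HAS_AVX2 and setup_rtcd_internal() follow libvpx's usual RTCD naming and
 * are recalled here rather than taken from this change. */
void vpx_highbd_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride,
                                 tran_low_t *coeff);
void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff,
                                    ptrdiff_t src_stride, tran_low_t *coeff);
RTCD_EXTERN void (*vpx_highbd_hadamard_32x32)(const int16_t *src_diff,
                                              ptrdiff_t src_stride,
                                              tran_low_t *coeff);

/* ...and inside setup_rtcd_internal(), after CPU feature detection: */
vpx_highbd_hadamard_32x32 = vpx_highbd_hadamard_32x32_c;
if (flags & HAS_AVX2) vpx_highbd_hadamard_32x32 = vpx_highbd_hadamard_32x32_avx2;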
diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c
index a3ebacd4e630adfcdb761ae61b6f5b5ca160db4f..f39210b6a4212203a25ff095e4fdc028895fa52f 100644
@@ -175,6 +175,47 @@ void vpx_highbd_hadamard_16x16_avx2(const int16_t *src_diff,
     t_coeff += 8;
   }
 }
+
+void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff,
+                                    ptrdiff_t src_stride, tran_low_t *coeff) {
+  int idx;
+  tran_low_t *t_coeff = coeff;
+  for (idx = 0; idx < 4; ++idx) {
+    const int16_t *src_ptr =
+        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+    vpx_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256);
+  }
+
+  for (idx = 0; idx < 256; idx += 8) {
+    __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+    __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
+    __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
+    __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));
+
+    __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
+    __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
+    __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
+    __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);
+
+    b0 = _mm256_srai_epi32(b0, 2);
+    b1 = _mm256_srai_epi32(b1, 2);
+    b2 = _mm256_srai_epi32(b2, 2);
+    b3 = _mm256_srai_epi32(b3, 2);
+
+    coeff0 = _mm256_add_epi32(b0, b2);
+    coeff1 = _mm256_add_epi32(b1, b3);
+    coeff2 = _mm256_sub_epi32(b0, b2);
+    coeff3 = _mm256_sub_epi32(b1, b3);
+
+    _mm256_storeu_si256((__m256i *)coeff, coeff0);
+    _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1);
+    _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2);
+    _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3);
+
+    coeff += 8;
+    t_coeff += 8;
+  }
+}
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 static void hadamard_col8x2_avx2(__m256i *in, int iter) {
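
The structure of the new function mirrors vpx_highbd_hadamard_16x16_avx2 just above it: the 32x32 block is split into four 16x16 quadrants, each transformed with the 16x16 kernel, and the four results (stored back to back, 256 coefficients apart) are merged in place with one more butterfly stage plus a >>2 normalization. A scalar rendering of that combine loop, shown only to illustrate what the intrinsics compute (the helper name is made up), looks like this:

/* Scalar illustration of the combine stage above; one coefficient per
 * iteration instead of eight. Operates in place on the 1024 coefficients
 * produced by the four 16x16 Hadamard transforms. */
static void highbd_hadamard_32x32_combine_c(tran_low_t *coeff) {
  int i;
  for (i = 0; i < 256; ++i) {
    const tran_low_t a0 = coeff[i];
    const tran_low_t a1 = coeff[i + 256];
    const tran_low_t a2 = coeff[i + 512];
    const tran_low_t a3 = coeff[i + 768];

    /* Same ordering as the intrinsics: add/sub first, then shift right by 2
     * to keep the fully transformed coefficients within range. */
    const tran_low_t b0 = (a0 + a1) >> 2;
    const tran_low_t b1 = (a0 - a1) >> 2;
    const tran_low_t b2 = (a2 + a3) >> 2;
    const tran_low_t b3 = (a2 - a3) >> 2;

    coeff[i] = b0 + b2;
    coeff[i + 256] = b1 + b3;
    coeff[i + 512] = b0 - b2;
    coeff[i + 768] = b1 - b3;
  }
}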