From 0d734728f6c2887a219de4bd118de8cd290f50c0 Mon Sep 17 00:00:00 2001
From: Scott LaVarnway
Date: Thu, 8 Sep 2022 13:05:55 -0700
Subject: [PATCH] Add vpx_highbd_sad16x{32,16,8}x4d_avx2.

1.98x to 2.3x faster than the sse2 version.

Bug: b/245917257
Change-Id: Ie4f9bb942ffaf4af7d395fb5a5978b41aabfc93c
---
 test/sad_test.cc                |  11 ++
 vpx_dsp/vpx_dsp.mk              |   1 +
 vpx_dsp/vpx_dsp_rtcd_defs.pl    |   6 +-
 vpx_dsp/x86/highbd_sad4d_avx2.c | 183 ++++++++++++++++++++++++++++++++
 4 files changed, 198 insertions(+), 3 deletions(-)
 create mode 100644 vpx_dsp/x86/highbd_sad4d_avx2.c

diff --git a/test/sad_test.cc b/test/sad_test.cc
index 4fb2af624..92c9e6332 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -1079,6 +1079,17 @@ INSTANTIATE_TEST_SUITE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests));
 const SadMxNx4Param x4d_avx2_tests[] = {
   SadMxNx4Param(64, 64, &vpx_sad64x64x4d_avx2),
   SadMxNx4Param(32, 32, &vpx_sad32x32x4d_avx2),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 8),
+  SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 8),
+  SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 8),
+  SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 10),
+  SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 10),
+  SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 10),
+  SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 12),
+  SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 12),
+  SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 };

 INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 3019bff8f..34e9d736d 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -393,6 +393,7 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
 DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c
+DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c
 endif  # CONFIG_VP9_HIGHBITDEPTH

 endif  # CONFIG_ENCODERS
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 44dee5678..df2c8da74 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1048,13 +1048,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_highbd_sad32x16x4d sse2 neon/;

   add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
-  specialize qw/vpx_highbd_sad16x32x4d sse2 neon/;
+  specialize qw/vpx_highbd_sad16x32x4d sse2 neon avx2/;

   add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
-  specialize qw/vpx_highbd_sad16x16x4d sse2 neon/;
+  specialize qw/vpx_highbd_sad16x16x4d sse2 neon avx2/;

   add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
-  specialize qw/vpx_highbd_sad16x8x4d sse2 neon/;
+  specialize qw/vpx_highbd_sad16x8x4d sse2 neon avx2/;

   add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
   specialize qw/vpx_highbd_sad8x16x4d sse2 neon/;
diff --git a/vpx_dsp/x86/highbd_sad4d_avx2.c b/vpx_dsp/x86/highbd_sad4d_avx2.c
new file mode 100644
index 000000000..46c7e4fbc
--- /dev/null
+++ b/vpx_dsp/x86/highbd_sad4d_avx2.c
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h>  // AVX2
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static VPX_FORCE_INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
+                                          uint32_t sad_array[4]) {
+  const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
+  const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
+  const __m256i t2 = _mm256_hadd_epi32(t0, t1);
+  const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),
+                                    _mm256_extractf128_si256(t2, 1));
+  _mm_storeu_si128((__m128i *)sad_array, sum);
+}
+
+static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/,
+                                               const uint16_t *src,
+                                               int src_stride,
+                                               uint16_t *refs[4],
+                                               int ref_stride, int height) {
+  int i;
+  for (i = 0; i < height; i++) {
+    __m256i r[4];
+
+    // load src and all ref[]
+    const __m256i s = _mm256_load_si256((const __m256i *)src);
+    r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
+    r[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
+    r[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
+    r[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
+
+    // absolute differences between every ref[] and src
+    r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s));
+    r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s));
+    r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s));
+    r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s));
+
+    // sum every abs diff
+    sums_16[0] = _mm256_add_epi16(sums_16[0], r[0]);
+    sums_16[1] = _mm256_add_epi16(sums_16[1], r[1]);
+    sums_16[2] = _mm256_add_epi16(sums_16[2], r[2]);
+    sums_16[3] = _mm256_add_epi16(sums_16[3], r[3]);
+
+    src += src_stride;
+    refs[0] += ref_stride;
+    refs[1] += ref_stride;
+    refs[2] += ref_stride;
+    refs[3] += ref_stride;
+  }
+}
+
+void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
+                                 const uint8_t *const ref_array[4],
+                                 int ref_stride, uint32_t sad_array[4]) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *refs[4];
+  __m256i sums_16[4];
+  __m256i sums_32[4];
+  int i;
+
+  refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+  refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+  refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+  refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+  sums_32[0] = _mm256_setzero_si256();
+  sums_32[1] = _mm256_setzero_si256();
+  sums_32[2] = _mm256_setzero_si256();
+  sums_32[3] = _mm256_setzero_si256();
+
+  for (i = 0; i < 2; ++i) {
+    sums_16[0] = _mm256_setzero_si256();
+    sums_16[1] = _mm256_setzero_si256();
+    sums_16[2] = _mm256_setzero_si256();
+    sums_16[3] = _mm256_setzero_si256();
+
+    highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16);
+
+    // sums_16 can overflow after 16 rows, so add current sums_16 to sums_32
+    sums_32[0] = _mm256_add_epi32(
+        sums_32[0],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))));
+    sums_32[1] = _mm256_add_epi32(
+        sums_32[1],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))));
+    sums_32[2] = _mm256_add_epi32(
+        sums_32[2],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))));
+    sums_32[3] = _mm256_add_epi32(
+        sums_32[3],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))));
+
+    src += src_stride << 4;
+  }
+  calc_final_4(sums_32, sad_array);
+}
+
+void vpx_highbd_sad16x16x4d_avx2(const uint8_t *src_ptr, int src_stride,
+                                 const uint8_t *const ref_array[4],
+                                 int ref_stride, uint32_t sad_array[4]) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *refs[4];
+  __m256i sums_16[4];
+
+  refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+  refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+  refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+  refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+  sums_16[0] = _mm256_setzero_si256();
+  sums_16[1] = _mm256_setzero_si256();
+  sums_16[2] = _mm256_setzero_si256();
+  sums_16[3] = _mm256_setzero_si256();
+
+  highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16);
+
+  {
+    __m256i sums_32[4];
+    sums_32[0] = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)));
+    sums_32[1] = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)));
+    sums_32[2] = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)));
+    sums_32[3] = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)));
+    calc_final_4(sums_32, sad_array);
+  }
+}
+
+void vpx_highbd_sad16x8x4d_avx2(const uint8_t *src_ptr, int src_stride,
+                                const uint8_t *const ref_array[4],
+                                int ref_stride, uint32_t sad_array[4]) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *refs[4];
+  __m256i sums_16[4];
+
+  refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+  refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+  refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+  refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+  sums_16[0] = _mm256_setzero_si256();
+  sums_16[1] = _mm256_setzero_si256();
+  sums_16[2] = _mm256_setzero_si256();
+  sums_16[3] = _mm256_setzero_si256();
+
+  highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 8);
+
+  {
+    __m256i sums_32[4];
+    sums_32[0] = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)));
+    sums_32[1] = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)));
+    sums_32[2] = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)));
+    sums_32[3] = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)));
+    calc_final_4(sums_32, sad_array);
+  }
+}
--
2.49.0
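
For reference (not part of the patch): the kernels above compute four SADs at
once. Each row takes absolute 16-bit differences between src and each of the
four refs and accumulates them in 16-bit lanes, widening to 32-bit lanes
before a lane can overflow. With 12-bit input the largest per-sample
difference is 4095, so a 16-bit lane safely absorbs at most 16 rows
(16 * 4095 = 65520 <= 65535); that is why vpx_highbd_sad16x32x4d_avx2 widens
into sums_32 after each 16-row chunk. A minimal scalar sketch of the same
computation follows; ref_highbd_sad16xHx4d is a made-up name for
illustration, and it takes uint16_t pointers directly where the real kernels
convert their uint8_t arguments with CONVERT_TO_SHORTPTR.

#include <stdint.h>
#include <stdlib.h>

/* Scalar model of the 4-way highbd SAD: one 16-sample-wide block of
 * `height` rows, compared against four reference blocks. */
static void ref_highbd_sad16xHx4d(const uint16_t *src, int src_stride,
                                  const uint16_t *const refs[4],
                                  int ref_stride, int height,
                                  uint32_t sad_array[4]) {
  int r, i, j;
  for (r = 0; r < 4; ++r) {
    uint32_t sad = 0;
    for (i = 0; i < height; ++i) {
      for (j = 0; j < 16; ++j) {  // block width is 16 samples
        sad += (uint32_t)abs((int)src[i * src_stride + j] -
                             (int)refs[r][i * ref_stride + j]);
      }
    }
    sad_array[r] = sad;
  }
}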