From: Angie Chiang Date: Sun, 14 Jul 2019 16:20:58 +0000 (-0700) Subject: Add vpx_sad32x32x8_c/avx2 X-Git-Tag: v1.8.2~163 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=291055812b6962e808619892abe8c87277d843c4;p=libvpx Add vpx_sad32x32x8_c/avx2 Change-Id: I4dbb7b6c8979c39eb6ffb97750e3cca0f4b7921f --- diff --git a/test/sad_test.cc b/test/sad_test.cc index 9888b7cf7..9125a0184 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -745,7 +745,7 @@ const SadMxNx8Param x8_c_tests[] = { // SadMxNx8Param(64, 64, &vpx_sad64x64x8_c), // SadMxNx8Param(64, 32, &vpx_sad64x32x8_c), // SadMxNx8Param(32, 64, &vpx_sad32x64x8_c), - // SadMxNx8Param(32, 32, &vpx_sad32x32x8_c), + SadMxNx8Param(32, 32, &vpx_sad32x32x8_c), // SadMxNx8Param(32, 16, &vpx_sad32x16x8_c), // SadMxNx8Param(16, 32, &vpx_sad16x32x8_c), SadMxNx8Param(16, 16, &vpx_sad16x16x8_c), @@ -1021,6 +1021,12 @@ const SadMxNx4Param x4d_avx2_tests[] = { SadMxNx4Param(32, 32, &vpx_sad32x32x4d_avx2), }; INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests)); + +/* Parameter table for the AVX2 instantiation of SADx8Test. The only active entry is 32x32 (vpx_sad32x32x8_avx2); the 64x64 line is a commented-out placeholder that still names the _c function. */ +const SadMxNx8Param x8_avx2_tests[] = { + // SadMxNx8Param(64, 64, &vpx_sad64x64x8_c), + SadMxNx8Param(32, 32, &vpx_sad32x32x8_avx2), +}; +INSTANTIATE_TEST_CASE_P(AVX2, SADx8Test, ::testing::ValuesIn(x8_avx2_tests)); #endif // HAVE_AVX2 #if HAVE_AVX512 diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index dd0d10d53..147ca4c65 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2468,7 +2468,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, vpx_variance32x32, vpx_sub_pixel_variance32x32, vpx_sub_pixel_avg_variance32x32, - vpx_sad32x32x4d, NULL) + vpx_sad32x32x4d, vpx_sad32x32x8) BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, vpx_variance64x64, vpx_sub_pixel_variance64x64, vpx_sub_pixel_avg_variance64x64, diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c index 873ddca09..769322019 100644 --- a/vpx_dsp/sad.c +++ b/vpx_dsp/sad.c @@ 
-83,6 +83,7 @@ sadMxNx4D(32, 64) // 32x32 sadMxN(32, 32) +sadMxNxK(32, 32, 8) sadMxNx4D(32, 32) // 32x16 diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 797ef7fe0..fd7eefdad 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -893,6 +893,9 @@ add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const specialize qw/vpx_sad4x4x3 sse3 msa mmi/; # Blocks of 8 +add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +specialize qw/vpx_sad32x32x8 avx2/; + add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad16x16x8 sse4_1 msa mmi/; diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c index b18fecf70..a5c4f8c53 100644 --- a/vpx_dsp/x86/sad4d_avx2.c +++ b/vpx_dsp/x86/sad4d_avx2.c @@ -11,8 +11,8 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -static INLINE void calc_final(const __m256i *const sums /*[4]*/, - uint32_t sad_array[4]) { +static INLINE void calc_final_4(const __m256i *const sums /*[4]*/, + uint32_t *sad_array) { const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]); const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]); const __m256i t2 = _mm256_hadd_epi32(t0, t1); @@ -66,7 +66,64 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, refs[3] += ref_stride; } - calc_final(sums, sad_array); + calc_final_4(sums, sad_array); +} + +/* Accumulates eight 32x32 SADs in one pass over the block: on each of the 32 rows, one 32-byte src row is compared against the eight 32-byte ref rows that start at ref_ptr + 0 .. ref_ptr + 7. sums[k] is the accumulator for offset k; the two calc_final_4 calls at the end receive sums[0..3] with sad_array and sums[4..7] with sad_array + 4, so sad_array presumably needs room for 8 entries (calc_final_4's body is not fully shown here). */ +void vpx_sad32x32x8_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sad_array) { + int i; + __m256i sums[8]; + + sums[0] = _mm256_setzero_si256(); + sums[1] = _mm256_setzero_si256(); + sums[2] = _mm256_setzero_si256(); + sums[3] = _mm256_setzero_si256(); + sums[4] = _mm256_setzero_si256(); + sums[5] = _mm256_setzero_si256(); + sums[6] = _mm256_setzero_si256(); + sums[7] = 
_mm256_setzero_si256(); + + for (i = 0; i < 32; i++) { + __m256i r[8]; + + // load src and all ref[] + /* _mm256_load_si256 is the aligned form: src_ptr must be 32-byte aligned on every row, which also constrains src_stride. The ref rows use the unaligned loadu form because ref_ptr + k is at an arbitrary byte offset. */ + const __m256i s = _mm256_load_si256((const __m256i *)src_ptr); + r[0] = _mm256_loadu_si256((const __m256i *)&ref_ptr[0]); + r[1] = _mm256_loadu_si256((const __m256i *)&ref_ptr[1]); + r[2] = _mm256_loadu_si256((const __m256i *)&ref_ptr[2]); + r[3] = _mm256_loadu_si256((const __m256i *)&ref_ptr[3]); + r[4] = _mm256_loadu_si256((const __m256i *)&ref_ptr[4]); + r[5] = _mm256_loadu_si256((const __m256i *)&ref_ptr[5]); + r[6] = _mm256_loadu_si256((const __m256i *)&ref_ptr[6]); + r[7] = _mm256_loadu_si256((const __m256i *)&ref_ptr[7]); + + // sum of the absolute differences between every ref[] to src + r[0] = _mm256_sad_epu8(r[0], s); + r[1] = _mm256_sad_epu8(r[1], s); + r[2] = _mm256_sad_epu8(r[2], s); + r[3] = _mm256_sad_epu8(r[3], s); + r[4] = _mm256_sad_epu8(r[4], s); + r[5] = _mm256_sad_epu8(r[5], s); + r[6] = _mm256_sad_epu8(r[6], s); + r[7] = _mm256_sad_epu8(r[7], s); + + // sum every ref[] + /* _mm256_sad_epu8 leaves one partial sum (at most 8 * 255 = 2040) in each 64-bit element. Over 32 rows an element reaches at most 65280, so the 32-bit adds below never carry into the upper half of an element. */ + sums[0] = _mm256_add_epi32(sums[0], r[0]); + sums[1] = _mm256_add_epi32(sums[1], r[1]); + sums[2] = _mm256_add_epi32(sums[2], r[2]); + sums[3] = _mm256_add_epi32(sums[3], r[3]); + sums[4] = _mm256_add_epi32(sums[4], r[4]); + sums[5] = _mm256_add_epi32(sums[5], r[5]); + sums[6] = _mm256_add_epi32(sums[6], r[6]); + sums[7] = _mm256_add_epi32(sums[7], r[7]); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } + + /* calc_final_4 takes four accumulators at a time, so the eight offsets are finished in two halves. */ + calc_final_4(sums, sad_array); + calc_final_4(sums + 4, sad_array + 4); } void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, @@ -126,5 +183,5 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, refs[3] += ref_stride; } - calc_final(sums, sad_array); + calc_final_4(sums, sad_array); }