From 0eeb7975128997b3f978d3a6c81d6e25b4d4a8c9 Mon Sep 17 00:00:00 2001 From: Johann Date: Wed, 21 Nov 2018 13:30:45 -0500 Subject: [PATCH] quantize: fix x86 hbd builds Calculate the high bits of dqcoeff in high bit depth builds and store them appropriately. BUG=webm:1448 Change-Id: I61a2f8bfcf2e30765f10a94073c4d58321d2fa24 --- test/vp9_quantize_test.cc | 22 ++++++++-------------- vpx_dsp/x86/quantize_avx.c | 19 +++++++------------ vpx_dsp/x86/quantize_sse2.c | 18 ++++++------------ vpx_dsp/x86/quantize_sse2.h | 18 ++++++++++++++++++ vpx_dsp/x86/quantize_ssse3.c | 18 ++++++------------ 5 files changed, 45 insertions(+), 50 deletions(-) diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 0b106fb69..20ae58845 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -466,11 +466,11 @@ using ::testing::make_tuple; #if HAVE_SSE2 #if CONFIG_VP9_HIGHBITDEPTH -// TODO(johannkoenig): Fix vpx_quantize_b_sse2 in highbitdepth builds. -// make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c, VPX_BITS_8), INSTANTIATE_TEST_CASE_P( SSE2, VP9QuantizeTest, ::testing::Values( + make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16, + false), make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, VPX_BITS_8, 16, false), make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, @@ -495,7 +495,7 @@ INSTANTIATE_TEST_CASE_P( #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_SSE2 -#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_SSSE3 #if ARCH_X86_64 INSTANTIATE_TEST_CASE_P( SSSE3, VP9QuantizeTest, @@ -512,30 +512,24 @@ INSTANTIATE_TEST_CASE_P(SSSE3, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_8, 16, false))); -#endif +#endif // ARCH_X86_64 -#if ARCH_X86_64 -// TODO(johannkoenig): SSSE3 optimizations do not yet pass this test. +// TODO(johannkoenig): fix 32x32 INSTANTIATE_TEST_CASE_P(DISABLED_SSSE3, VP9QuantizeTest, ::testing::Values(make_tuple( &vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, false))); -#endif // ARCH_X86_64 -#endif // HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_SSSE3 -// TODO(johannkoenig): AVX optimizations do not yet pass the 32x32 test or -// highbitdepth configurations. -#if HAVE_AVX && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_AVX INSTANTIATE_TEST_CASE_P( AVX, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, VPX_BITS_8, 16, false), - // Even though SSSE3 and AVX do not match the reference - // code, we can keep them in sync with each other. make_tuple(&vpx_quantize_b_32x32_avx, &vpx_quantize_b_32x32_ssse3, VPX_BITS_8, 32, false))); -#endif // HAVE_AVX && !CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_AVX #if ARCH_X86_64 && HAVE_AVX2 INSTANTIATE_TEST_CASE_P( diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index d0a8d514e..36345a6d2 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -90,14 +90,12 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff0, qcoeff_ptr); store_tran_low(qcoeff1, qcoeff_ptr + 8); - coeff0 = calculate_dqcoeff(qcoeff0, dequant); + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr); dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = calculate_dqcoeff(qcoeff1, dequant); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - store_tran_low(coeff0, dqcoeff_ptr); - store_tran_low(coeff1, dqcoeff_ptr + 8); - - eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = + scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); } // AC only loop. @@ -134,13 +132,10 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff0, qcoeff_ptr + index); store_tran_low(qcoeff1, qcoeff_ptr + index + 8); - coeff0 = calculate_dqcoeff(qcoeff0, dequant); - coeff1 = calculate_dqcoeff(qcoeff1, dequant); + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - store_tran_low(coeff0, dqcoeff_ptr + index); - store_tran_low(coeff1, dqcoeff_ptr + index + 8); - - eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index, + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); } diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c index fa098a3a0..e38a4059a 100644 --- a/vpx_dsp/x86/quantize_sse2.c +++ b/vpx_dsp/x86/quantize_sse2.c @@ -74,14 +74,11 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff0, qcoeff_ptr); store_tran_low(qcoeff1, qcoeff_ptr + 8); - coeff0 = calculate_dqcoeff(qcoeff0, dequant); + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr); dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = calculate_dqcoeff(qcoeff1, dequant); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - store_tran_low(coeff0, dqcoeff_ptr); - store_tran_low(coeff1, dqcoeff_ptr + 8); - - eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); // AC only loop. while (index < n_coeffs) { @@ -108,13 +105,10 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff0, qcoeff_ptr + index); store_tran_low(qcoeff1, qcoeff_ptr + index + 8); - coeff0 = calculate_dqcoeff(qcoeff0, dequant); - coeff1 = calculate_dqcoeff(qcoeff1, dequant); - - store_tran_low(coeff0, dqcoeff_ptr + index); - store_tran_low(coeff1, dqcoeff_ptr + index + 8); + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index, + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h index 1f8587810..d93096700 100644 --- a/vpx_dsp/x86/quantize_sse2.h +++ b/vpx_dsp/x86/quantize_sse2.h @@ -48,6 +48,24 @@ static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) { return _mm_mullo_epi16(qcoeff, dequant); } +static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant, + tran_low_t *dqcoeff) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i low = _mm_mullo_epi16(qcoeff, dequant); + const __m128i high = _mm_mulhi_epi16(qcoeff, dequant); + + const __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high); + const __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high); + + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0); + _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1); +#else + const __m128i dqcoeff16 = _mm_mullo_epi16(qcoeff, dequant); + + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff16); +#endif // CONFIG_VP9_HIGHBITDEPTH +} + // Scan 16 values for eob reference in scan. Use masks (-1) from comparing to // zbin to add 1 to the index in 'scan'. static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index e96f9f990..225e33ed9 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -67,14 +67,11 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff0, qcoeff_ptr); store_tran_low(qcoeff1, qcoeff_ptr + 8); - coeff0 = calculate_dqcoeff(qcoeff0, dequant); + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr); dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = calculate_dqcoeff(qcoeff1, dequant); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - store_tran_low(coeff0, dqcoeff_ptr); - store_tran_low(coeff1, dqcoeff_ptr + 8); - - eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); // AC only loop. while (index < n_coeffs) { @@ -99,13 +96,10 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff0, qcoeff_ptr + index); store_tran_low(qcoeff1, qcoeff_ptr + index + 8); - coeff0 = calculate_dqcoeff(qcoeff0, dequant); - coeff1 = calculate_dqcoeff(qcoeff1, dequant); + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - store_tran_low(coeff0, dqcoeff_ptr + index); - store_tran_low(coeff1, dqcoeff_ptr + index + 8); - - eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index, + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); -- 2.40.0