From d566160f32580e5ef8fccbce8dde8495756a0620 Mon Sep 17 00:00:00 2001 From: Johann Date: Wed, 21 Nov 2018 13:30:45 -0500 Subject: [PATCH] quantize 32x32: fix dqcoeff Calculate the high bits of dqcoeff and store them appropriately in high bit depth builds. Low bit depth builds still do not pass. C truncates the results after division. X86 only supports packing with saturation at this step. BUG=webm:1448 Change-Id: Ic80def575136c7ca37edf18d21e26925b475da98 --- test/vp9_quantize_test.cc | 57 ++++++++++++++++++++++++++++++------ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/x86/quantize_avx.c | 44 +++++++--------------------- vpx_dsp/x86/quantize_sse2.h | 4 --- vpx_dsp/x86/quantize_ssse3.c | 44 +++++++--------------------- vpx_dsp/x86/quantize_ssse3.h | 57 ++++++++++++++++++++++++++++++++++++ 6 files changed, 126 insertions(+), 81 deletions(-) create mode 100644 vpx_dsp/x86/quantize_ssse3.h diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 20ae58845..fc648e8cc 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -496,6 +496,32 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSE2 #if HAVE_SSSE3 +#if CONFIG_VP9_HIGHBITDEPTH +#if ARCH_X86_64 +INSTANTIATE_TEST_CASE_P( + SSSE3, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_ssse3, + &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, + false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, + 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, + VPX_BITS_8, 32, true))); +#else +INSTANTIATE_TEST_CASE_P( + SSSE3, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_ssse3, + &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, + false))); + +#endif // ARCH_X86_64 +#else #if ARCH_X86_64 INSTANTIATE_TEST_CASE_P( SSSE3, VP9QuantizeTest, @@ -507,28 +533,41 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&QuantFPWrapper, &QuantFPWrapper, VPX_BITS_8, 32, true))); + #else INSTANTIATE_TEST_CASE_P(SSSE3, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, VPX_BITS_8, 16, false))); #endif // ARCH_X86_64 - -// TODO(johannkoenig): fix 32x32 +// TODO(webm:1448): lowbd truncates results in C. INSTANTIATE_TEST_CASE_P(DISABLED_SSSE3, VP9QuantizeTest, ::testing::Values(make_tuple( &vpx_quantize_b_32x32_ssse3, &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, false))); +#endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_SSSE3 #if HAVE_AVX -INSTANTIATE_TEST_CASE_P( - AVX, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_avx, - &vpx_quantize_b_32x32_ssse3, VPX_BITS_8, 32, - false))); +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P(AVX, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_avx, + &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_avx, + &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false))); +#else +INSTANTIATE_TEST_CASE_P(AVX, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_avx, + &vpx_quantize_b_c, + VPX_BITS_8, 16, false))); +// TODO(webm:1448): lowbd truncates results in C. +INSTANTIATE_TEST_CASE_P(DISABLED_AVX, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_32x32_avx, + &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false))); +#endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_AVX #if ARCH_X86_64 && HAVE_AVX2 diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 5ee3bfec7..2495db3f4 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -299,6 +299,7 @@ DSP_SRCS-yes += quantize.h DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.h DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.c +DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.h DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c DSP_SRCS-$(HAVE_VSX) += ppc/quantize_vsx.c diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index 36345a6d2..0a91d36ea 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -18,6 +18,7 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" +#include "vpx_dsp/x86/quantize_ssse3.h" void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, @@ -229,27 +230,12 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff0, qcoeff_ptr); store_tran_low(qcoeff1, qcoeff_ptr + 8); - // Un-sign to bias rounding like C. - // dequant is almost always negative, so this is probably the backwards way - // to handle the sign. However, it matches the previous assembly. - coeff0 = _mm_abs_epi16(qcoeff0); - coeff1 = _mm_abs_epi16(qcoeff1); - - coeff0 = calculate_dqcoeff(coeff0, dequant); + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, dqcoeff_ptr); dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = calculate_dqcoeff(coeff1, dequant); - - // "Divide" by 2. - coeff0 = _mm_srli_epi16(coeff0, 1); - coeff1 = _mm_srli_epi16(coeff1, 1); - - coeff0 = _mm_sign_epi16(coeff0, qcoeff0); - coeff1 = _mm_sign_epi16(coeff1, qcoeff1); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8); - store_tran_low(coeff0, dqcoeff_ptr); - store_tran_low(coeff1, dqcoeff_ptr + 8); - - eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = + scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); } // AC only loop. @@ -286,22 +272,12 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff0, qcoeff_ptr + index); store_tran_low(qcoeff1, qcoeff_ptr + index + 8); - coeff0 = _mm_abs_epi16(qcoeff0); - coeff1 = _mm_abs_epi16(qcoeff1); + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, + dqcoeff_ptr + index); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, + dqcoeff_ptr + index + 8); - coeff0 = calculate_dqcoeff(coeff0, dequant); - coeff1 = calculate_dqcoeff(coeff1, dequant); - - coeff0 = _mm_srli_epi16(coeff0, 1); - coeff1 = _mm_srli_epi16(coeff1, 1); - - coeff0 = _mm_sign_epi16(coeff0, qcoeff0); - coeff1 = _mm_sign_epi16(coeff1, qcoeff1); - - store_tran_low(coeff0, dqcoeff_ptr + index); - store_tran_low(coeff1, dqcoeff_ptr + index + 8); - - eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index, + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); } diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h index d93096700..afe2f924b 100644 --- a/vpx_dsp/x86/quantize_sse2.h +++ b/vpx_dsp/x86/quantize_sse2.h @@ -44,10 +44,6 @@ static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round, *coeff = _mm_mulhi_epi16(qcoeff, shift); } -static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) { - return _mm_mullo_epi16(qcoeff, dequant); -} - static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant, tran_low_t *dqcoeff) { #if CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 225e33ed9..fc1d91959 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -15,6 +15,7 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" +#include "vpx_dsp/x86/quantize_ssse3.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, @@ -201,27 +202,12 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff0, qcoeff_ptr); store_tran_low(qcoeff1, qcoeff_ptr + 8); - // Un-sign to bias rounding like C. - // dequant is almost always negative, so this is probably the backwards way - // to handle the sign. However, it matches the previous assembly. - coeff0 = _mm_abs_epi16(qcoeff0); - coeff1 = _mm_abs_epi16(qcoeff1); - - coeff0 = calculate_dqcoeff(coeff0, dequant); + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, dqcoeff_ptr); dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = calculate_dqcoeff(coeff1, dequant); - - // "Divide" by 2. - coeff0 = _mm_srli_epi16(coeff0, 1); - coeff1 = _mm_srli_epi16(coeff1, 1); - - coeff0 = _mm_sign_epi16(coeff0, qcoeff0); - coeff1 = _mm_sign_epi16(coeff1, qcoeff1); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8); - store_tran_low(coeff0, dqcoeff_ptr); - store_tran_low(coeff1, dqcoeff_ptr + 8); - - eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = + scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); } // AC only loop. @@ -262,22 +248,12 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff0, qcoeff_ptr + index); store_tran_low(qcoeff1, qcoeff_ptr + index + 8); - coeff0 = _mm_abs_epi16(qcoeff0); - coeff1 = _mm_abs_epi16(qcoeff1); - - coeff0 = calculate_dqcoeff(coeff0, dequant); - coeff1 = calculate_dqcoeff(coeff1, dequant); + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, + dqcoeff_ptr + index); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, + dqcoeff_ptr + 8 + index); - coeff0 = _mm_srli_epi16(coeff0, 1); - coeff1 = _mm_srli_epi16(coeff1, 1); - - coeff0 = _mm_sign_epi16(coeff0, qcoeff0); - coeff1 = _mm_sign_epi16(coeff1, qcoeff1); - - store_tran_low(coeff0, dqcoeff_ptr + index); - store_tran_low(coeff1, dqcoeff_ptr + index + 8); - - eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index, + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); } diff --git a/vpx_dsp/x86/quantize_ssse3.h b/vpx_dsp/x86/quantize_ssse3.h new file mode 100644 index 000000000..35223d7b4 --- /dev/null +++ b/vpx_dsp/x86/quantize_ssse3.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_X86_QUANTIZE_SSSE3_H_ +#define VPX_VPX_DSP_X86_QUANTIZE_SSSE3_H_ + +#include + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/quantize_sse2.h" + +static INLINE void calculate_dqcoeff_and_store_32x32(const __m128i qcoeff, + const __m128i dequant, + const __m128i zero, + tran_low_t *dqcoeff) { + // Un-sign to bias rounding like C. + const __m128i coeff = _mm_abs_epi16(qcoeff); + +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff); + const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff); + + const __m128i low = _mm_mullo_epi16(coeff, dequant); + const __m128i high = _mm_mulhi_epi16(coeff, dequant); + __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high); + __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high); + + // "Divide" by 2. + dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, 1); + dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, 1); + + dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0); + dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1); + + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0); + _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1); +#else + __m128i dqcoeff16 = _mm_mullo_epi16(coeff, dequant); + (void)zero; + + dqcoeff16 = _mm_srli_epi16(dqcoeff16, 1); + + dqcoeff16 = _mm_sign_epi16(dqcoeff16, qcoeff); + + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff16); +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +#endif // VPX_VPX_DSP_X86_QUANTIZE_SSSE3_H_ -- 2.40.0