From e10c95dc83c7086665ca77ea39f6c776ff5e9fd8 Mon Sep 17 00:00:00 2001 From: Johann Date: Thu, 3 Nov 2016 10:47:06 -0700 Subject: [PATCH] Update vp9_fdct8x8_quant_ssse3 for highbitdepth Borrow transition functions from fdct.h nee vpx_quantize_b_sse2 BUG=webm:1304 Change-Id: I9c88c3eec3ff8bb461411d98c26c3c236ea28ef1 --- vp9/common/vp9_rtcd_defs.pl | 1 + vp9/encoder/x86/vp9_dct_ssse3.c | 40 ++++++++++++++++++--------------- vpx_dsp/x86/fdct.h | 11 +++++++++ 3 files changed, 34 insertions(+), 18 deletions(-) diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index fafc65983..abef06763 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -137,6 +137,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/vp9_fdct8x8_quant ssse3/; } else { add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; specialize qw/vp9_block_error avx2 msa sse2/; diff --git a/vp9/encoder/x86/vp9_dct_ssse3.c b/vp9/encoder/x86/vp9_dct_ssse3.c index fb2a92541..b3c3d7beb 100644 --- a/vp9/encoder/x86/vp9_dct_ssse3.c +++ b/vp9/encoder/x86/vp9_dct_ssse3.c @@ -12,14 +12,17 @@ #include // SSSE3 #include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/fdct.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" void vp9_fdct8x8_quant_ssse3( - const int16_t *input, int stride, int16_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, - int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { __m128i zero; int pass; @@ -328,15 +331,15 @@ void vp9_fdct8x8_quant_ssse3( qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); + store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); } { @@ -398,20 +401,21 @@ void vp9_fdct8x8_quant_ssse3( qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); + store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); } else { - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); + // Maybe a more efficient way to store 0? + store_zero_tran_low(qcoeff_ptr + n_coeffs); + store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); + store_zero_tran_low(dqcoeff_ptr + n_coeffs); + store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); } } @@ -452,10 +456,10 @@ void vp9_fdct8x8_quant_ssse3( } } else { do { - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); + store_zero_tran_low(dqcoeff_ptr + n_coeffs); + store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); + store_zero_tran_low(qcoeff_ptr + n_coeffs); + store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); n_coeffs += 8 * 2; } while (n_coeffs < 0); *eob_ptr = 0; diff --git a/vpx_dsp/x86/fdct.h b/vpx_dsp/x86/fdct.h index cad73f82a..54a6d81fc 100644 --- a/vpx_dsp/x86/fdct.h +++ b/vpx_dsp/x86/fdct.h @@ -43,4 +43,15 @@ static INLINE void store_tran_low(__m128i a, tran_low_t *b) { _mm_store_si128((__m128i *)(b), a); #endif } + +// Zero fill 8 positions in the output buffer. +static INLINE void store_zero_tran_low(tran_low_t *a) { + const __m128i zero = _mm_setzero_si128(); +#if CONFIG_VP9_HIGHBITDEPTH + _mm_store_si128((__m128i *)(a), zero); + _mm_store_si128((__m128i *)(a + 4), zero); +#else + _mm_store_si128((__m128i *)(a), zero); +#endif +} #endif // VPX_DSP_X86_FDCT_H_ -- 2.40.0