From a76b6b232cb612c786f80823d371e1518ce0da64 Mon Sep 17 00:00:00 2001 From: Linfeng Zhang Date: Fri, 23 Jun 2017 16:04:27 -0700 Subject: [PATCH] Update load_input_data() in x86 Split to load_input_data4() and load_input_data8(). Use pack with signed saturation instruction for high bitdepth. Change-Id: Icda3e0129a6fdb4a51d1cafbdc652ae3a65f4e06 --- vp9/common/x86/vp9_idct_intrin_sse2.c | 20 +++---- vpx_dsp/x86/inv_txfm_sse2.c | 60 ++++++++++----------- vpx_dsp/x86/inv_txfm_sse2.h | 77 +++++++++++++-------------- vpx_dsp/x86/inv_txfm_ssse3.c | 28 +++++----- 4 files changed, 92 insertions(+), 93 deletions(-) diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index 7e8089b51..7f0ddb0ec 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -18,8 +18,8 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, __m128i in[2]; const __m128i eight = _mm_set1_epi16(8); - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8); + in[0] = load_input_data8(input); + in[1] = load_input_data8(input + 8); switch (tx_type) { case 0: // DCT_DCT @@ -57,14 +57,14 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, const __m128i final_rounding = _mm_set1_epi16(1 << 4); // load input data - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8 * 1); - in[2] = load_input_data(input + 8 * 2); - in[3] = load_input_data(input + 8 * 3); - in[4] = load_input_data(input + 8 * 4); - in[5] = load_input_data(input + 8 * 5); - in[6] = load_input_data(input + 8 * 6); - in[7] = load_input_data(input + 8 * 7); + in[0] = load_input_data8(input); + in[1] = load_input_data8(input + 8 * 1); + in[2] = load_input_data8(input + 8 * 2); + in[3] = load_input_data8(input + 8 * 3); + in[4] = load_input_data8(input + 8 * 4); + in[5] = load_input_data8(input + 8 * 5); + in[6] = load_input_data8(input + 8 * 6); + in[7] = load_input_data8(input + 8 * 7); switch (tx_type) { case 0: // DCT_DCT diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c index 2d0318d99..00301b817 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.c +++ b/vpx_dsp/x86/inv_txfm_sse2.c @@ -27,8 +27,8 @@ void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, __m128i in[2]; // Rows - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8); + in[0] = load_input_data8(input); + in[1] = load_input_data8(input + 8); idct4_sse2(in); // Columns @@ -491,10 +491,10 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64); __m128i in[8], step1[8], step2[8], tmp[4]; - in[0] = load_input_data(input + 0 * 8); - in[1] = load_input_data(input + 1 * 8); - in[2] = load_input_data(input + 2 * 8); - in[3] = load_input_data(input + 3 * 8); + in[0] = load_input_data4(input + 0 * 8); + in[1] = load_input_data4(input + 1 * 8); + in[2] = load_input_data4(input + 2 * 8); + in[3] = load_input_data4(input + 3 * 8); transpose_16bit_4x4(in, in); // in[0]: 00 10 20 30 01 11 21 31 @@ -721,14 +721,14 @@ static INLINE void idct16_8col(__m128i *const in) { static INLINE void idct16_load8x8(const tran_low_t *const input, __m128i *const in) { - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8 * 2); - in[2] = load_input_data(input + 8 * 4); - in[3] = load_input_data(input + 8 * 6); - in[4] = load_input_data(input + 8 * 8); - in[5] = load_input_data(input + 8 * 10); - in[6] = load_input_data(input + 8 * 12); - in[7] = load_input_data(input + 8 * 14); + in[0] = load_input_data8(input); + in[1] = load_input_data8(input + 8 * 2); + in[2] = load_input_data8(input + 8 * 4); + in[3] = load_input_data8(input + 8 * 6); + in[4] = load_input_data8(input + 8 * 8); + in[5] = load_input_data8(input + 8 * 10); + in[6] = load_input_data8(input + 8 * 12); + in[7] = load_input_data8(input + 8 * 14); } void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, @@ -1258,10 +1258,10 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int i; // First 1-D inverse DCT // Load input data. - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8 * 2); - in[2] = load_input_data(input + 8 * 4); - in[3] = load_input_data(input + 8 * 6); + in[0] = load_input_data4(input + 0 * 16); + in[1] = load_input_data4(input + 1 * 16); + in[2] = load_input_data4(input + 2 * 16); + in[3] = load_input_data4(input + 3 * 16); transpose_16bit_4x4(in, in); @@ -1651,14 +1651,14 @@ void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int i; // Load input data. Only need to load the top left 8x8 block. - in[0] = load_input_data(input); - in[1] = load_input_data(input + 32); - in[2] = load_input_data(input + 64); - in[3] = load_input_data(input + 96); - in[4] = load_input_data(input + 128); - in[5] = load_input_data(input + 160); - in[6] = load_input_data(input + 192); - in[7] = load_input_data(input + 224); + in[0] = load_input_data8(input + 0 * 32); + in[1] = load_input_data8(input + 1 * 32); + in[2] = load_input_data8(input + 2 * 32); + in[3] = load_input_data8(input + 3 * 32); + in[4] = load_input_data8(input + 4 * 32); + in[5] = load_input_data8(input + 5 * 32); + in[6] = load_input_data8(input + 6 * 32); + in[7] = load_input_data8(input + 7 * 32); transpose_16bit_8x8(in, in); IDCT32_34 @@ -2008,10 +2008,10 @@ static void idct32_full_8x32(const __m128i *in /*in[32]*/, static void load_buffer_8x32(const tran_low_t *input, __m128i *in) { int i; for (i = 0; i < 8; ++i) { - in[i] = load_input_data(input); - in[i + 8] = load_input_data(input + 8); - in[i + 16] = load_input_data(input + 16); - in[i + 24] = load_input_data(input + 24); + in[i] = load_input_data8(input); + in[i + 8] = load_input_data8(input + 8); + in[i + 16] = load_input_data8(input + 16); + in[i + 24] = load_input_data8(input + 24); input += 32; } } diff --git a/vpx_dsp/x86/inv_txfm_sse2.h b/vpx_dsp/x86/inv_txfm_sse2.h index cfe5f788e..7db97db41 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.h +++ b/vpx_dsp/x86/inv_txfm_sse2.h @@ -76,24 +76,23 @@ static INLINE __m128i idct_calc_wraplow_sse2(const __m128i in0, return _mm_packs_epi32(t0, t1); } -// Function to allow 8 bit optimisations to be used when profile 0 is used with +// Functions to allow 8 bit optimisations to be used when profile 0 is used with // highbitdepth enabled -static INLINE __m128i load_input_data(const tran_low_t *data) { +static INLINE __m128i load_input_data4(const tran_low_t *data) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i zero = _mm_setzero_si128(); + const __m128i in = _mm_load_si128((const __m128i *)data); + return _mm_packs_epi32(in, zero); +#else + return _mm_loadl_epi64((const __m128i *)data); +#endif +} + +static INLINE __m128i load_input_data8(const tran_low_t *data) { #if CONFIG_VP9_HIGHBITDEPTH - // in0: 0 X 1 X 2 X 3 X - // in1: 4 X 5 X 6 X 7 X - // t0: 0 4 X X 1 5 X X - // t1: 2 6 X X 3 7 X X - // t2: 0 2 4 6 X X X X - // t3: 1 3 5 7 X X X X - // rtn: 0 1 2 3 4 5 6 7 const __m128i in0 = _mm_load_si128((const __m128i *)data); const __m128i in1 = _mm_load_si128((const __m128i *)(data + 4)); - const __m128i t0 = _mm_unpacklo_epi16(in0, in1); - const __m128i t1 = _mm_unpackhi_epi16(in0, in1); - const __m128i t2 = _mm_unpacklo_epi16(t0, t1); - const __m128i t3 = _mm_unpackhi_epi16(t0, t1); - return _mm_unpacklo_epi16(t2, t3); + return _mm_packs_epi32(in0, in1); #else return _mm_load_si128((const __m128i *)data); #endif @@ -101,35 +100,35 @@ static INLINE __m128i load_input_data(const tran_low_t *data) { static INLINE void load_buffer_8x8(const tran_low_t *const input, __m128i *const in) { - in[0] = load_input_data(input + 0 * 8); - in[1] = load_input_data(input + 1 * 8); - in[2] = load_input_data(input + 2 * 8); - in[3] = load_input_data(input + 3 * 8); - in[4] = load_input_data(input + 4 * 8); - in[5] = load_input_data(input + 5 * 8); - in[6] = load_input_data(input + 6 * 8); - in[7] = load_input_data(input + 7 * 8); + in[0] = load_input_data8(input + 0 * 8); + in[1] = load_input_data8(input + 1 * 8); + in[2] = load_input_data8(input + 2 * 8); + in[3] = load_input_data8(input + 3 * 8); + in[4] = load_input_data8(input + 4 * 8); + in[5] = load_input_data8(input + 5 * 8); + in[6] = load_input_data8(input + 6 * 8); + in[7] = load_input_data8(input + 7 * 8); } static INLINE void load_buffer_8x16(const tran_low_t *const input, __m128i *const in) { - in[0] = load_input_data(input + 0 * 16); - in[1] = load_input_data(input + 1 * 16); - in[2] = load_input_data(input + 2 * 16); - in[3] = load_input_data(input + 3 * 16); - in[4] = load_input_data(input + 4 * 16); - in[5] = load_input_data(input + 5 * 16); - in[6] = load_input_data(input + 6 * 16); - in[7] = load_input_data(input + 7 * 16); - - in[8] = load_input_data(input + 8 * 16); - in[9] = load_input_data(input + 9 * 16); - in[10] = load_input_data(input + 10 * 16); - in[11] = load_input_data(input + 11 * 16); - in[12] = load_input_data(input + 12 * 16); - in[13] = load_input_data(input + 13 * 16); - in[14] = load_input_data(input + 14 * 16); - in[15] = load_input_data(input + 15 * 16); + in[0] = load_input_data8(input + 0 * 16); + in[1] = load_input_data8(input + 1 * 16); + in[2] = load_input_data8(input + 2 * 16); + in[3] = load_input_data8(input + 3 * 16); + in[4] = load_input_data8(input + 4 * 16); + in[5] = load_input_data8(input + 5 * 16); + in[6] = load_input_data8(input + 6 * 16); + in[7] = load_input_data8(input + 7 * 16); + + in[8] = load_input_data8(input + 8 * 16); + in[9] = load_input_data8(input + 9 * 16); + in[10] = load_input_data8(input + 10 * 16); + in[11] = load_input_data8(input + 11 * 16); + in[12] = load_input_data8(input + 12 * 16); + in[13] = load_input_data8(input + 13 * 16); + in[14] = load_input_data8(input + 14 * 16); + in[15] = load_input_data8(input + 15 * 16); } static INLINE void recon_and_store(uint8_t *const dest, const __m128i in_x) { diff --git a/vpx_dsp/x86/inv_txfm_ssse3.c b/vpx_dsp/x86/inv_txfm_ssse3.c index 1a9fe51d7..f42ce491e 100644 --- a/vpx_dsp/x86/inv_txfm_ssse3.c +++ b/vpx_dsp/x86/inv_txfm_ssse3.c @@ -36,10 +36,10 @@ void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, __m128i tmp[4]; // Rows. Load 4-row input data. - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8 * 1); - in[2] = load_input_data(input + 8 * 2); - in[3] = load_input_data(input + 8 * 3); + in[0] = load_input_data4(input + 0 * 8); + in[1] = load_input_data4(input + 1 * 8); + in[2] = load_input_data4(input + 2 * 8); + in[3] = load_input_data4(input + 3 * 8); // 4x4 Transpose transpose_16bit_4x4(in, in); @@ -342,14 +342,14 @@ void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int i; // Load input data. Only need to load the top left 8x8 block. - in[0] = load_input_data(input); - in[1] = load_input_data(input + 32); - in[2] = load_input_data(input + 64); - in[3] = load_input_data(input + 96); - in[4] = load_input_data(input + 128); - in[5] = load_input_data(input + 160); - in[6] = load_input_data(input + 192); - in[7] = load_input_data(input + 224); + in[0] = load_input_data8(input + 0 * 32); + in[1] = load_input_data8(input + 1 * 32); + in[2] = load_input_data8(input + 2 * 32); + in[3] = load_input_data8(input + 3 * 32); + in[4] = load_input_data8(input + 4 * 32); + in[5] = load_input_data8(input + 5 * 32); + in[6] = load_input_data8(input + 6 * 32); + in[7] = load_input_data8(input + 7 * 32); transpose_16bit_8x8(in, in); idct32_34_first_half(in, stp1); @@ -383,8 +383,8 @@ static void load_buffer_16x16(const tran_low_t *input, __m128i *in0, __m128i *in1) { int i; for (i = 0; i < 16; i++) { - in0[i] = load_input_data(input); - in1[i] = load_input_data(input + 8); + in0[i] = load_input_data8(input); + in1[i] = load_input_data8(input + 8); input += 32; } } -- 2.40.0