From: Linfeng Zhang
Date: Mon, 26 Jun 2017 21:57:41 +0000 (-0700)
Subject: Update vpx_idct8x8_12_add_ssse3()
X-Git-Tag: v1.7.0~362
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=0bb31a46a4f3a490a4cb483fe4db79d7bb20a0d3;p=libvpx

Update vpx_idct8x8_12_add_ssse3()

Change-Id: I0f38801c391db87ddae168602a786a062cd34b1d
---

diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c
index 00301b817..5d12d1ff1 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -163,17 +163,6 @@ static INLINE void multiplication_and_add(
   *res3 = idct_calc_wraplow_sse2(lo_1, hi_1, *cst3);
 }
 
-static void multiplication_and_add_2(const __m128i *const in0,
-                                     const __m128i *const in1,
-                                     const __m128i *const cst0,
-                                     const __m128i *const cst1,
-                                     __m128i *const res0, __m128i *const res1) {
-  const __m128i lo = _mm_unpacklo_epi16(*in0, *in1);
-  const __m128i hi = _mm_unpackhi_epi16(*in0, *in1);
-  *res0 = idct_calc_wraplow_sse2(lo, hi, *cst0);
-  *res1 = idct_calc_wraplow_sse2(lo, hi, *cst1);
-}
-
 static INLINE void idct8(const __m128i *const in, __m128i *const out) {
   const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64);
   const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
diff --git a/vpx_dsp/x86/inv_txfm_sse2.h b/vpx_dsp/x86/inv_txfm_sse2.h
index 7db97db41..e6e1cd403 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.h
+++ b/vpx_dsp/x86/inv_txfm_sse2.h
@@ -76,6 +76,18 @@ static INLINE __m128i idct_calc_wraplow_sse2(const __m128i in0,
   return _mm_packs_epi32(t0, t1);
 }
 
+static INLINE void multiplication_and_add_2(const __m128i *const in0,
+                                            const __m128i *const in1,
+                                            const __m128i *const cst0,
+                                            const __m128i *const cst1,
+                                            __m128i *const res0,
+                                            __m128i *const res1) {
+  const __m128i lo = _mm_unpacklo_epi16(*in0, *in1);
+  const __m128i hi = _mm_unpackhi_epi16(*in0, *in1);
+  *res0 = idct_calc_wraplow_sse2(lo, hi, *cst0);
+  *res1 = idct_calc_wraplow_sse2(lo, hi, *cst1);
+}
+
 // Functions to allow 8 bit optimisations to be used when profile 0 is used with
 // highbitdepth enabled
 static INLINE __m128i load_input_data4(const tran_low_t *data) {
diff --git a/vpx_dsp/x86/inv_txfm_ssse3.c b/vpx_dsp/x86/inv_txfm_ssse3.c
index f42ce491e..a2a8858c2 100644
--- a/vpx_dsp/x86/inv_txfm_ssse3.c
+++ b/vpx_dsp/x86/inv_txfm_ssse3.c
@@ -17,135 +17,96 @@
 
 void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest,
                               int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i stg1_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
-  const __m128i stg1_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
-  const __m128i stg1_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
-  const __m128i stg1_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
-  const __m128i stg2_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
-  const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
-  const __m128i stg2_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
-  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in[8];
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp[4];
-
-  // Rows. Load 4-row input data.
+  const __m128i cp_28d_4d = dual_set_epi16(2 * cospi_28_64, 2 * cospi_4_64);
+  const __m128i cp_n20d_12d = dual_set_epi16(-2 * cospi_20_64, 2 * cospi_12_64);
+  const __m128i cp_8d_24d = dual_set_epi16(2 * cospi_8_64, 2 * cospi_24_64);
+  const __m128i cp_16_16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i cospi_16_64d = _mm_set1_epi16(2 * cospi_16_64);
+  const __m128i cospi_28_64d = _mm_set1_epi16(2 * cospi_28_64);
+  const __m128i cospi_4_64d = _mm_set1_epi16(2 * cospi_4_64);
+  const __m128i cospi_n20_64d = _mm_set1_epi16(-2 * cospi_20_64);
+  const __m128i cospi_12_64d = _mm_set1_epi16(2 * cospi_12_64);
+  const __m128i cospi_24_64d = _mm_set1_epi16(2 * cospi_24_64);
+  const __m128i cospi_8_64d = _mm_set1_epi16(2 * cospi_8_64);
+  __m128i in[8], step1[8], step2[8], tmp[4];
+
   in[0] = load_input_data4(input + 0 * 8);
   in[1] = load_input_data4(input + 1 * 8);
   in[2] = load_input_data4(input + 2 * 8);
   in[3] = load_input_data4(input + 3 * 8);
 
-  // 4x4 Transpose
-  transpose_16bit_4x4(in, in);
-
-  // Stage1
-  tmp[0] = _mm_mulhrs_epi16(in[0], stg1_0);
-  tmp[1] = _mm_mulhrs_epi16(in[0], stg1_1);
-  tmp[2] = _mm_mulhrs_epi16(in[1], stg1_2);
-  tmp[3] = _mm_mulhrs_epi16(in[1], stg1_3);
-
-  stp1_4 = _mm_unpackhi_epi64(tmp[0], tmp[1]);
-  stp1_5 = _mm_unpackhi_epi64(tmp[2], tmp[3]);
-
-  // Stage2
-  tmp[0] = _mm_mulhrs_epi16(in[0], stg2_0);
-  stp2_0 = _mm_unpacklo_epi64(tmp[0], tmp[0]);
-
-  tmp[1] = _mm_mulhrs_epi16(in[1], stg2_2);
-  tmp[2] = _mm_mulhrs_epi16(in[1], stg2_3);
-  stp2_2 = _mm_unpacklo_epi64(tmp[2], tmp[1]);
-
-  tmp[0] = _mm_add_epi16(stp1_4, stp1_5);
-  tmp[1] = _mm_sub_epi16(stp1_4, stp1_5);
+  // pass 1
 
-  stp2_4 = tmp[0];
-  stp2_5 = _mm_unpacklo_epi64(tmp[1], zero);
-  stp2_6 = _mm_unpackhi_epi64(tmp[1], zero);
-
-  tmp[0] = _mm_unpacklo_epi16(stp2_5, stp2_6);
-  tmp[1] = _mm_madd_epi16(tmp[0], stg3_0);
-  tmp[2] = _mm_madd_epi16(tmp[0], stk2_0);  // stg3_1 = stk2_0
-
-  tmp[1] = _mm_add_epi32(tmp[1], rounding);
-  tmp[2] = _mm_add_epi32(tmp[2], rounding);
-  tmp[1] = _mm_srai_epi32(tmp[1], DCT_CONST_BITS);
-  tmp[2] = _mm_srai_epi32(tmp[2], DCT_CONST_BITS);
-
-  stp1_5 = _mm_packs_epi32(tmp[1], tmp[2]);
-
-  // Stage3
-  tmp[2] = _mm_add_epi16(stp2_0, stp2_2);
-  tmp[3] = _mm_sub_epi16(stp2_0, stp2_2);
-
-  stp1_2 = _mm_unpackhi_epi64(tmp[3], tmp[2]);
-  stp1_3 = _mm_unpacklo_epi64(tmp[3], tmp[2]);
-
-  // Stage4
-  tmp[0] = _mm_add_epi16(stp1_3, stp2_4);
-  tmp[1] = _mm_add_epi16(stp1_2, stp1_5);
-  tmp[2] = _mm_sub_epi16(stp1_3, stp2_4);
-  tmp[3] = _mm_sub_epi16(stp1_2, stp1_5);
+  transpose_16bit_4x4(in, in);
+  // in[0]: 00 10 20 30 01 11 21 31
+  // in[1]: 02 12 22 32 03 13 23 33
+
+  // stage 1
+  tmp[0] = _mm_unpacklo_epi64(in[0], in[0]);
+  tmp[1] = _mm_unpackhi_epi64(in[0], in[0]);
+  tmp[2] = _mm_unpacklo_epi64(in[1], in[1]);
+  tmp[3] = _mm_unpackhi_epi64(in[1], in[1]);
+  step1[4] = _mm_mulhrs_epi16(tmp[1], cp_28d_4d);    // step1 4&7
+  step1[5] = _mm_mulhrs_epi16(tmp[3], cp_n20d_12d);  // step1 5&6
+
+  // stage 2
+  step2[0] = _mm_mulhrs_epi16(tmp[0], cospi_16_64d);  // step2 0&1
+  step2[2] = _mm_mulhrs_epi16(tmp[2], cp_8d_24d);     // step2 3&2
+  step2[4] = _mm_add_epi16(step1[4], step1[5]);       // step2 4&7
+  step2[5] = _mm_sub_epi16(step1[4], step1[5]);       // step2 5&6
+  step2[6] = _mm_unpackhi_epi64(step2[5], step2[5]);  // step2 6
+
+  // stage 3
+  tmp[0] = _mm_unpacklo_epi16(step2[6], step2[5]);
+  step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, tmp[0]);  // step1 5&6
+  tmp[0] = _mm_add_epi16(step2[0], step2[2]);     // step1 0&1
+  tmp[1] = _mm_sub_epi16(step2[0], step2[2]);     // step1 3&2
+  step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]);  // step1 2&1
+  step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]);  // step1 3&0
+
+  // stage 4
+  tmp[0] = _mm_add_epi16(step1[3], step2[4]);  // output 3&0
+  tmp[1] = _mm_add_epi16(step1[2], step1[5]);  // output 2&1
+  tmp[2] = _mm_sub_epi16(step1[3], step2[4]);  // output 4&7
+  tmp[3] = _mm_sub_epi16(step1[2], step1[5]);  // output 5&6
+
+  // pass 2
   idct8x8_12_transpose_16bit_4x8(tmp, in);
 
-  /* Stage1 */
-  stp1_4 = _mm_mulhrs_epi16(in[1], stg1_0);
-  stp1_7 = _mm_mulhrs_epi16(in[1], stg1_1);
-  stp1_5 = _mm_mulhrs_epi16(in[3], stg1_2);
-  stp1_6 = _mm_mulhrs_epi16(in[3], stg1_3);
-
-  /* Stage2 */
-  stp2_0 = _mm_mulhrs_epi16(in[0], stg2_0);
-  stp2_1 = _mm_mulhrs_epi16(in[0], stg2_0);
-
-  stp2_2 = _mm_mulhrs_epi16(in[2], stg2_2);
-  stp2_3 = _mm_mulhrs_epi16(in[2], stg2_3);
-
-  stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
-  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
-  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
-  stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
-
-  /* Stage3 */
-  stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
-  stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
-  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
-  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
-
-  tmp[0] = _mm_unpacklo_epi16(stp2_6, stp2_5);
-  tmp[1] = _mm_unpackhi_epi16(stp2_6, stp2_5);
-
-  tmp[2] = _mm_madd_epi16(tmp[0], stk2_0);
-  tmp[3] = _mm_madd_epi16(tmp[1], stk2_0);
-  tmp[2] = _mm_add_epi32(tmp[2], rounding);
-  tmp[3] = _mm_add_epi32(tmp[3], rounding);
-  tmp[2] = _mm_srai_epi32(tmp[2], DCT_CONST_BITS);
-  tmp[3] = _mm_srai_epi32(tmp[3], DCT_CONST_BITS);
-  stp1_6 = _mm_packs_epi32(tmp[2], tmp[3]);
-
-  tmp[2] = _mm_madd_epi16(tmp[0], stk2_1);
-  tmp[3] = _mm_madd_epi16(tmp[1], stk2_1);
-  tmp[2] = _mm_add_epi32(tmp[2], rounding);
-  tmp[3] = _mm_add_epi32(tmp[3], rounding);
-  tmp[2] = _mm_srai_epi32(tmp[2], DCT_CONST_BITS);
-  tmp[3] = _mm_srai_epi32(tmp[3], DCT_CONST_BITS);
-  stp1_5 = _mm_packs_epi32(tmp[2], tmp[3]);
-
-  /* Stage4 */
-  in[0] = _mm_add_epi16(stp1_0, stp2_7);
-  in[1] = _mm_add_epi16(stp1_1, stp1_6);
-  in[2] = _mm_add_epi16(stp1_2, stp1_5);
-  in[3] = _mm_add_epi16(stp1_3, stp2_4);
-  in[4] = _mm_sub_epi16(stp1_3, stp2_4);
-  in[5] = _mm_sub_epi16(stp1_2, stp1_5);
-  in[6] = _mm_sub_epi16(stp1_1, stp1_6);
-  in[7] = _mm_sub_epi16(stp1_0, stp2_7);
+  // stage 1
+  step1[4] = _mm_mulhrs_epi16(in[1], cospi_28_64d);
+  step1[7] = _mm_mulhrs_epi16(in[1], cospi_4_64d);
+  step1[5] = _mm_mulhrs_epi16(in[3], cospi_n20_64d);
+  step1[6] = _mm_mulhrs_epi16(in[3], cospi_12_64d);
+
+  // stage 2
+  step2[0] = _mm_mulhrs_epi16(in[0], cospi_16_64d);  // step2[1] = step2[0]
+  step2[2] = _mm_mulhrs_epi16(in[2], cospi_24_64d);
+  step2[3] = _mm_mulhrs_epi16(in[2], cospi_8_64d);
+  step2[4] = _mm_add_epi16(step1[4], step1[5]);
+  step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+  step2[7] = _mm_add_epi16(step1[7], step1[6]);
+
+  // stage 3
+  step1[0] = _mm_add_epi16(step2[0], step2[3]);
+  step1[1] = _mm_add_epi16(step2[0], step2[2]);
+  step1[2] = _mm_sub_epi16(step2[0], step2[2]);
+  step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+  multiplication_and_add_2(&step2[6], &step2[5], &cp_16_n16, &cp_16_16,
+                           &step1[5], &step1[6]);
+
+  // stage 4
+  in[0] = _mm_add_epi16(step1[0], step2[7]);
+  in[1] = _mm_add_epi16(step1[1], step1[6]);
+  in[2] = _mm_add_epi16(step1[2], step1[5]);
+  in[3] = _mm_add_epi16(step1[3], step2[4]);
+  in[4] = _mm_sub_epi16(step1[3], step2[4]);
+  in[5] = _mm_sub_epi16(step1[2], step1[5]);
+  in[6] = _mm_sub_epi16(step1[1], step1[6]);
+  in[7] = _mm_sub_epi16(step1[0], step2[7]);
 
   write_buffer_8x8(in, dest, stride);
 }
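
Note on the doubled cosine constants used throughout this function:
_mm_mulhrs_epi16() computes (((a * b) >> 14) + 1) >> 1 in each 16-bit lane,
so a multiply by a doubled constant 2 * c rounds exactly like the
_mm_madd_epi16 / DCT_CONST_ROUNDING / _mm_srai_epi32(., DCT_CONST_BITS)
sequence this commit removes:

    (((a * 2c) >> 14) + 1) >> 1  ==  (a * c + (1 << 13)) >> 14

The standalone check below is an illustrative sketch, not part of the
commit; it assumes DCT_CONST_BITS == 14 (libvpx's fixed-point precision)
and uses 15137, the value of libvpx's cospi_8_64, as a sample constant.
Compile with -mssse3.

    #include <stdint.h>
    #include <stdio.h>
    #include <tmmintrin.h> /* SSSE3: _mm_mulhrs_epi16 */

    #define DCT_CONST_BITS 14
    #define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

    int main(void) {
      const int32_t c = 15137; /* cospi_8_64 */
      const __m128i k = _mm_set1_epi16(2 * 15137); /* doubled constant */
      int32_t x;
      for (x = -32768; x <= 32767; ++x) {
        /* Rounding used by the madd/round/shift sequence being replaced. */
        const int32_t ref = (x * c + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
        /* One PMULHRSW against the doubled constant. */
        const __m128i a = _mm_set1_epi16((int16_t)x);
        const int32_t got =
            (int16_t)_mm_extract_epi16(_mm_mulhrs_epi16(a, k), 0);
        if (ref != got) {
          printf("mismatch at %d: %d vs %d\n", x, ref, got);
          return 1;
        }
      }
      printf("identical over the whole int16 range\n");
      return 0;
    }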
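The dual_set_epi16() packing in pass 1 exploits that the _12 variant of the
8x8 inverse transform only has nonzero coefficients in the top-left 4x4
quadrant: each register carries two 4-sample vectors, so one PMULHRSW with a
packed constant computes two butterfly outputs at once (the "step1 4&7" and
"step1 5&6" comments above). The sketch below assumes dual_set_epi16()
broadcasts its first argument into the low four lanes and its second into
the high four (the real helper lives in vpx_dsp/x86/txfm_common_sse2.h);
the _sketch names and the sample data are hypothetical.

    #include <stdint.h>
    #include <stdio.h>
    #include <tmmintrin.h> /* SSSE3; compile with -mssse3 */

    /* Assumed behavior of dual_set_epi16(): first argument in the low
     * four 16-bit lanes, second argument in the high four. */
    static __m128i dual_set_epi16_sketch(int16_t a, int16_t b) {
      return _mm_set_epi16(b, b, b, b, a, a, a, a);
    }

    int main(void) {
      /* Pass 1 duplicates one 4-sample vector into both halves of a
       * register (tmp[3] above). Hypothetical sample data: */
      const __m128i row3x2 = _mm_set_epi16(40, 30, 20, 10, 40, 30, 20, 10);
      /* One packed constant multiplies the low half by -cospi_20_64 and
       * the high half by cospi_12_64 (9102 and 13623 in libvpx's 14-bit
       * fixed point), so a single PMULHRSW yields the "step1 5&6" pair. */
      const __m128i k = dual_set_epi16_sketch((int16_t)(-2 * 9102),
                                              (int16_t)(2 * 13623));
      const __m128i step1_5and6 = _mm_mulhrs_epi16(row3x2, k);
      int16_t out[8];
      _mm_storeu_si128((__m128i *)out, step1_5and6);
      printf("step1[5]: %d %d %d %d  step1[6]: %d %d %d %d\n", out[0],
             out[1], out[2], out[3], out[4], out[5], out[6], out[7]);
      return 0;
    }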