From: Linfeng Zhang
Date: Mon, 12 Jun 2017 23:23:53 +0000 (-0700)
Subject: Clean array_transpose_{4X8,16x16,16x16_2} in x86
X-Git-Tag: v1.7.0~395^2~1
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=d6eeef9ee6324af69a9fb19b1c507c29700ac28f;p=libvpx

Clean array_transpose_{4X8,16x16,16x16_2} in x86

Change-Id: I341399ecbde37065375ea7e63511a26bfc285ea0
---

diff --git a/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/vp9/encoder/x86/vp9_dct_intrin_sse2.c
index beb2695ab..969c60aba 100644
--- a/vp9/encoder/x86/vp9_dct_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_intrin_sse2.c
@@ -1131,23 +1131,6 @@ static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
   write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
 }
 
-static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
-  __m128i tbuf[8];
-  transpose_16bit_8x8(res0, res0);
-  transpose_16bit_8x8(res1, tbuf);
-  transpose_16bit_8x8(res0 + 8, res1);
-  transpose_16bit_8x8(res1 + 8, res1 + 8);
-
-  res0[8] = tbuf[0];
-  res0[9] = tbuf[1];
-  res0[10] = tbuf[2];
-  res0[11] = tbuf[3];
-  res0[12] = tbuf[4];
-  res0[13] = tbuf[5];
-  res0[14] = tbuf[6];
-  res0[15] = tbuf[7];
-}
-
 static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
   // perform rounding operations
   right_shift_8x8(res0, 2);
@@ -1951,13 +1934,13 @@ static void fadst16_8col(__m128i *in) {
 static void fdct16_sse2(__m128i *in0, __m128i *in1) {
   fdct16_8col(in0);
   fdct16_8col(in1);
-  array_transpose_16x16(in0, in1);
+  transpose_16bit_16x16(in0, in1);
 }
 
 static void fadst16_sse2(__m128i *in0, __m128i *in1) {
   fadst16_8col(in0);
   fadst16_8col(in1);
-  array_transpose_16x16(in0, in1);
+  transpose_16bit_16x16(in0, in1);
 }
 
 void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
diff --git a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
index c2b7262ad..1df91f08f 100644
--- a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
@@ -66,7 +66,7 @@ void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest,
   test = _mm_movemask_epi8(temp1);
 
   if (test) {
-    array_transpose_16x16(inptr, inptr + 16);
+    transpose_16bit_16x16(inptr, inptr + 16);
     for (i = 0; i < 16; i++) {
       sign_bits = _mm_cmplt_epi16(inptr[i], zero);
       temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
diff --git a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
index e98d547ee..c12e3e1b9 100644
--- a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
@@ -165,7 +165,7 @@ void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest,
 
   if (test) {
     // Use fact only first 4 rows contain non-zero coeffs
-    array_transpose_4X8(inptr, inptr);
+    transpose_16bit_4x8(inptr, inptr);
     for (i = 0; i < 4; i++) {
       sign_bits = _mm_cmplt_epi16(inptr[i], zero);
       temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c
index 4ff77b381..32f1b63b8 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -1462,13 +1462,13 @@ static void idct16_8col(__m128i *in) {
 }
 
 void idct16_sse2(__m128i *in0, __m128i *in1) {
-  array_transpose_16x16(in0, in1);
+  transpose_16bit_16x16(in0, in1);
   idct16_8col(in0);
   idct16_8col(in1);
 }
 
 void iadst16_sse2(__m128i *in0, __m128i *in1) {
-  array_transpose_16x16(in0, in1);
+  transpose_16bit_16x16(in0, in1);
   iadst16_8col(in0);
   iadst16_8col(in1);
 }
@@ -1616,7 +1616,7 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
   // Second 1-D inverse transform, performed per 8x16 block
   for (i = 0; i < 2; i++) {
     int j;
-    array_transpose_4X8(l + 8 * i, in);
+    transpose_16bit_4x8(l + 8 * i, in);
 
     IDCT16_10
 
diff --git a/vpx_dsp/x86/inv_txfm_sse2.h b/vpx_dsp/x86/inv_txfm_sse2.h
index 40fb9511c..bf86afd3c 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.h
+++ b/vpx_dsp/x86/inv_txfm_sse2.h
@@ -56,40 +56,6 @@ static INLINE void idct8x8_12_transpose_16bit_4x8(const __m128i *const in,
   out[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
 }
 
-static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-
-  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
-  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
-  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
-  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
-}
-
-static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
-  __m128i tbuf[8];
-  transpose_16bit_8x8(res0, res0);
-  transpose_16bit_8x8(res1, tbuf);
-  transpose_16bit_8x8(res0 + 8, res1);
-  transpose_16bit_8x8(res1 + 8, res1 + 8);
-
-  res0[8] = tbuf[0];
-  res0[9] = tbuf[1];
-  res0[10] = tbuf[2];
-  res0[11] = tbuf[3];
-  res0[12] = tbuf[4];
-  res0[13] = tbuf[5];
-  res0[14] = tbuf[6];
-  res0[15] = tbuf[7];
-}
-
 static INLINE __m128i dct_const_round_shift_sse2(const __m128i in) {
   const __m128i t = _mm_add_epi32(in, _mm_set1_epi32(DCT_CONST_ROUNDING));
   return _mm_srai_epi32(t, DCT_CONST_BITS);
diff --git a/vpx_dsp/x86/inv_txfm_ssse3.c b/vpx_dsp/x86/inv_txfm_ssse3.c
index 94504e478..3ea43769f 100644
--- a/vpx_dsp/x86/inv_txfm_ssse3.c
+++ b/vpx_dsp/x86/inv_txfm_ssse3.c
@@ -670,14 +670,6 @@ static void load_buffer_16x16(const tran_low_t *input, __m128i *in0,
   }
 }
 
-static void array_transpose_16x16_2(__m128i *in0, __m128i *in1, __m128i *out0,
-                                    __m128i *out1) {
-  transpose_16bit_8x8(in0, out0);
-  transpose_16bit_8x8(&in0[8], out1);
-  transpose_16bit_8x8(in1, &out0[8]);
-  transpose_16bit_8x8(&in1[8], &out1[8]);
-}
-
 // Group the coefficient calculation into smaller functions
 // to prevent stack spillover:
 // quarter_1: 0-7
@@ -986,7 +978,7 @@ static void transpose_and_copy_16x16(__m128i *in0, __m128i *in1, __m128i *store,
   switch (cols) {
     case left_16: {
       int i;
-      array_transpose_16x16(in0, in1);
+      transpose_16bit_16x16(in0, in1);
       for (i = 0; i < 16; ++i) {
         store[i] = in0[16 + i];
         store[16 + i] = in1[16 + i];
@@ -994,7 +986,10 @@ static void transpose_and_copy_16x16(__m128i *in0, __m128i *in1, __m128i *store,
       break;
     }
     case right_16: {
-      array_transpose_16x16_2(store, &store[16], in0, in1);
+      transpose_16bit_8x8(store, in0);
+      transpose_16bit_8x8(&store[8], in1);
+      transpose_16bit_8x8(&store[16], &in0[8]);
+      transpose_16bit_8x8(&store[24], &in1[8]);
       break;
     }
     default: { assert(0); }
@@ -1013,7 +1008,7 @@ void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest,
   load_buffer_16x16(input, col0, col1);
 
   // columns
-  array_transpose_16x16(col0, col1);
+  transpose_16bit_16x16(col0, col1);
   idct32_135(col0, col1);
 
   // rows
diff --git a/vpx_dsp/x86/transpose_sse2.h b/vpx_dsp/x86/transpose_sse2.h
index bec59f5f9..cac007474 100644
--- a/vpx_dsp/x86/transpose_sse2.h
+++ b/vpx_dsp/x86/transpose_sse2.h
@@ -33,6 +33,48 @@ static INLINE void transpose_16bit_4x4(const __m128i *const in,
   out[1] = _mm_unpackhi_epi32(tr0_0, tr0_1);
 }
 
+static INLINE void transpose_16bit_4x8(const __m128i *const in,
+                                       __m128i *const out) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03 XX XX XX XX
+  // in[1]: 10 11 12 13 XX XX XX XX
+  // in[2]: 20 21 22 23 XX XX XX XX
+  // in[3]: 30 31 32 33 XX XX XX XX
+  // in[4]: 40 41 42 43 XX XX XX XX
+  // in[5]: 50 51 52 53 XX XX XX XX
+  // in[6]: 60 61 62 63 XX XX XX XX
+  // in[7]: 70 71 72 73 XX XX XX XX
+  // to:
+  // tr0_0: 00 10 01 11 02 12 03 13
+  // tr0_1: 20 30 21 31 22 32 23 33
+  // tr0_2: 40 50 41 51 42 52 43 53
+  // tr0_3: 60 70 61 71 62 72 63 73
+  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i tr0_2 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i tr0_3 = _mm_unpacklo_epi16(in[6], in[7]);
+
+  // Unpack 32 bit elements resulting in:
+  // tr1_0: 00 10 20 30 01 11 21 31
+  // tr1_1: 40 50 60 70 41 51 61 71
+  // tr1_2: 02 12 22 32 03 13 23 33
+  // tr1_3: 42 52 62 72 43 53 63 73
+  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30 40 50 60 70
+  // out[1]: 01 11 21 31 41 51 61 71
+  // out[2]: 02 12 22 32 42 52 62 72
+  // out[3]: 03 13 23 33 43 53 63 73
+  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+}
+
 static INLINE void transpose_16bit_8x8(const __m128i *const in,
                                        __m128i *const out) {
   // Unpack 16 bit elements. Goes from:
@@ -99,6 +141,25 @@ static INLINE void transpose_16bit_8x8(const __m128i *const in,
   out[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
 }
 
+// Transpose in-place
+static INLINE void transpose_16bit_16x16(__m128i *const left,
+                                         __m128i *const right) {
+  __m128i tbuf[8];
+  transpose_16bit_8x8(left, left);
+  transpose_16bit_8x8(right, tbuf);
+  transpose_16bit_8x8(left + 8, right);
+  transpose_16bit_8x8(right + 8, right + 8);
+
+  left[8] = tbuf[0];
+  left[9] = tbuf[1];
+  left[10] = tbuf[2];
+  left[11] = tbuf[3];
+  left[12] = tbuf[4];
+  left[13] = tbuf[5];
+  left[14] = tbuf[6];
+  left[15] = tbuf[7];
+}
+
 static INLINE void transpose_32bit_4x4(__m128i *const a0, __m128i *const a1,
                                        __m128i *const a2, __m128i *const a3) {
   // Unpack 32 bit elements. Goes from:
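
Note (not part of the patch): below is a minimal standalone harness that sanity-checks a transpose_16bit_4x8() built exactly as in the transpose_sse2.h hunk above. The function body is copied from the patch; the harness itself, the file name transpose_check.c, the 10*r + c test pattern, and the build line are illustrative assumptions. Build with, e.g., gcc -msse2 -O2 transpose_check.c && ./a.out; a silent exit means every output lane held the expected column value.

#include <assert.h>
#include <emmintrin.h>  // SSE2 intrinsics, the only ISA these helpers need
#include <stdint.h>

// Copied from the patch: transposes the low four 16-bit lanes of eight
// input rows into four output vectors of eight 16-bit columns each.
static void transpose_16bit_4x8(const __m128i *const in, __m128i *const out) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i tr0_2 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i tr0_3 = _mm_unpacklo_epi16(in[6], in[7]);

  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);

  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
}

int main(void) {
  int16_t rows[8][8];
  __m128i in[8], out[4];
  int r, c;

  // Row r, lane c holds 10 * r + c. Only the low four lanes of each row
  // are read by the transpose, matching the "XX" lanes in the comments.
  for (r = 0; r < 8; r++)
    for (c = 0; c < 8; c++) rows[r][c] = (int16_t)(10 * r + c);
  for (r = 0; r < 8; r++) in[r] = _mm_loadu_si128((const __m128i *)rows[r]);

  transpose_16bit_4x8(in, out);

  // out[c] must now hold column c across all eight rows: 0c 1c 2c ... 7c.
  for (c = 0; c < 4; c++) {
    int16_t lanes[8];
    _mm_storeu_si128((__m128i *)lanes, out[c]);
    for (r = 0; r < 8; r++) assert(lanes[r] == (int16_t)(10 * r + c));
  }
  return 0;
}

The same pattern scales up to transpose_16bit_16x16(): it applies transpose_16bit_8x8() to the four 8x8 quadrants, transposing the diagonal quadrants in place and swapping the two off-diagonal ones, with tbuf holding one quadrant so it is not overwritten before it is read. That buffering is what lets the 16x16 version run in place.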