From 908a992d7fde8218139c0ad298d8a60cc66650b6 Mon Sep 17 00:00:00 2001 From: Abo Talib Mahfoodh Date: Mon, 21 Oct 2013 21:11:35 -0400 Subject: [PATCH] Improve vp9_idct4x4_16_add_sse2 Simple modification to reduce number of cycles in the function. Original function number of cycles: 973 Modified function number of cycles: 835 Improvement factor: 1.165 Tested with: park_joy_420_720p50.y4m Change-Id: Ic5857272ea3aafe21d5ef9a69258d78c688f69bd --- vp9/common/x86/vp9_idct_intrin_sse2.c | 103 ++++++++++++++------------ 1 file changed, 55 insertions(+), 48 deletions(-) diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index cfec36b42..0bd545989 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -15,6 +15,16 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_idct.h" +#define RECON_AND_STORE4X4(dest, in_x) \ +{ \ + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ + d0 = _mm_unpacklo_epi8(d0, zero); \ + d0 = _mm_add_epi16(in_x, d0); \ + d0 = _mm_packus_epi16(d0, d0); \ + *(int *)dest = _mm_cvtsi128_si32(d0); \ + dest += stride; \ +} + void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); @@ -26,21 +36,19 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { __m128i input0, input1, input2, input3; // Rows - input0 = _mm_loadl_epi64((const __m128i *)input); - input1 = _mm_loadl_epi64((const __m128i *)(input + 4)); - input2 = _mm_loadl_epi64((const __m128i *)(input + 8)); - input3 = _mm_loadl_epi64((const __m128i *)(input + 12)); + input0 = _mm_load_si128((const __m128i *)input); + input2 = _mm_load_si128((const __m128i *)(input + 8)); // Construct i3, i1, i3, i1, i2, i0, i2, i0 input0 = _mm_shufflelo_epi16(input0, 0xd8); - input1 = _mm_shufflelo_epi16(input1, 0xd8); + input0 = _mm_shufflehi_epi16(input0, 0xd8); input2 = 
_mm_shufflelo_epi16(input2, 0xd8); - input3 = _mm_shufflelo_epi16(input3, 0xd8); + input2 = _mm_shufflehi_epi16(input2, 0xd8); + input1 = _mm_unpackhi_epi32(input0, input0); input0 = _mm_unpacklo_epi32(input0, input0); - input1 = _mm_unpacklo_epi32(input1, input1); + input3 = _mm_unpackhi_epi32(input2, input2); input2 = _mm_unpacklo_epi32(input2, input2); - input3 = _mm_unpacklo_epi32(input3, input3); // Stage 1 input0 = _mm_madd_epi16(input0, cst); @@ -59,16 +67,14 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); // Stage 2 - input0 = _mm_packs_epi32(input0, zero); - input1 = _mm_packs_epi32(input1, zero); - input2 = _mm_packs_epi32(input2, zero); - input3 = _mm_packs_epi32(input3, zero); + input0 = _mm_packs_epi32(input0, input1); + input1 = _mm_packs_epi32(input2, input3); // Transpose - input1 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpacklo_epi16(input2, input3); - input0 = _mm_unpacklo_epi32(input1, input3); - input1 = _mm_unpackhi_epi32(input1, input3); + input2 = _mm_unpacklo_epi16(input0, input1); + input3 = _mm_unpackhi_epi16(input0, input1); + input0 = _mm_unpacklo_epi32(input2, input3); + input1 = _mm_unpackhi_epi32(input2, input3); // Switch column2, column 3, and then, we got: // input2: column1, column 0; input3: column2, column 3. 
@@ -78,14 +84,9 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { // Columns // Construct i3, i1, i3, i1, i2, i0, i2, i0 - input0 = _mm_shufflelo_epi16(input2, 0xd8); - input1 = _mm_shufflehi_epi16(input2, 0xd8); - input2 = _mm_shufflehi_epi16(input3, 0xd8); - input3 = _mm_shufflelo_epi16(input3, 0xd8); - - input0 = _mm_unpacklo_epi32(input0, input0); - input1 = _mm_unpackhi_epi32(input1, input1); - input2 = _mm_unpackhi_epi32(input2, input2); + input0 = _mm_unpacklo_epi32(input2, input2); + input1 = _mm_unpackhi_epi32(input2, input2); + input2 = _mm_unpackhi_epi32(input3, input3); input3 = _mm_unpacklo_epi32(input3, input3); // Stage 1 @@ -105,16 +106,14 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); // Stage 2 - input0 = _mm_packs_epi32(input0, zero); - input1 = _mm_packs_epi32(input1, zero); - input2 = _mm_packs_epi32(input2, zero); - input3 = _mm_packs_epi32(input3, zero); + input0 = _mm_packs_epi32(input0, input2); + input1 = _mm_packs_epi32(input1, input3); // Transpose - input1 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpacklo_epi16(input2, input3); - input0 = _mm_unpacklo_epi32(input1, input3); - input1 = _mm_unpackhi_epi32(input1, input3); + input2 = _mm_unpacklo_epi16(input0, input1); + input3 = _mm_unpackhi_epi16(input0, input1); + input0 = _mm_unpacklo_epi32(input2, input3); + input1 = _mm_unpackhi_epi32(input2, input3); // Switch column2, column 3, and then, we got: // input2: column1, column 0; input3: column2, column 3. 
@@ -129,23 +128,31 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { input2 = _mm_srai_epi16(input2, 4); input3 = _mm_srai_epi16(input3, 4); -#define RECON_AND_STORE4X4(dest, in_x) \ - { \ - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - *(int *)dest = _mm_cvtsi128_si32(d0); \ - dest += stride; \ + // Reconstruction and Store + { + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); + __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); + d0 = _mm_unpacklo_epi32(d0, + _mm_cvtsi32_si128(*(const int *) (dest + stride))); + d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128( + *(const int *) (dest + stride * 3)), d2); + d0 = _mm_unpacklo_epi8(d0, zero); + d2 = _mm_unpacklo_epi8(d2, zero); + d0 = _mm_add_epi16(d0, input2); + d2 = _mm_add_epi16(d2, input3); + d0 = _mm_packus_epi16(d0, d2); + // store input0 + *(int *)dest = _mm_cvtsi128_si32(d0); + // store input1 + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); + // store input2 + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); + // store input3 + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); } - - input0 = _mm_srli_si128(input2, 8); - input1 = _mm_srli_si128(input3, 8); - - RECON_AND_STORE4X4(dest, input2); - RECON_AND_STORE4X4(dest, input0); - RECON_AND_STORE4X4(dest, input1); - RECON_AND_STORE4X4(dest, input3); } void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { -- 2.40.0