From: Abo Talib Mahfoodh Date: Thu, 21 Nov 2013 20:00:20 +0000 (-0500) Subject: Improve vp9_fdct4x4_sse2 (x1.2) X-Git-Tag: v1.4.0~2969^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ec2dbdd10737443e2f45e9aec915a1f89d1f684b;p=libvpx Improve vp9_fdct4x4_sse2 (x1.2) Modifications are done to reduce the total clock cycle. Speedup: 1.2 Tested with: park_joy_420_720p50.y4m Change-Id: Ia36b87e62e2f80a5fadaf5628729aedc80f38f3f --- diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c index fefca660d..65431bdbf 100644 --- a/vp9/encoder/x86/vp9_dct_sse2.c +++ b/vp9/encoder/x86/vp9_dct_sse2.c @@ -26,24 +26,25 @@ void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) { // by constructing the 32 bit constant corresponding to that pair. const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); const __m128i kOne = _mm_set1_epi16(1); - __m128i in0, in1, in2, in3; + __m128i in0, in1; // Load inputs. { in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); - in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); + in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *) + (input + 1 * stride))); + in1 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); + in1 = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *) + (input + 3 * stride)), in1); + // x = x << 4 in0 = _mm_slli_epi16(in0, 4); in1 = _mm_slli_epi16(in1, 4); - in2 = _mm_slli_epi16(in2, 4); - in3 = _mm_slli_epi16(in3, 4); // if (i == 0 && input[0]) input[0] += 1; { // The mask will only contain wether the first value is zero, all @@ -60,18 +61,18 @@ void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) { // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { // Transform 1/2: Add/substract - const __m128i r0 = _mm_add_epi16(in0, in3); - const __m128i r1 = _mm_add_epi16(in1, in2); - const __m128i r2 = _mm_sub_epi16(in1, in2); - const __m128i r3 = _mm_sub_epi16(in0, in3); + const __m128i r0 = _mm_add_epi16(in0, in1); + const __m128i r1 = _mm_sub_epi16(in0, in1); + const __m128i r2 = _mm_unpacklo_epi64(r0, r1); + const __m128i r3 = _mm_unpackhi_epi64(r0, r1); // Transform 1/2: Interleave to do the multiply by constants which gets us // into 32 bits. - const __m128i t0 = _mm_unpacklo_epi16(r0, r1); - const __m128i t2 = _mm_unpacklo_epi16(r2, r3); + const __m128i t0 = _mm_unpacklo_epi16(r2, r3); + const __m128i t2 = _mm_unpackhi_epi16(r2, r3); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); + const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p08_p24); + const __m128i u6 = _mm_madd_epi16(t2, k__cospi_p24_m08); const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); @@ -90,24 +91,21 @@ void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) { // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - in2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); + in1 = _mm_shuffle_epi32(in1, 0x4E); // 00 10 20 30 01 11 21 31 in0 contains 0 followed by 1 - // 02 12 22 32 03 13 23 33 in2 contains 2 followed by 3 - if (0 == pass) { - // Extract values in the high part for second pass as transform code - // only uses the first four values. - in1 = _mm_unpackhi_epi64(in0, in0); - in3 = _mm_unpackhi_epi64(in2, in2); - } else { - // Post-condition output and store it (v + 1) >> 2, taking advantage - // of the fact 1/3 are stored just after 0/2. - __m128i out01 = _mm_add_epi16(in0, kOne); - __m128i out23 = _mm_add_epi16(in2, kOne); - out01 = _mm_srai_epi16(out01, 2); - out23 = _mm_srai_epi16(out23, 2); - _mm_storeu_si128((__m128i *)(output + 0 * 4), out01); - _mm_storeu_si128((__m128i *)(output + 2 * 4), out23); - } + // 02 12 22 32 03 13 23 33 in1 contains 2 followed by 3 + } + in1 = _mm_shuffle_epi32(in1, 0x4E); + // Post-condition output and store it (v + 1) >> 2, taking advantage + // of the fact 1/3 are stored just after 0/2. + { + __m128i out01 = _mm_add_epi16(in0, kOne); + __m128i out23 = _mm_add_epi16(in1, kOne); + out01 = _mm_srai_epi16(out01, 2); + out23 = _mm_srai_epi16(out23, 2); + _mm_storeu_si128((__m128i *)(output + 0 * 4), out01); + _mm_storeu_si128((__m128i *)(output + 2 * 4), out23); } }