From: Jingning Han Date: Wed, 10 Jul 2013 19:11:09 +0000 (-0700) Subject: SSE2 4x4 inverse ADST/DCT transform X-Git-Tag: v1.3.0~888^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=49b630204437f1169fb091a1d4df72ea285a20fb;p=libvpx SSE2 4x4 inverse ADST/DCT transform Enable SSE2 4x4 inverse ADST/DCT transform. The runtime goes from 292 cycles down to 89 cycles. Running bus_cif at 2000 kbps, the overall runtime of speed 0 goes from 301s to 295s (2% speed-up). Change-Id: I24098136e7fee7ab2fbf1c11755bdf2ca37f3628 --- diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 459336e4d..c3800d827 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -325,7 +325,7 @@ prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int de specialize vp9_short_idct10_32x32_add prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type" -specialize vp9_short_iht4x4_add +specialize vp9_short_iht4x4_add sse2 prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type" specialize vp9_short_iht8x8_add diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index 599dcff93..4495b15ed 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -241,6 +241,155 @@ void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) { _mm_storel_epi64((__m128i *)output, in); } +static INLINE void transpose_4x4(__m128i *res) { + const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]); + res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); + res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); + + res[1] = _mm_unpackhi_epi64(res[0], res[0]); + res[3] = _mm_unpackhi_epi64(res[2], res[2]); +} + +void idct4_1d_sse2(__m128i *in) { + const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i k__cospi_p16_m16 = 
pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i u[8], v[8]; + + transpose_4x4(in); + // stage 1 + u[0] = _mm_unpacklo_epi16(in[0], in[2]); + u[1] = _mm_unpacklo_epi16(in[1], in[3]); + v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); + v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); + v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + + u[0] = _mm_packs_epi32(v[0], v[2]); + u[1] = _mm_packs_epi32(v[1], v[3]); + u[2] = _mm_unpackhi_epi64(u[0], u[0]); + u[3] = _mm_unpackhi_epi64(u[1], u[1]); + + // stage 2 + in[0] = _mm_add_epi16(u[0], u[3]); + in[1] = _mm_add_epi16(u[1], u[2]); + in[2] = _mm_sub_epi16(u[1], u[2]); + in[3] = _mm_sub_epi16(u[0], u[3]); +} + +void iadst4_1d_sse2(__m128i *in) { + const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); + const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); + const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); + const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); + const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9); + const __m128i kZero = _mm_set1_epi16(0); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i u[8], v[8], in7; + + transpose_4x4(in); + in7 = _mm_add_epi16(in[0], in[3]); + in7 = _mm_sub_epi16(in7, in[2]); + + u[0] = 
_mm_unpacklo_epi16(in[0], in[2]); + u[1] = _mm_unpacklo_epi16(in[1], in[3]); + u[2] = _mm_unpacklo_epi16(in7, kZero); + u[3] = _mm_unpacklo_epi16(in[1], kZero); + + v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 + v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 + v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2 + v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4 + v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6 + v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2 + + u[0] = _mm_add_epi32(v[0], v[1]); + u[1] = _mm_add_epi32(v[3], v[4]); + u[2] = v[2]; + u[3] = _mm_add_epi32(u[0], u[1]); + u[4] = _mm_slli_epi32(v[5], 2); + u[5] = _mm_add_epi32(u[3], v[5]); + u[6] = _mm_sub_epi32(u[5], u[4]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u[0], u[2]); + in[1] = _mm_packs_epi32(u[1], u[3]); + in[2] = _mm_unpackhi_epi64(in[0], in[0]); + in[3] = _mm_unpackhi_epi64(in[1], in[1]); +} + +void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride, + int tx_type) { + __m128i in[4]; + const __m128i zero = _mm_setzero_si128(); + const __m128i eight = _mm_set1_epi16(8); + + in[0] = _mm_loadl_epi64((__m128i *)input); + in[1] = _mm_loadl_epi64((__m128i *)(input + 4)); + in[2] = _mm_loadl_epi64((__m128i *)(input + 8)); + in[3] = _mm_loadl_epi64((__m128i *)(input + 12)); + + switch (tx_type) { + case 0: // DCT_DCT + idct4_1d_sse2(in); + idct4_1d_sse2(in); + break; + case 1: // ADST_DCT + idct4_1d_sse2(in); + iadst4_1d_sse2(in); + break; + case 2: // DCT_ADST + iadst4_1d_sse2(in); + idct4_1d_sse2(in); + break; + case 3: // ADST_ADST + 
iadst4_1d_sse2(in); + iadst4_1d_sse2(in); + break; + default: + assert(0); + break; + } + + // Final round and shift + in[0] = _mm_add_epi16(in[0], eight); + in[1] = _mm_add_epi16(in[1], eight); + in[2] = _mm_add_epi16(in[2], eight); + in[3] = _mm_add_epi16(in[3], eight); + + in[0] = _mm_srai_epi16(in[0], 4); + in[1] = _mm_srai_epi16(in[1], 4); + in[2] = _mm_srai_epi16(in[2], 4); + in[3] = _mm_srai_epi16(in[3], 4); + + RECON_AND_STORE4X4(dest, in[0]); + RECON_AND_STORE4X4(dest, in[1]); + RECON_AND_STORE4X4(dest, in[2]); + RECON_AND_STORE4X4(dest, in[3]); +} + #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3, out4, out5, out6, out7) \ { \