From: Yi Luo
Date: Wed, 17 Feb 2016 22:49:28 +0000 (-0800)
Subject: Initial SSE2 function fdst4_sse2().
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=5456aee6fc4c81fa838779688a008d4ffd93f2fd;p=libvpx

Initial SSE2 function fdst4_sse2().

Applied DST sse2 to 4x4 transform.
Fixed DST coefficient packing to satisfy 4x4 transpose requirement.

Change-Id: I9164714c77049523dbbc9e145ebb10d7911fba9d
---

diff --git a/vp10/encoder/hybrid_fwd_txfm.c b/vp10/encoder/hybrid_fwd_txfm.c
index 0f59259e7..6507f98f3 100644
--- a/vp10/encoder/hybrid_fwd_txfm.c
+++ b/vp10/encoder/hybrid_fwd_txfm.c
@@ -69,8 +69,6 @@ void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-      vp10_fht4x4(src_diff, coeff, diff_stride, tx_type);
-      break;
     case DST_DST:
     case DCT_DST:
     case DST_DCT:
@@ -78,8 +76,7 @@ void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
     case ADST_DST:
     case DST_FLIPADST:
     case FLIPADST_DST:
-      // Use C version since DST exists only in C
-      vp10_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
+      vp10_fht4x4(src_diff, coeff, diff_stride, tx_type);  // was vp10_fht4x4_c
       break;
     case IDTX:
       fwd_idtx_c(src_diff, coeff, diff_stride, 4);
diff --git a/vp10/encoder/x86/dct_sse2.c b/vp10/encoder/x86/dct_sse2.c
index 976fe45fb..1cba80372 100644
--- a/vp10/encoder/x86/dct_sse2.c
+++ b/vp10/encoder/x86/dct_sse2.c
@@ -172,6 +172,42 @@ static void fadst4_sse2(__m128i *in) {
   transpose_4x4(in);
 }
 
+#if CONFIG_EXT_TX
+static void fdst4_sse2(__m128i *in) {  // DST stage; rewrites in[] in place
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t) cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  // NOTE(review): pair_set_epi16(a, b) presumably repeats the pair (a, b).
+  __m128i u[4], v[4];
+  // Interleave the low four 16-bit lanes: (in[0], in[1]) and (in[3], in[2]).
+  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+  u[1] = _mm_unpacklo_epi16(in[3], in[2]);
+  // v[0] = lane-wise sums, v[1] = lane-wise differences of the two vectors.
+  v[0] = _mm_add_epi16(u[0], u[1]);
+  v[1] = _mm_sub_epi16(u[0], u[1]);
+  // madd: multiply adjacent 16-bit pairs, add each product pair to 32 bits.
+  u[0] = _mm_madd_epi16(v[0], k__cospi_p24_p08);
+  u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
+  u[2] = _mm_madd_epi16(v[0], k__cospi_p08_m24);
+  u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
+  // Round: add DCT_CONST_ROUNDING, arithmetic shift right DCT_CONST_BITS.
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  // Narrow to 16 bits with signed saturation, then call transpose_4x4().
+  in[0] = _mm_packs_epi32(u[0], u[2]);
+  in[1] = _mm_packs_epi32(u[1], u[3]);
+  transpose_4x4(in);
+}
+#endif  // CONFIG_EXT_TX
+
 void vp10_fht4x4_sse2(const int16_t *input, tran_low_t *output,
                       int stride, int tx_type) {
   __m128i in[4];
@@ -229,6 +265,48 @@ void vp10_fht4x4_sse2(const int16_t *input, tran_low_t *output,
       fadst4_sse2(in);
       write_buffer_4x4(output, in);
       break;
+    case DST_DST:
+      load_buffer_4x4(input, in, stride, 0, 0);
+      fdst4_sse2(in);
+      fdst4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case DCT_DST:
+      load_buffer_4x4(input, in, stride, 0, 0);
+      fdct4_sse2(in);
+      fdst4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case DST_DCT:
+      load_buffer_4x4(input, in, stride, 0, 0);
+      fdst4_sse2(in);
+      fdct4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case DST_ADST:
+      load_buffer_4x4(input, in, stride, 0, 0);
+      fdst4_sse2(in);
+      fadst4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case ADST_DST:
+      load_buffer_4x4(input, in, stride, 0, 0);
+      fadst4_sse2(in);
+      fdst4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case DST_FLIPADST:
+      load_buffer_4x4(input, in, stride, 0, 1);  // flip flags? TODO confirm
+      fdst4_sse2(in);
+      fadst4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case FLIPADST_DST:
+      load_buffer_4x4(input, in, stride, 1, 0);  // flip flags? TODO confirm
+      fadst4_sse2(in);
+      fdst4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
 #endif  // CONFIG_EXT_TX
     default:
       assert(0);