From: Jingning Han Date: Thu, 22 May 2014 16:44:40 +0000 (-0700) Subject: Inverse 16x16 2D-DCT SSSE3 implementation X-Git-Tag: v1.4.0~1499^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=48b089137023cb7df2e750658e1c532f02db2ee5;p=libvpx Inverse 16x16 2D-DCT SSSE3 implementation This commit enables the SSSE3 implementation of full inverse 16x16 2D-DCT. The unit runtime goes down from 1642 cycles to 1519 cycles, about 7% speed-up. Change-Id: I14d2fdf9da1fb4ed1e5db7ce24f77a1bfc8ea90d --- diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 182739620..c348424da 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -356,7 +356,7 @@ specialize qw/vp9_idct16x16_1_add sse2 neon_asm dspr2/; $vp9_idct16x16_1_add_neon_asm=vp9_idct16x16_1_add_neon; add_proto qw/void vp9_idct16x16_256_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; -specialize qw/vp9_idct16x16_256_add sse2 neon_asm dspr2/; +specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/; $vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon; add_proto qw/void vp9_idct16x16_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index 0231726dc..ff9c43221 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -8,12 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include -#include // SSE2 -#include "./vpx_config.h" -#include "vpx/vpx_integer.h" -#include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_idct.h" +#include "vp9/common/x86/vp9_idct_intrin_sse2.h" #define RECON_AND_STORE4X4(dest, in_x) \ { \ @@ -527,16 +522,6 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, out7 = _mm_subs_epi16(stp1_0, stp2_7); \ } -#define RECON_AND_STORE(dest, in_x) \ - { \ - __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - _mm_storel_epi64((__m128i *)(dest), d0); \ - dest += stride; \ - } - void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); @@ -627,36 +612,6 @@ void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { RECON_AND_STORE(dest, dc_value); } -// perform 8x8 transpose -static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); - - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); -} - static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) { const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); @@ -1573,23 +1528,6 @@ void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { } } -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - array_transpose_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; -} - static void iadst16_8col(__m128i *in) { // perform 16x16 1-D ADST for 8 columns __m128i s[16], x[16], u[32], v[32]; @@ -2416,82 +2354,6 @@ static void iadst16_sse2(__m128i *in0, __m128i *in1) { iadst16_8col(in1); } -static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) { - in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16)); - in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16)); - in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16)); - in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16)); - in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16)); - in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16)); - in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16)); - in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16)); - - in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16)); - in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16)); - in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16)); - in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16)); - in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16)); - in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16)); - in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16)); - in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16)); -} - -static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { - const __m128i final_rounding = _mm_set1_epi16(1<<5); - const __m128i zero = _mm_setzero_si128(); - // Final rounding and shift - in[0] = _mm_adds_epi16(in[0], final_rounding); - in[1] = _mm_adds_epi16(in[1], final_rounding); - in[2] = _mm_adds_epi16(in[2], final_rounding); - in[3] = _mm_adds_epi16(in[3], final_rounding); - in[4] = _mm_adds_epi16(in[4], final_rounding); - in[5] = _mm_adds_epi16(in[5], final_rounding); - in[6] = _mm_adds_epi16(in[6], final_rounding); - in[7] = _mm_adds_epi16(in[7], final_rounding); - in[8] = _mm_adds_epi16(in[8], final_rounding); - in[9] = _mm_adds_epi16(in[9], final_rounding); - in[10] = _mm_adds_epi16(in[10], final_rounding); - in[11] = _mm_adds_epi16(in[11], final_rounding); - in[12] = _mm_adds_epi16(in[12], final_rounding); - in[13] = _mm_adds_epi16(in[13], final_rounding); - in[14] = _mm_adds_epi16(in[14], final_rounding); - in[15] = _mm_adds_epi16(in[15], final_rounding); - - in[0] = _mm_srai_epi16(in[0], 6); - in[1] = _mm_srai_epi16(in[1], 6); - in[2] = _mm_srai_epi16(in[2], 6); - in[3] = _mm_srai_epi16(in[3], 6); - in[4] = _mm_srai_epi16(in[4], 6); - in[5] = _mm_srai_epi16(in[5], 6); - in[6] = _mm_srai_epi16(in[6], 6); - in[7] = _mm_srai_epi16(in[7], 6); - in[8] = _mm_srai_epi16(in[8], 6); - in[9] = _mm_srai_epi16(in[9], 6); - in[10] = _mm_srai_epi16(in[10], 6); - in[11] = _mm_srai_epi16(in[11], 6); - in[12] = _mm_srai_epi16(in[12], 6); - in[13] = _mm_srai_epi16(in[13], 6); - in[14] = _mm_srai_epi16(in[14], 6); - in[15] = _mm_srai_epi16(in[15], 6); - - RECON_AND_STORE(dest, in[0]); - RECON_AND_STORE(dest, in[1]); - RECON_AND_STORE(dest, in[2]); - RECON_AND_STORE(dest, in[3]); - RECON_AND_STORE(dest, in[4]); - RECON_AND_STORE(dest, in[5]); - RECON_AND_STORE(dest, in[6]); - RECON_AND_STORE(dest, in[7]); - RECON_AND_STORE(dest, in[8]); - RECON_AND_STORE(dest, in[9]); - RECON_AND_STORE(dest, in[10]); - RECON_AND_STORE(dest, in[11]); - RECON_AND_STORE(dest, in[12]); - RECON_AND_STORE(dest, in[13]); - RECON_AND_STORE(dest, in[14]); - RECON_AND_STORE(dest, in[15]); -} - void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride, int tx_type) { __m128i in0[16], in1[16]; diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.h b/vp9/common/x86/vp9_idct_intrin_sse2.h new file mode 100644 index 000000000..1c62e3272 --- /dev/null +++ b/vp9/common/x86/vp9_idct_intrin_sse2.h @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include // SSE2 +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_idct.h" + +// perform 8x8 transpose +static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { + const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); + + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + + res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); + res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); + res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); + res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); + res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); + res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); + res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); + res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); +} + +static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { + __m128i tbuf[8]; + array_transpose_8x8(res0, res0); + array_transpose_8x8(res1, tbuf); + array_transpose_8x8(res0 + 8, res1); + array_transpose_8x8(res1 + 8, res1 + 8); + + res0[8] = tbuf[0]; + res0[9] = tbuf[1]; + res0[10] = tbuf[2]; + res0[11] = tbuf[3]; + res0[12] = tbuf[4]; + res0[13] = tbuf[5]; + res0[14] = tbuf[6]; + res0[15] = tbuf[7]; +} + +static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16)); + in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16)); + in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16)); + in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16)); + in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16)); + in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16)); + in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16)); + in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16)); + + in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16)); + in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16)); + in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16)); + in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16)); + in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16)); + in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16)); + in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16)); + in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16)); +} + +#define RECON_AND_STORE(dest, in_x) \ + { \ + __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ + d0 = _mm_unpacklo_epi8(d0, zero); \ + d0 = _mm_add_epi16(in_x, d0); \ + d0 = _mm_packus_epi16(d0, d0); \ + _mm_storel_epi64((__m128i *)(dest), d0); \ + dest += stride; \ + } + +static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { + const __m128i final_rounding = _mm_set1_epi16(1<<5); + const __m128i zero = _mm_setzero_si128(); + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + + RECON_AND_STORE(dest, in[0]); + RECON_AND_STORE(dest, in[1]); + RECON_AND_STORE(dest, in[2]); + RECON_AND_STORE(dest, in[3]); + RECON_AND_STORE(dest, in[4]); + RECON_AND_STORE(dest, in[5]); + RECON_AND_STORE(dest, in[6]); + RECON_AND_STORE(dest, in[7]); + RECON_AND_STORE(dest, in[8]); + RECON_AND_STORE(dest, in[9]); + RECON_AND_STORE(dest, in[10]); + RECON_AND_STORE(dest, in[11]); + RECON_AND_STORE(dest, in[12]); + RECON_AND_STORE(dest, in[13]); + RECON_AND_STORE(dest, in[14]); + RECON_AND_STORE(dest, in[15]); +} diff --git a/vp9/common/x86/vp9_idct_intrin_ssse3.c b/vp9/common/x86/vp9_idct_intrin_ssse3.c new file mode 100644 index 000000000..9a6980e0f --- /dev/null +++ b/vp9/common/x86/vp9_idct_intrin_ssse3.c @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSSE3 +#include "vp9/common/x86/vp9_idct_intrin_sse2.h" + +static void idct16_8col(__m128i *in) { + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i k__cospi_p16_p16_x2 = pair_set_epi16(23170, 23170); + + __m128i v[16], u[16], s[16], t[16]; + + // stage 1 + s[0] = in[0]; + s[1] = in[8]; + s[2] = in[4]; + s[3] = in[12]; + s[4] = in[2]; + s[5] = in[10]; + s[6] = in[6]; + s[7] = in[14]; + s[8] = in[1]; + s[9] = in[9]; + s[10] = in[5]; + s[11] = in[13]; + s[12] = in[3]; + s[13] = in[11]; + s[14] = in[7]; + s[15] = in[15]; + + // stage 2 + u[0] = _mm_unpacklo_epi16(s[8], s[15]); + u[1] = _mm_unpackhi_epi16(s[8], s[15]); + u[2] = _mm_unpacklo_epi16(s[9], s[14]); + u[3] = _mm_unpackhi_epi16(s[9], s[14]); + u[4] = _mm_unpacklo_epi16(s[10], s[13]); + u[5] = _mm_unpackhi_epi16(s[10], s[13]); + u[6] = _mm_unpacklo_epi16(s[11], s[12]); + u[7] = _mm_unpackhi_epi16(s[11], s[12]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); + v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02); + v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); + v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); + v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); + v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); + v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); + v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14); + v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); + v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); + v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22); + v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); + v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); + v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); + v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); + v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[8] = _mm_packs_epi32(u[0], u[1]); + s[15] = _mm_packs_epi32(u[2], u[3]); + s[9] = _mm_packs_epi32(u[4], u[5]); + s[14] = _mm_packs_epi32(u[6], u[7]); + s[10] = _mm_packs_epi32(u[8], u[9]); + s[13] = _mm_packs_epi32(u[10], u[11]); + s[11] = _mm_packs_epi32(u[12], u[13]); + s[12] = _mm_packs_epi32(u[14], u[15]); + + // stage 3 + t[0] = s[0]; + t[1] = s[1]; + t[2] = s[2]; + t[3] = s[3]; + u[0] = _mm_unpacklo_epi16(s[4], s[7]); + u[1] = _mm_unpackhi_epi16(s[4], s[7]); + u[2] = _mm_unpacklo_epi16(s[5], s[6]); + u[3] = _mm_unpackhi_epi16(s[5], s[6]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); + v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); + v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); + v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); + v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); + v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); + v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); + v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + t[4] = _mm_packs_epi32(u[0], u[1]); + t[7] = _mm_packs_epi32(u[2], u[3]); + t[5] = _mm_packs_epi32(u[4], u[5]); + t[6] = _mm_packs_epi32(u[6], u[7]); + t[8] = _mm_add_epi16(s[8], s[9]); + t[9] = _mm_sub_epi16(s[8], s[9]); + t[10] = _mm_sub_epi16(s[11], s[10]); + t[11] = _mm_add_epi16(s[10], s[11]); + t[12] = _mm_add_epi16(s[12], s[13]); + t[13] = _mm_sub_epi16(s[12], s[13]); + t[14] = _mm_sub_epi16(s[15], s[14]); + t[15] = _mm_add_epi16(s[14], s[15]); + + // stage 4 + u[0] = _mm_add_epi16(t[0], t[1]); + u[1] = _mm_sub_epi16(t[0], t[1]); + u[2] = _mm_unpacklo_epi16(t[2], t[3]); + u[3] = _mm_unpackhi_epi16(t[2], t[3]); + u[4] = _mm_unpacklo_epi16(t[9], t[14]); + u[5] = _mm_unpackhi_epi16(t[9], t[14]); + u[6] = _mm_unpacklo_epi16(t[10], t[13]); + u[7] = _mm_unpackhi_epi16(t[10], t[13]); + + s[0] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2); + s[1] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2); + v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); + v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); + v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); + v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); + v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); + v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); + v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); + v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); + v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); + v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); + v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); + + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[2] = _mm_packs_epi32(u[4], u[5]); + s[3] = _mm_packs_epi32(u[6], u[7]); + s[4] = _mm_add_epi16(t[4], t[5]); + s[5] = _mm_sub_epi16(t[4], t[5]); + s[6] = _mm_sub_epi16(t[7], t[6]); + s[7] = _mm_add_epi16(t[6], t[7]); + s[8] = t[8]; + s[15] = t[15]; + s[9] = _mm_packs_epi32(u[8], u[9]); + s[14] = _mm_packs_epi32(u[10], u[11]); + s[10] = _mm_packs_epi32(u[12], u[13]); + s[13] = _mm_packs_epi32(u[14], u[15]); + s[11] = t[11]; + s[12] = t[12]; + + // stage 5 + t[0] = _mm_add_epi16(s[0], s[3]); + t[1] = _mm_add_epi16(s[1], s[2]); + t[2] = _mm_sub_epi16(s[1], s[2]); + t[3] = _mm_sub_epi16(s[0], s[3]); + t[4] = s[4]; + t[7] = s[7]; + + u[0] = _mm_sub_epi16(s[6], s[5]); + u[1] = _mm_add_epi16(s[6], s[5]); + t[5] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2); + t[6] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2); + + t[8] = _mm_add_epi16(s[8], s[11]); + t[9] = _mm_add_epi16(s[9], s[10]); + t[10] = _mm_sub_epi16(s[9], s[10]); + t[11] = _mm_sub_epi16(s[8], s[11]); + t[12] = _mm_sub_epi16(s[15], s[12]); + t[13] = _mm_sub_epi16(s[14], s[13]); + t[14] = _mm_add_epi16(s[13], s[14]); + t[15] = _mm_add_epi16(s[12], s[15]); + + // stage 6 + s[0] = _mm_add_epi16(t[0], t[7]); + s[1] = _mm_add_epi16(t[1], t[6]); + s[2] = _mm_add_epi16(t[2], t[5]); + s[3] = _mm_add_epi16(t[3], t[4]); + s[4] = _mm_sub_epi16(t[3], t[4]); + s[5] = _mm_sub_epi16(t[2], t[5]); + s[6] = _mm_sub_epi16(t[1], t[6]); + s[7] = _mm_sub_epi16(t[0], t[7]); + s[8] = t[8]; + s[9] = t[9]; + + u[0] = _mm_sub_epi16(t[13], t[10]); + u[1] = _mm_add_epi16(t[13], t[10]); + u[2] = _mm_sub_epi16(t[12], t[11]); + u[3] = _mm_add_epi16(t[12], t[11]); + + s[10] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2); + s[13] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2); + s[11] = _mm_mulhrs_epi16(u[2], k__cospi_p16_p16_x2); + s[12] = _mm_mulhrs_epi16(u[3], k__cospi_p16_p16_x2); + s[14] = t[14]; + s[15] = t[15]; + + // stage 7 + in[0] = _mm_add_epi16(s[0], s[15]); + in[1] = _mm_add_epi16(s[1], s[14]); + in[2] = _mm_add_epi16(s[2], s[13]); + in[3] = _mm_add_epi16(s[3], s[12]); + in[4] = _mm_add_epi16(s[4], s[11]); + in[5] = _mm_add_epi16(s[5], s[10]); + in[6] = _mm_add_epi16(s[6], s[9]); + in[7] = _mm_add_epi16(s[7], s[8]); + in[8] = _mm_sub_epi16(s[7], s[8]); + in[9] = _mm_sub_epi16(s[6], s[9]); + in[10] = _mm_sub_epi16(s[5], s[10]); + in[11] = _mm_sub_epi16(s[4], s[11]); + in[12] = _mm_sub_epi16(s[3], s[12]); + in[13] = _mm_sub_epi16(s[2], s[13]); + in[14] = _mm_sub_epi16(s[1], s[14]); + in[15] = _mm_sub_epi16(s[0], s[15]); +} + +static void idct16_sse2(__m128i *in0, __m128i *in1) { + array_transpose_16x16(in0, in1); + idct16_8col(in0); + idct16_8col(in1); +} + +void vp9_idct16x16_256_add_ssse3(const int16_t *input, uint8_t *dest, + int stride) { + __m128i in0[16], in1[16]; + + load_buffer_8x16(input, in0); + input += 8; + load_buffer_8x16(input, in1); + + idct16_sse2(in0, in1); + idct16_sse2(in0, in1); + + write_buffer_8x16(dest, in0, stride); + dest += 8; + write_buffer_8x16(dest, in1, stride); +} diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 7bbfbdd33..a22838e50 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -119,7 +119,8 @@ VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_horiz_loopfilter_d VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c - +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.h +VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_intrin_ssse3.c ifeq ($(ARCH_X86_64), yes) VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3_x86_64.asm endif