From 11ca81f8b64fe5af24a800ba2cfb0f0d37d56ed5 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Fri, 8 Mar 2013 10:54:30 -0800 Subject: [PATCH] Add vp9_idct4_1d_sse2 Added SSE2 idct4_1d which is called by vp9_short_iht4x4. Also, modified the parameter type passed to vp9_short_iht functions to make it work with rtcd prototype. Change-Id: I81ba7cb4db6738f1923383b52a06deb760923ffe --- vp9/common/vp9_idctllm.c | 34 ++++++++++++------------ vp9/common/vp9_rtcd_defs.sh | 3 +++ vp9/common/x86/vp9_idctllm_x86.c | 44 +++++++++++++++++++++++++++++--- 3 files changed, 60 insertions(+), 21 deletions(-) diff --git a/vp9/common/vp9_idctllm.c b/vp9/common/vp9_idctllm.c index 54b79ee64..e2106250f 100644 --- a/vp9/common/vp9_idctllm.c +++ b/vp9/common/vp9_idctllm.c @@ -26,6 +26,7 @@ #include #include "./vpx_config.h" +#include "./vp9_rtcd.h" #include "vp9/common/vp9_systemdependent.h" #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_common.h" @@ -109,7 +110,7 @@ void vp9_dc_only_inv_walsh_add_c(int input_dc, uint8_t *pred_ptr, } } -static void idct4_1d(int16_t *input, int16_t *output) { +void vp9_idct4_1d_c(int16_t *input, int16_t *output) { int16_t step[4]; int temp1, temp2; // stage 1 @@ -140,7 +141,7 @@ void vp9_short_idct4x4llm_c(int16_t *input, int16_t *output, int pitch) { for (i = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) temp_in[j] = input[j]; - idct4_1d(temp_in, outptr); + vp9_idct4_1d(temp_in, outptr); input += 4; outptr += 4; } @@ -149,7 +150,7 @@ void vp9_short_idct4x4llm_c(int16_t *input, int16_t *output, int pitch) { for (i = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - idct4_1d(temp_in, temp_out); + vp9_idct4_1d(temp_in, temp_out); for (j = 0; j < 4; ++j) output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4); } @@ -205,7 +206,7 @@ static void idct8_1d(int16_t *input, int16_t *output) { step1[6] = dct_const_round_shift(temp2); // stage 2 & stage 3 - even half - idct4_1d(step1, step1); + vp9_idct4_1d(step1, step1); // stage 2 - odd half step2[4] = step1[4] + step1[5]; @@ -298,24 +299,23 @@ static void iadst4_1d(int16_t *input, int16_t *output) { output[3] = dct_const_round_shift(s3); } -static const transform_2d IHT_4[] = { - { idct4_1d, idct4_1d }, // DCT_DCT = 0 - { iadst4_1d, idct4_1d }, // ADST_DCT = 1 - { idct4_1d, iadst4_1d }, // DCT_ADST = 2 - { iadst4_1d, iadst4_1d } // ADST_ADST = 3 -}; - void vp9_short_iht4x4_c(int16_t *input, int16_t *output, - int pitch, TX_TYPE tx_type) { + int pitch, int tx_type) { + const transform_2d IHT_4[] = { + { vp9_idct4_1d, vp9_idct4_1d }, // DCT_DCT = 0 + { iadst4_1d, vp9_idct4_1d }, // ADST_DCT = 1 + { vp9_idct4_1d, iadst4_1d }, // DCT_ADST = 2 + { iadst4_1d, iadst4_1d } // ADST_ADST = 3 + }; + int i, j; int16_t out[4 * 4]; int16_t *outptr = out; int16_t temp_in[4], temp_out[4]; - const transform_2d ht = IHT_4[tx_type]; // inverse transform row vectors for (i = 0; i < 4; ++i) { - ht.rows(input, outptr); + IHT_4[tx_type].rows(input, outptr); input += 4; outptr += 4; } @@ -324,7 +324,7 @@ void vp9_short_iht4x4_c(int16_t *input, int16_t *output, for (i = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - ht.cols(temp_in, temp_out); + IHT_4[tx_type].cols(temp_in, temp_out); for (j = 0; j < 4; ++j) output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4); } @@ -415,7 +415,7 @@ static const transform_2d IHT_8[] = { }; void vp9_short_iht8x8_c(int16_t *input, int16_t *output, - int pitch, TX_TYPE tx_type) { + int pitch, int tx_type) { int i, j; int16_t out[8 * 8]; int16_t *outptr = out; @@ -838,7 +838,7 @@ static const transform_2d IHT_16[] = { }; void vp9_short_iht16x16_c(int16_t *input, int16_t *output, - int pitch, TX_TYPE tx_type) { + int pitch, int tx_type) { int i, j; int16_t out[16 * 16]; int16_t *outptr = out; diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 48ae860a9..04b67b925 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -322,6 +322,9 @@ specialize vp9_short_iht4x4 prototype void vp9_short_iht16x16 "int16_t *input, int16_t *output, int pitch, int tx_type" specialize vp9_short_iht16x16 +prototype void vp9_idct4_1d "int16_t *input, int16_t *output" +specialize vp9_idct4_1d sse2 + # dct and add prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride" diff --git a/vp9/common/x86/vp9_idctllm_x86.c b/vp9/common/x86/vp9_idctllm_x86.c index 7b3c57967..3d7a1481c 100644 --- a/vp9/common/x86/vp9_idctllm_x86.c +++ b/vp9/common/x86/vp9_idctllm_x86.c @@ -77,10 +77,10 @@ void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr, void vp9_short_idct4x4llm_sse2(int16_t *input, int16_t *output, int pitch) { const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); - const __m128i cst = _mm_setr_epi16((short)cospi_16_64, (short)cospi_16_64, - (short)cospi_16_64, (short)-cospi_16_64, - (short)cospi_24_64, (short)-cospi_8_64, - (short)cospi_8_64, (short)cospi_24_64); + const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, + (int16_t)cospi_16_64, (int16_t)-cospi_16_64, + (int16_t)cospi_24_64, (int16_t)-cospi_8_64, + (int16_t)cospi_8_64, (int16_t)cospi_24_64); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const int half_pitch = pitch >> 1; __m128i input0, input1, input2, input3; @@ -198,4 +198,40 @@ void vp9_short_idct4x4llm_sse2(int16_t *input, int16_t *output, int pitch) { input3 = _mm_srli_si128(input3, 8); _mm_storel_epi64((__m128i *)(output + 2 * half_pitch), input3); } + +void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) { + const __m128i zero = _mm_setzero_si128(); + const __m128i c1 = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, + (int16_t)cospi_16_64, (int16_t)-cospi_16_64, + (int16_t)cospi_24_64, (int16_t)-cospi_8_64, + (int16_t)cospi_8_64, (int16_t)cospi_24_64); + const __m128i c2 = _mm_setr_epi16(1, 1, 1, 1, 1, -1, 1, -1); + + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i in, temp; + + // Load input data. + in = _mm_loadl_epi64((__m128i *)input); + + // Construct i3, i1, i3, i1, i2, i0, i2, i0 + in = _mm_shufflelo_epi16(in, 0xd8); + in = _mm_unpacklo_epi32(in, in); + + // Stage 1 + in = _mm_madd_epi16(in, c1); + in = _mm_add_epi32(in, rounding); + in = _mm_srai_epi32(in, DCT_CONST_BITS); + in = _mm_packs_epi32(in, zero); + + // Stage 2 + temp = _mm_shufflelo_epi16(in, 0x9c); + in = _mm_shufflelo_epi16(in, 0xc9); + in = _mm_unpacklo_epi64(temp, in); + in = _mm_madd_epi16(in, c2); + in = _mm_packs_epi32(in, zero); + + // Store results + _mm_storel_epi64((__m128i *)output, in); +} + #endif -- 2.40.0