From 3c4e9e341f9f5f9ed875b0a974f91bd1d2931e0f Mon Sep 17 00:00:00 2001
From: Dmitry Kovalev
Date: Tue, 1 Oct 2013 18:34:36 -0700
Subject: [PATCH] Adding SSE2 optimized vp9_short_idct32x32_1_add function.

Change-Id: I4b1c6bb9ff615f5872b96ed07dbf0f5e18e63643
---
 vp9/common/vp9_rtcd_defs.sh           |  2 +-
 vp9/common/x86/vp9_idct_intrin_sse2.c | 48 +++++++++++++++++++++++++++
 2 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 8dacdd00d..225305b19 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -311,7 +311,7 @@ prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_
 specialize vp9_short_idct32x32_add sse2 neon
 
 prototype void vp9_short_idct32x32_1_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct32x32_1_add
+specialize vp9_short_idct32x32_1_add sse2
 
 prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
 specialize vp9_short_iht4x4_add sse2 neon
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index 9e9d632b3..d00993c47 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -3549,3 +3549,51 @@ void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
     }
   }
 }  //NOLINT
+// Reads only input[0]; passes the derived value and dest to RECON_AND_STORE.
+void vp9_short_idct32x32_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+  __m128i dc_value;
+  const __m128i zero = _mm_setzero_si128();  // presumably read by the macro
+  int a, i;
+  // input[0] * cospi_16_64 through dct_const_round_shift, applied twice.
+  a = dct_const_round_shift(input[0] * cospi_16_64);
+  a = dct_const_round_shift(a * cospi_16_64);
+  a = ROUND_POWER_OF_TWO(a, 6);
+  // _mm_set1_epi16 copies a into all eight 16-bit lanes of dc_value.
+  dc_value = _mm_set1_epi16(a);
+  // 4 passes of 32 RECON_AND_STORE calls; macro body is outside this patch.
+  for (i = 0; i < 4; ++i) {
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    dest += 8 - (stride * 32);  // assumes each macro call did dest += stride
+  }
+}
-- 
2.40.0