From: hkuang <hkuang@google.com>
Date: Mon, 26 Aug 2013 19:39:12 +0000 (-0700)
Subject: Add neon optimize vp9_short_idct4x4_1_add.
X-Git-Tag: v1.3.0~524^2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=69384f4fadf73cb6f2427f91134c9a54a5267140;p=libvpx

Add neon optimize vp9_short_idct4x4_1_add.

Change-Id: I6ecb5c4a1a472feb8e84e9f3352b536d5e28a4a5
---

diff --git a/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm
new file mode 100644
index 000000000..869ee5f3f
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm
@@ -0,0 +1,68 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_short_idct4x4_1_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp9_short_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
+;                                  int dest_stride)
+; Uses only input[0]: computes a1 and adds it to a 4x4 byte block at dest.
+; r0  int16_t *input      (only input[0] is loaded)
+; r1  uint8_t *dest       (4 rows of 4 bytes, dest_stride bytes apart)
+; r2  int dest_stride
+
+|vp9_short_idct4x4_1_add_neon| PROC
+    ldrsh            r0, [r0]                  ; r0 = input[0], sign-extended
+
+    ; generate cospi_16_64 = 11585
+    mov              r12, #0x2d00              ; built in two steps:
+    add              r12, #0x41                ; 0x2d00 + 0x41 = 0x2d41 = 11585
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul              r0, r0, r12               ; input[0] * cospi_16_64
+    add              r0, r0, #0x2000           ; + (1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12               ; out * cospi_16_64
+    mov              r12, r1                   ; save dest (r12 is free again)
+    add              r0, r0, #0x2000           ; + (1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 4)
+    add              r0, r0, #8                ; + (1 << ((4) - 1))
+    asr              r0, r0, #4                ; >> 4
+
+    vdup.s16         q0, r0                    ; duplicate a1 into all 8 lanes
+
+    vld1.32          {d2[0]}, [r1], r2         ; row 0 (4 bytes), r1 += stride
+    vld1.32          {d2[1]}, [r1], r2         ; row 1
+    vld1.32          {d4[0]}, [r1], r2         ; row 2
+    vld1.32          {d4[1]}, [r1]             ; row 3, no post-increment
+
+    vaddw.u8         q8, q0, d2                ; dest[x] + a1 (rows 0-1)
+    vaddw.u8         q9, q0, d4                ; dest[x] + a1 (rows 2-3)
+
+    vqmovun.s16      d6, q8                    ; clip_pixel: saturate to u8
+    vqmovun.s16      d7, q9                    ; clip_pixel for rows 2-3
+
+    vst1.32          {d6[0]}, [r12], r2        ; store row 0 via saved dest
+    vst1.32          {d6[1]}, [r12], r2        ; store row 1
+    vst1.32          {d7[0]}, [r12], r2        ; store row 2
+    vst1.32          {d7[1]}, [r12]            ; store row 3
+
+    bx               lr                        ; return to caller
+    ENDP             ; |vp9_short_idct4x4_1_add_neon|
+
+    END
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 2979daf6d..30c1b26d0 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -295,7 +295,7 @@ specialize vp9_convolve8_avg_vert ssse3 neon
 # dct
 #
 prototype void vp9_short_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct4x4_1_add sse2
+specialize vp9_short_idct4x4_1_add sse2 neon
 
 prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct4x4_add sse2 neon
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index c6f398101..d5692efb1 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -96,6 +96,7 @@ VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_avg_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct4x4_1_add_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct4x4_add_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_add_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_add_neon$(ASM)