From d749ab6221fd3b313f0ab88e0a8d85d0e4610423 Mon Sep 17 00:00:00 2001 From: James Yu Date: Tue, 17 Dec 2013 19:03:16 +0800 Subject: [PATCH] VP8 for ARMv8 by using NEON intrinsics 03 Add dc_only_idct_add_neon.c - vp8_dc_only_idct_add_neon vpxdec --summary --noblit ../videos/tears_of_steel_1080p.webm Before => After, 13.25 => 13.24 (fps) Change-Id: I5e9e277ec3a3ca67e13c8cc4c324a6fbe8a897fc Signed-off-by: James Yu --- vp8/common/arm/neon/dc_only_idct_add_neon.asm | 54 ------------------- vp8/common/arm/neon/dc_only_idct_add_neon.c | 42 +++++++++++++++ vp8/vp8_common.mk | 2 +- 3 files changed, 43 insertions(+), 55 deletions(-) delete mode 100644 vp8/common/arm/neon/dc_only_idct_add_neon.asm create mode 100644 vp8/common/arm/neon/dc_only_idct_add_neon.c diff --git a/vp8/common/arm/neon/dc_only_idct_add_neon.asm b/vp8/common/arm/neon/dc_only_idct_add_neon.asm deleted file mode 100644 index 79ff02c69..000000000 --- a/vp8/common/arm/neon/dc_only_idct_add_neon.asm +++ /dev/null @@ -1,54 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - - EXPORT |vp8_dc_only_idct_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, -; int pred_stride, unsigned char *dst_ptr, -; int dst_stride) - -; r0 input_dc -; r1 pred_ptr -; r2 pred_stride -; r3 dst_ptr -; sp dst_stride - -|vp8_dc_only_idct_add_neon| PROC - add r0, r0, #4 - asr r0, r0, #3 - ldr r12, [sp] - vdup.16 q0, r0 - - vld1.32 {d2[0]}, [r1], r2 - vld1.32 {d2[1]}, [r1], r2 - vld1.32 {d4[0]}, [r1], r2 - vld1.32 {d4[1]}, [r1] - - vaddw.u8 q1, q0, d2 - vaddw.u8 q2, q0, d4 - - vqmovun.s16 d2, q1 - vqmovun.s16 d4, q2 - - vst1.32 {d2[0]}, [r3], r12 - vst1.32 {d2[1]}, [r3], r12 - vst1.32 {d4[0]}, [r3], r12 - vst1.32 {d4[1]}, [r3] - - bx lr - - ENDP - - END diff --git a/vp8/common/arm/neon/dc_only_idct_add_neon.c b/vp8/common/arm/neon/dc_only_idct_add_neon.c new file mode 100644 index 000000000..ad5f41d7d --- /dev/null +++ b/vp8/common/arm/neon/dc_only_idct_add_neon.c @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +void vp8_dc_only_idct_add_neon( + int16_t input_dc, + unsigned char *pred_ptr, + int pred_stride, + unsigned char *dst_ptr, + int dst_stride) { + int i; + uint16_t a1 = ((input_dc + 4) >> 3); + uint32x2_t d2u32 = vdup_n_u32(0); + uint8x8_t d2u8; + uint16x8_t q1u16; + uint16x8_t qAdd; + + qAdd = vdupq_n_u16(a1); + + for (i = 0; i < 2; i++) { + d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 0); + pred_ptr += pred_stride; + d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 1); + pred_ptr += pred_stride; + + q1u16 = vaddw_u8(qAdd, vreinterpret_u8_u32(d2u32)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); + + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0); + dst_ptr += dst_stride; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1); + dst_ptr += dst_stride; + } +} diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index d1eb44540..4c313df04 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -159,7 +159,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM) # common (neon) -VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dc_only_idct_add_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iwalsh_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfilter_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM) @@ -187,6 +186,7 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance16x16s_neon # common (neon intrinsics) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dc_only_idct_add_neon.c $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.sh)) -- 2.40.0