From: James Yu Date: Tue, 17 Dec 2013 14:11:35 +0000 (+0800) Subject: VP8 for ARMv8 by using NEON intrinsics 15 X-Git-Tag: v1.4.0~1671^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=727af7cebe3698b8493ba6c1360b0a6606c310fb;p=libvpx VP8 for ARMv8 by using NEON intrinsics 15 Add idct_dequant_0_2x_neon.c - idct_dequant_0_2x_neon Change-Id: I8e129172ef1b2517cf72ff267788921f1a792586 Signed-off-by: James Yu --- diff --git a/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm b/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm deleted file mode 100644 index 3a3921081..000000000 --- a/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm +++ /dev/null @@ -1,81 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. 
-; - - - EXPORT |idct_dequant_0_2x_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;void idct_dequant_0_2x_neon(short *q, short dq, -; unsigned char *dst, int stride); -; r0 *q -; r1 dq -; r2 *dst -; r3 stride -|idct_dequant_0_2x_neon| PROC - push {r4, r5} - vpush {d8-d15} - - add r12, r2, #4 - vld1.32 {d2[0]}, [r2], r3 - vld1.32 {d8[0]}, [r12], r3 - vld1.32 {d2[1]}, [r2], r3 - vld1.32 {d8[1]}, [r12], r3 - vld1.32 {d4[0]}, [r2], r3 - vld1.32 {d10[0]}, [r12], r3 - vld1.32 {d4[1]}, [r2], r3 - vld1.32 {d10[1]}, [r12], r3 - - ldrh r12, [r0] ; lo q - ldrh r4, [r0, #32] ; hi q - mov r5, #0 - strh r5, [r0] - strh r5, [r0, #32] - - sxth r12, r12 ; lo - mul r0, r12, r1 - add r0, r0, #4 - asr r0, r0, #3 - vdup.16 q0, r0 - sxth r4, r4 ; hi - mul r0, r4, r1 - add r0, r0, #4 - asr r0, r0, #3 - vdup.16 q3, r0 - - vaddw.u8 q1, q0, d2 ; lo - vaddw.u8 q2, q0, d4 - vaddw.u8 q4, q3, d8 ; hi - vaddw.u8 q5, q3, d10 - - sub r2, r2, r3, lsl #2 ; dst - 4*stride - add r0, r2, #4 - - vqmovun.s16 d2, q1 ; lo - vqmovun.s16 d4, q2 - vqmovun.s16 d8, q4 ; hi - vqmovun.s16 d10, q5 - - vst1.32 {d2[0]}, [r2], r3 ; lo - vst1.32 {d8[0]}, [r0], r3 ; hi - vst1.32 {d2[1]}, [r2], r3 - vst1.32 {d8[1]}, [r0], r3 - vst1.32 {d4[0]}, [r2], r3 - vst1.32 {d10[0]}, [r0], r3 - vst1.32 {d4[1]}, [r2] - vst1.32 {d10[1]}, [r0] - - vpop {d8-d15} - pop {r4, r5} - bx lr - - ENDP ; |idct_dequant_0_2x_neon| - END diff --git a/vp8/common/arm/neon/idct_dequant_0_2x_neon.c b/vp8/common/arm/neon/idct_dequant_0_2x_neon.c new file mode 100644 index 000000000..967c32280 --- /dev/null +++ b/vp8/common/arm/neon/idct_dequant_0_2x_neon.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +void idct_dequant_0_2x_neon( + int16_t *q, + int16_t dq, + unsigned char *dst, + int stride) { + unsigned char *dst0; + int i, a0, a1; + int16x8x2_t q2Add; + int32x2_t d2s32, d4s32; + uint8x8_t d2u8, d4u8; + uint16x8_t q1u16, q2u16; + + a0 = ((q[0] * dq) + 4) >> 3; + a1 = ((q[16] * dq) + 4) >> 3; + q[0] = q[16] = 0; + q2Add.val[0] = vdupq_n_s16((int16_t)a0); + q2Add.val[1] = vdupq_n_s16((int16_t)a1); + + for (i = 0; i < 2; i++, dst += 4) { + dst0 = dst; + d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0); + dst0 += stride; + d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1); + dst0 += stride; + d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0); + dst0 += stride; + d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1); + + q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), + vreinterpret_u8_s32(d2s32)); + q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), + vreinterpret_u8_s32(d4s32)); + + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); + d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); + + d2s32 = vreinterpret_s32_u8(d2u8); + d4s32 = vreinterpret_s32_u8(d4u8); + + dst0 = dst; + vst1_lane_s32((int32_t *)dst0, d2s32, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d2s32, 1); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d4s32, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d4s32, 1); + } + return; +} diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 6a8db2b48..ea6226b2e 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -160,7 +160,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_ # common (neon) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/buildintrapredictorsmby_neon$(ASM) -VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_0_2x_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_blk_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += 
common/arm/neon/variance_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance8x8_neon$(ASM) @@ -182,6 +181,7 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_0_2x_neon.c $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))