From 9114f0afdb23d98ac0704832db43a88b4ca4af01 Mon Sep 17 00:00:00 2001 From: James Yu Date: Thu, 30 Jan 2014 11:54:35 +0800 Subject: [PATCH] VP9 common for ARMv8 by using NEON intrinsics 08 Add vp9_idct4x4_1_add_neon.c - vp9_idct4x4_1_add_neon Change-Id: Ieab9af107dbd07a4f9503bc945890c90faccb8ac Signed-off-by: James Yu --- vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c | 48 +++++++++++++++++++ ...eon.asm => vp9_idct4x4_1_add_neon_asm.asm} | 0 vp9/common/vp9_rtcd_defs.pl | 3 +- vp9/vp9_common.mk | 3 +- 4 files changed, 51 insertions(+), 3 deletions(-) create mode 100644 vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c rename vp9/common/arm/neon/{vp9_idct4x4_1_add_neon.asm => vp9_idct4x4_1_add_neon_asm.asm} (100%) diff --git a/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c b/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c new file mode 100644 index 000000000..7c8a930b6 --- /dev/null +++ b/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <arm_neon.h>
+#include "vp9/common/vp9_idct.h"
+
+// DC-only 4x4 inverse transform: derives one residual value from input[0]
+// and adds it, saturating to 0..255, to every byte of the 4x4 block at dest.
+//   input       - coefficient buffer; only input[0] is read.
+//   dest        - first byte of the block's top row; updated in place.
+//   dest_stride - byte offset from one row of dest to the next.
+// input is const-qualified to agree with the vp9_idct4x4_1_add prototype in
+// vp9_rtcd_defs.pl; nothing is written through it.
+void vp9_idct4x4_1_add_neon(const int16_t *input, uint8_t *dest,
+                            int dest_stride) {
+    const int16_t cospi_16_64 = 11585;
+    uint8_t *src = dest;  // read cursor over the rows of dest
+    uint8_t *dst = dest;  // write cursor, trails src by one loop pass
+    uint32x2_t rows = vdup_n_u32(0);
+    uint8x8_t pixels;
+    int16x8_t dc, sum;
+    int16_t out;
+    int i;
+
+    // Multiply the DC term by cospi_16_64 twice, passing each product through
+    // dct_const_round_shift, then apply ROUND_POWER_OF_TWO(out, 4).
+    out = dct_const_round_shift(input[0] * cospi_16_64);
+    out = dct_const_round_shift(out * cospi_16_64);
+    dc = vdupq_n_s16(ROUND_POWER_OF_TWO(out, 4));
+
+    // Each pass packs two 4-byte rows into one vector; two passes = four rows.
+    for (i = 0; i < 2; i++) {
+        rows = vld1_lane_u32((const uint32_t *)src, rows, 0);
+        src += dest_stride;
+        rows = vld1_lane_u32((const uint32_t *)src, rows, 1);
+        src += dest_stride;
+
+        // The widening add wraps modulo 2^16; read back as signed, the sum
+        // is then narrowed to bytes with unsigned saturation.
+        sum = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(dc),
+                                             vreinterpret_u8_u32(rows)));
+        pixels = vqmovun_s16(sum);
+
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(pixels), 0);
+        dst += dest_stride;
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(pixels), 1);
+        dst += dest_stride;
+    }
+}
diff --git a/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm b/vp9/common/arm/neon/vp9_idct4x4_1_add_neon_asm.asm
similarity index 100%
rename from vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm
rename to vp9/common/arm/neon/vp9_idct4x4_1_add_neon_asm.asm
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 9f9b4f13c..7dc6714ea 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -422,8 +422,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vp9_iwht4x4_16_add/;
 } else {
   add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vp9_idct4x4_1_add sse2 neon_asm dspr2/;
-  $vp9_idct4x4_1_add_neon_asm=vp9_idct4x4_1_add_neon;
+  specialize qw/vp9_idct4x4_1_add sse2 neon dspr2/;
 
   add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
   specialize qw/vp9_idct4x4_16_add sse2 neon_asm dspr2/;
diff --git
a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 8d83ca069..7798a6ca1 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -134,7 +134,6 @@ endif VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct16x16_neon.c VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct4x4_1_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct4x4_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct8x8_1_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct8x8_add_neon$(ASM) @@ -156,6 +155,7 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_avg_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_neon.c VP9_COMMON_SRCS-yes += common/arm/neon/vp9_copy_neon_asm$(ASM) +VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c else @@ -165,6 +165,7 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_avg_neon.c VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_neon.c VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_neon.c VP9_COMMON_SRCS-yes += common/arm/neon/vp9_copy_neon.c +VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon.c VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon.c VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c endif # HAVE_NEON -- 2.40.0