From d12757f5c69a7c69bdf8035282348363334ab2f3 Mon Sep 17 00:00:00 2001 From: James Yu Date: Tue, 21 Jan 2014 17:23:27 +0800 Subject: [PATCH] VP9 common for ARMv8 by using NEON intrinsics 03 Add vp9_copy_neon.c - vp9_convolve_copy_neon Change-Id: I291fc5423d06240876411bbceab03eae5ef585be Signed-off-by: James Yu --- test/convolve_test.cc | 2 +- vp9/common/arm/neon/vp9_copy_neon.c | 92 +++++++++++++++++++ ...p9_copy_neon.asm => vp9_copy_neon_asm.asm} | 0 vp9/common/vp9_rtcd_defs.pl | 3 +- vp9/vp9_common.mk | 3 +- 5 files changed, 96 insertions(+), 4 deletions(-) create mode 100644 vp9/common/arm/neon/vp9_copy_neon.c rename vp9/common/arm/neon/{vp9_copy_neon.asm => vp9_copy_neon_asm.asm} (100%) diff --git a/test/convolve_test.cc b/test/convolve_test.cc index 12ac4c3e9..44d573909 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -1767,7 +1767,7 @@ const ConvolveFunctions convolve8_neon( vp9_convolve8_neon, vp9_convolve8_avg_neon, 0); #else // HAVE_NEON const ConvolveFunctions convolve8_neon( - vp9_convolve_copy_c, vp9_convolve_avg_neon, + vp9_convolve_copy_neon, vp9_convolve_avg_neon, vp9_convolve8_horiz_c, vp9_convolve8_avg_horiz_c, vp9_convolve8_vert_c, vp9_convolve8_avg_vert_c, vp9_convolve8_c, vp9_convolve8_avg_c, 0); diff --git a/vp9/common/arm/neon/vp9_copy_neon.c b/vp9/common/arm/neon/vp9_copy_neon.c new file mode 100644 index 000000000..f334abe11 --- /dev/null +++ b/vp9/common/arm/neon/vp9_copy_neon.c @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +void vp9_convolve_copy_neon( + const uint8_t *src, // r0 + ptrdiff_t src_stride, // r1 + uint8_t *dst, // r2 + ptrdiff_t dst_stride, // r3 + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, + int h) { + uint8x8_t d0u8, d2u8; + uint8x16_t q0u8, q1u8, q2u8, q3u8; + (void)filter_x; (void)filter_x_stride; + (void)filter_y; (void)filter_y_stride; + + if (w > 32) { // copy64 + for (; h > 0; h--) { + q0u8 = vld1q_u8(src); + q1u8 = vld1q_u8(src + 16); + q2u8 = vld1q_u8(src + 32); + q3u8 = vld1q_u8(src + 48); + src += src_stride; + + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q1u8); + vst1q_u8(dst + 32, q2u8); + vst1q_u8(dst + 48, q3u8); + dst += dst_stride; + } + } else if (w == 32) { // copy32 + for (; h > 0; h -= 2) { + q0u8 = vld1q_u8(src); + q1u8 = vld1q_u8(src + 16); + src += src_stride; + q2u8 = vld1q_u8(src); + q3u8 = vld1q_u8(src + 16); + src += src_stride; + + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q1u8); + dst += dst_stride; + vst1q_u8(dst, q2u8); + vst1q_u8(dst + 16, q3u8); + dst += dst_stride; + } + } else if (w > 8) { // copy16 + for (; h > 0; h -= 2) { + q0u8 = vld1q_u8(src); + src += src_stride; + q1u8 = vld1q_u8(src); + src += src_stride; + + vst1q_u8(dst, q0u8); + dst += dst_stride; + vst1q_u8(dst, q1u8); + dst += dst_stride; + } + } else if (w == 8) { // copy8 + for (; h > 0; h -= 2) { + d0u8 = vld1_u8(src); + src += src_stride; + d2u8 = vld1_u8(src); + src += src_stride; + + vst1_u8(dst, d0u8); + dst += dst_stride; + vst1_u8(dst, d2u8); + dst += dst_stride; + } + } else { // copy4 + for (; h > 0; h--) { + *(uint32_t *)dst = *(const uint32_t *)src; + src += src_stride; + dst += dst_stride; + } + } + return; +} diff --git a/vp9/common/arm/neon/vp9_copy_neon.asm b/vp9/common/arm/neon/vp9_copy_neon_asm.asm similarity index 100% rename from vp9/common/arm/neon/vp9_copy_neon.asm rename to vp9/common/arm/neon/vp9_copy_neon_asm.asm diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 8248b5cec..b9d9627bf 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -289,8 +289,7 @@ $vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt; # Sub Pixel Filters # add_proto qw/void vp9_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve_copy neon_asm dspr2/, "$sse2_x86inc"; -$vp9_convolve_copy_neon_asm=vp9_convolve_copy_neon; +specialize qw/vp9_convolve_copy neon dspr2/, "$sse2_x86inc"; add_proto qw/void vp9_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; specialize qw/vp9_convolve_avg neon dspr2/, "$sse2_x86inc"; diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 33bb56509..79ccad52d 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -148,7 +148,6 @@ VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct32x32_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht4x4_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht8x8_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_mb_lpf_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_copy_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_reconintra_neon$(ASM) @@ -156,11 +155,13 @@ VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_reconintra_neon$(ASM) # prefer assembly. ifeq ($(HAVE_NEON_ASM), yes) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_avg_neon_asm$(ASM) +VP9_COMMON_SRCS-yes += common/arm/neon/vp9_copy_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c else ifeq ($(HAVE_NEON), yes) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_avg_neon.c +VP9_COMMON_SRCS-yes += common/arm/neon/vp9_copy_neon.c VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon.c VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c endif # HAVE_NEON -- 2.40.0