From 5db8b6b93aa91079ab785b9b49413625430536fd Mon Sep 17 00:00:00 2001
From: =?utf8?q?Martin=20Storsj=C3=B6?=
Date: Fri, 28 Aug 2015 00:15:01 +0300
Subject: [PATCH] arm: Implement x264_plane_copy_neon

checkasm timing     Cortex-A7      A8     A9
plane_copy_c            13124   10925   9106
plane_copy_neon          7349    5103   8945
---
 common/arm/mc-a.S | 32 ++++++++++++++++++++++++++++++++
 common/arm/mc-c.c |  3 +++
 2 files changed, 35 insertions(+)

diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
index 36ce86fa..5e0c117d 100644
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
@@ -6,6 +6,7 @@
  * Authors: David Conrad
  *          Mans Rullgard
  *          Stefan Groenroos
+ *          Janne Grunau
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -1461,6 +1462,37 @@ function x264_load_deinterleave_chroma_fenc_neon
     bx              lr
 endfunc
 
+function x264_plane_copy_neon               // copies h rows from src to dst; per the C prototype in mc-c.c and AAPCS: r0=dst, r1=i_dst, r2=src, r3=i_src, w and h on the stack
+    push            {r4,lr}                 // save r4 and lr; the stack arguments now start at sp+8
+    ldr             r4,  [sp, #8]           // r4 = w (first stack argument; used below as a byte count)
+    ldr             lr,  [sp, #12]          // lr = h, the row counter
+    add             r12, r4,  #15
+    bic             r4,  r12, #15           // r4 = w rounded up to a multiple of 16; NOTE(review): up to 15 bytes past w are read and written per row
+    sub             r1,  r1,  r4            // r1 = i_dst - padded width (the post-incremented stores already advance r0 by the padded width)
+    sub             r3,  r3,  r4            // r3 = i_src - padded width
+1:                                          // start of one row
+    mov             r12, r4                 // r12 = bytes left to copy in this row
+16:
+    tst             r12, #16
+    beq             32f                     // bit 4 clear: count is a multiple of 32, skip the single 16-byte copy
+    subs            r12, r12, #16
+    vld1.8          {q0}, [r2]!             // copy 16 bytes, advancing src and dst
+    vst1.8          {q0}, [r0]!
+    beq             0f                      // flags still from subs: padded row was exactly 16 bytes
+32:
+    subs            r12, r12, #32           // NOTE(review): counter is tested after the copy, so a padded width of 0 still copies 32 bytes
+    vld1.8          {q0, q1}, [r2]!         // copy 32 bytes, advancing src and dst
+    vst1.8          {q0, q1}, [r0]!
+    bgt             32b                     // flags still from subs: loop while bytes remain in this row
+0:
+    subs            lr,  lr,  #1            // one row done; NOTE(review): tested after the first row, so h == 0 still copies one row
+    add             r2,  r2,  r3            // step src to the start of the next row
+    add             r0,  r0,  r1            // step dst to the start of the next row
+    bgt             1b                      // flags still from subs (add does not set them): loop while rows remain
+
+    pop             {r4,pc}                 // restore r4 and return by loading the saved lr into pc
+endfunc
+
 function x264_plane_copy_deinterleave_neon
     push            {r4-r7, lr}
     ldrd            r6, r7, [sp, #28]
diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
index 3fa18ec8..dd86fb24 100644
--- a/common/arm/mc-c.c
+++ b/common/arm/mc-c.c
@@ -47,6 +47,8 @@ void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t
 void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 
+void x264_plane_copy_neon( pixel *dst, intptr_t i_dst,
+                           pixel *src, intptr_t i_src, int w, int h );
 void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
                                         pixel *dstv, intptr_t i_dstv,
                                         pixel *src, intptr_t i_src, int w, int h );
@@ -244,6 +246,7 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
     pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon;
     pf->copy[PIXEL_4x4] = x264_mc_copy_w4_neon;
 
+    pf->plane_copy = x264_plane_copy_neon;
     pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon;
     pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
     pf->plane_copy_interleave = x264_plane_copy_interleave_neon;
-- 
2.40.0