From: Janne Grunau
Date: Fri, 31 Oct 2014 13:49:04 +0000 (+0100)
Subject: aarch64: {plane_copy,memcpy_aligned,memzero_aligned}_neon
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=f13573e490d9f18bbcb10409fb09ec25e477035e;p=libx264

aarch64: {plane_copy,memcpy_aligned,memzero_aligned}_neon

2-3 times faster than C.
---

diff --git a/common/aarch64/mc-a.S b/common/aarch64/mc-a.S
index 84074516..324ef169 100644
--- a/common/aarch64/mc-a.S
+++ b/common/aarch64/mc-a.S
@@ -1253,6 +1253,34 @@ load_deinterleave_chroma:
     ret
 endfunc
 
+function x264_plane_copy_neon, export=1 // x0=dst x1=i_dst x2=src x3=i_src w4=w w5=h (see prototype in mc-c.c)
+    add     w8, w4, #15             // w is a 32-bit int: the upper half of x4 is unspecified on entry, so round up in w regs
+    and     w4, w8, #0xfffffff0     // w rounded up to a multiple of 16; the 32-bit write zero-extends into x4
+    sub     x1, x1, x4              // x1 = i_dst minus the bytes written per row (post-increments cover the rest)
+    sub     x3, x3, x4              // x3 = i_src minus the bytes read per row
+1:                                  // per-row loop; runs at least once (h is tested after the copy)
+    mov     w8, w4                  // w8 = bytes left in this row
+16:
+    tst     w8, #16                 // odd number of 16-byte chunks?
+    b.eq    32f
+    subs    w8, w8, #16             // copy the single odd 16-byte chunk first
+    ldr     q0, [x2], #16
+    str     q0, [x0], #16
+    b.eq    0f                      // flags still from subs: row finished if nothing is left
+32:                                 // copy 32 bytes per iteration; runs at least once when reached
+    subs    w8, w8, #32
+    ldp     q0, q1, [x2], #32
+    stp     q0, q1, [x0], #32
+    b.gt    32b                     // ldp/stp leave the flags from subs untouched
+0:
+    subs    w5, w5, #1              // one row done
+    add     x2, x2, x3              // advance src to the start of the next row
+    add     x0, x0, x1              // advance dst to the start of the next row
+    b.gt    1b
+
+    ret
+endfunc
+
 function x264_plane_copy_deinterleave_neon, export=1
     add     w9, w6, #15
     and     w9, w9, #0xfffffff0
@@ -1601,3 +1629,41 @@ function x264_mbtree_propagate_list_internal_neon, export=1
     b.ge    8b
     ret
 endfunc
+
+function x264_memcpy_aligned_neon, export=1 // x0 is stored through, x1 is loaded from, x2 is the byte count
+    tst     x2, #16                 // bit 4 of the count set: copy one 16-byte chunk
+    b.eq    32f
+    sub     x2, x2, #16
+    ldr     q0, [x1], #16
+    str     q0, [x0], #16
+32:
+    tst     x2, #32                 // bit 5 of the count set: copy one 32-byte chunk
+    b.eq    640f
+    sub     x2, x2, #32
+    ldp     q0, q1, [x1], #32
+    stp     q0, q1, [x0], #32
+640:
+    cbz     x2, 1f                  // nothing left after the 16/32-byte pieces
+64:                                 // 64 bytes per iteration; NOTE(review): a count that is not a multiple of 16 would overrun here
+    subs    x2, x2, #64
+    ldp     q0, q1, [x1, #32]       // upper 32 bytes first, no writeback
+    ldp     q2, q3, [x1], #64       // lower 32 bytes, then advance x1 by 64
+    stp     q0, q1, [x0, #32]
+    stp     q2, q3, [x0], #64
+    b.gt    64b                     // flags from subs; loads/stores do not touch them
+1:
+    ret
+endfunc
+
+function x264_memzero_aligned_neon, export=1 // x0 is stored through, x1 is the byte count
+    movi    v0.16b, #0
+    movi    v1.16b, #0
+1:                                  // 128 bytes per iteration, tested after the stores: always writes at least 128 bytes
+    subs    x1, x1, #128
+    stp     q0, q1, [x0, #96]
+    stp     q0, q1, [x0, #64]
+    stp     q0, q1, [x0, #32]
+    stp     q0, q1, [x0], 128       // lowest 32 bytes, then advance x0 by 128
+    b.gt    1b                      // a count that is not a multiple of 128 is effectively rounded up
+    ret
+endfunc
diff --git a/common/aarch64/mc-c.c b/common/aarch64/mc-c.c
index 96582d45..25ebea49 100644
--- a/common/aarch64/mc-c.c
+++ b/common/aarch64/mc-c.c
@@ -49,6 +49,8 @@ void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t
 void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 
+void x264_plane_copy_neon( pixel *dst, intptr_t i_dst,
+                           pixel *src, intptr_t i_src, int w, int h );
 void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
                                         pixel *dstv, intptr_t i_dstv,
                                         pixel *src, intptr_t i_src, int w, int h );
@@ -304,6 +306,7 @@ void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
     pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon;
     pf->copy[PIXEL_4x4] = x264_mc_copy_w4_neon;
 
+    pf->plane_copy = x264_plane_copy_neon;
     pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon;
     pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
     pf->plane_copy_interleave = x264_plane_copy_interleave_neon;
@@ -340,5 +343,8 @@ void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
 
     pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
     pf->mbtree_propagate_list = x264_mbtree_propagate_list_neon;
+
+    pf->memcpy_aligned = x264_memcpy_aligned_neon;
+    pf->memzero_aligned = x264_memzero_aligned_neon;
 #endif // !HIGH_BIT_DEPTH
 }