From: Janne Grunau Date: Fri, 26 Aug 2016 17:26:56 +0000 (+0300) Subject: aarch64: implement x264_plane_copy_swap_neon X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=dc0fe73636d34baeb3a64918b52db64d2a9e83bb;p=libx264 aarch64: implement x264_plane_copy_swap_neon plane_copy_swap_c: 27054 plane_copy_swap_neon: 4152 --- diff --git a/common/aarch64/mc-a.S b/common/aarch64/mc-a.S index fe0f8704..3a99fbed 100644 --- a/common/aarch64/mc-a.S +++ b/common/aarch64/mc-a.S @@ -1281,6 +1281,34 @@ function x264_plane_copy_core_neon, export=1 ret endfunc +function x264_plane_copy_swap_core_neon, export=1 + lsl w4, w4, #1 + sub x1, x1, x4 + sub x3, x3, x4 +1: + mov w8, w4 + tbz w4, #4, 32f + subs w8, w8, #16 + ld1 {v0.16b}, [x2], #16 + rev16 v0.16b, v0.16b + st1 {v0.16b}, [x0], #16 + b.eq 0f +32: + subs w8, w8, #32 + ld1 {v0.16b,v1.16b}, [x2], #32 + rev16 v0.16b, v0.16b + rev16 v1.16b, v1.16b + st1 {v0.16b,v1.16b}, [x0], #32 + b.gt 32b +0: + subs w5, w5, #1 + add x2, x2, x3 + add x0, x0, x1 + b.gt 1b + + ret +endfunc + function x264_plane_copy_deinterleave_neon, export=1 add w9, w6, #15 and w9, w9, #0xfffffff0 diff --git a/common/aarch64/mc-c.c b/common/aarch64/mc-c.c index 4f93965e..09794d81 100644 --- a/common/aarch64/mc-c.c +++ b/common/aarch64/mc-c.c @@ -51,6 +51,8 @@ void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h ); +void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst, + pixel *src, intptr_t i_src, int w, int h ); void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu, pixel *dstv, intptr_t i_dstv, pixel *src, intptr_t i_src, int w, int h ); @@ -208,6 +210,7 @@ void x264_hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, int height, int16_t *buf ); PLANE_COPY(16, neon) +PLANE_COPY_SWAP(16, neon) PLANE_INTERLEAVE(neon) #endif // !HIGH_BIT_DEPTH @@ -232,6 +235,7 @@ void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf ) pf->copy[PIXEL_4x4] = x264_mc_copy_w4_neon; pf->plane_copy = x264_plane_copy_neon; + pf->plane_copy_swap = x264_plane_copy_swap_neon; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon; pf->plane_copy_interleave = x264_plane_copy_interleave_neon;