granicus.if.org Git - libx264/commitdiff
aarch64: implement x264_plane_copy_swap_neon
author Janne Grunau <janne-x264@jannau.net>
Fri, 26 Aug 2016 17:26:56 +0000 (20:26 +0300)
committer Anton Mitrofanov <BugMaster@narod.ru>
Tue, 20 Sep 2016 18:01:57 +0000 (21:01 +0300)
plane_copy_swap_c: 27054
plane_copy_swap_neon: 4152

common/aarch64/mc-a.S
common/aarch64/mc-c.c

index fe0f870447359b72592256e0eaa334cbc2ddddf1..3a99fbed47f99f799ab7d7b1545747e6d87b6717 100644 (file)
@@ -1281,6 +1281,34 @@ function x264_plane_copy_core_neon, export=1
     ret
 endfunc
 
+function x264_plane_copy_swap_core_neon, export=1 // copy h rows, swapping each byte pair; per the C prototype: x0=dst, x1=i_dst, x2=src, x3=i_src, w4=w, w5=h
+    lsl         w4,  w4,  #1                    // w4 = 2*w = row length in bytes (writing w4 also zeroes the top half of x4)
+    sub         x1,  x1,  x4                    // x1 = i_dst - row bytes: the post-incremented stores already advance x0 across the row
+    sub         x3,  x3,  x4                    // x3 = i_src - row bytes, likewise for the post-incremented loads via x2
+1:                                              // per-row loop; the body runs before h is tested, so one row is copied even if h <= 0
+    mov         w8,  w4                         // w8 = bytes left in this row
+    tbz         w4,  #4,  32f                   // bit 4 clear: skip the single 16-byte block (assumes row bytes is a multiple of 16 - TODO confirm against the C wrapper)
+    subs        w8,  w8,  #16                   // account for the 16-byte block; flags are consumed by b.eq below
+    ld1         {v0.16b}, [x2], #16             // load 16 source bytes, src += 16
+    rev16       v0.16b, v0.16b                  // swap the two bytes inside every 16-bit element
+    st1         {v0.16b}, [x0], #16             // store 16 bytes, dst += 16
+    b.eq        0f                              // row was exactly 16 bytes: finished (ld1/rev16/st1 do not touch the flags set by subs)
+32:                                             // 32 bytes per iteration; always executes at least once when reached, even if w8 == 0
+    subs        w8,  w8,  #32                   // account for this iteration; flags are consumed by b.gt below
+    ld1         {v0.16b,v1.16b}, [x2], #32      // load 32 source bytes, src += 32
+    rev16       v0.16b, v0.16b                  // byte-swap the 16-bit elements of the first 16 bytes
+    rev16       v1.16b, v1.16b                  // ... and of the second 16 bytes
+    st1         {v0.16b,v1.16b}, [x0], #32      // store 32 bytes, dst += 32
+    b.gt        32b                             // repeat while bytes remain in this row
+0:                                              // end of row
+    subs        w5,  w5,  #1                    // one row done; flags are consumed by b.gt below (the adds in between do not set flags)
+    add         x2,  x2,  x3                    // src: skip the rest of the stride to the start of the next row
+    add         x0,  x0,  x1                    // dst: skip the rest of the stride to the start of the next row
+    b.gt        1b                              // loop while rows remain
+
+    ret
+endfunc
+
 function x264_plane_copy_deinterleave_neon, export=1
     add         w9,  w6,  #15
     and         w9,  w9,  #0xfffffff0
index 4f93965e10a5dfacd183af79f692f735c3c8b5a4..09794d814833a6ec79c5de8cc9f95404bf1fcb3d 100644 (file)
@@ -51,6 +51,8 @@ void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t
 
 void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst,
                                 pixel *src, intptr_t i_src, int w, int h );
+void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst,
+                                     pixel *src, intptr_t i_src, int w, int h );
 void x264_plane_copy_deinterleave_neon(  pixel *dstu, intptr_t i_dstu,
                                          pixel *dstv, intptr_t i_dstv,
                                          pixel *src,  intptr_t i_src, int w, int h );
@@ -208,6 +210,7 @@ void x264_hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
                             int height, int16_t *buf );
 
 PLANE_COPY(16, neon)
+PLANE_COPY_SWAP(16, neon)
 PLANE_INTERLEAVE(neon)
 #endif // !HIGH_BIT_DEPTH
 
@@ -232,6 +235,7 @@ void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
     pf->copy[PIXEL_4x4]      = x264_mc_copy_w4_neon;
 
     pf->plane_copy                  = x264_plane_copy_neon;
+    pf->plane_copy_swap             = x264_plane_copy_swap_neon;
     pf->plane_copy_deinterleave     = x264_plane_copy_deinterleave_neon;
     pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
     pf->plane_copy_interleave       = x264_plane_copy_interleave_neon;