granicus.if.org Git - libx264/commitdiff
aarch64: {plane_copy,memcpy_aligned,memzero_aligned}_neon
author: Janne Grunau <janne-x264@jannau.net>
Fri, 31 Oct 2014 13:49:04 +0000 (14:49 +0100)
committer: Anton Mitrofanov <BugMaster@narod.ru>
Tue, 16 Dec 2014 17:40:09 +0000 (20:40 +0300)
2-3 times faster than C.

common/aarch64/mc-a.S
common/aarch64/mc-c.c

index 84074516832a59c5f9331a542cf0d530d0f7090d..324ef16939b8b05377f4e5c357f7440072011c8d 100644 (file)
@@ -1253,6 +1253,34 @@ load_deinterleave_chroma:
     ret
 endfunc
 
+// void x264_plane_copy_neon( pixel *dst, intptr_t i_dst,
+//                            pixel *src, intptr_t i_src, int w, int h )
+// In:  x0 = dst, x1 = i_dst, x2 = src, x3 = i_src, w4 = w, w5 = h
+//
+// Copies h rows from src to dst. w is treated as a byte count and is rounded
+// up to the next multiple of 16, so up to 15 bytes beyond w are read and
+// written in every row. Both the row loop and the 32-byte loop test their
+// counter after copying: a row is never empty and at least one row is always
+// processed, so callers must pass w > 0 and h > 0.
+//
+// Clobbers: x0-x5, x8, q0, q1, flags.
+function x264_plane_copy_neon, export=1
+    // w is passed as a 32-bit int, so bits 63:32 of x4 are unspecified on
+    // entry. Round up with 32-bit operations: writing w4 zeroes the upper
+    // half of x4, which makes the 64-bit stride arithmetic below safe.
+    add         w8,  w4,  #15
+    and         w4,  w8,  #0xfffffff0   // w4 = width rounded up to a multiple of 16
+    sub         x1,  x1,  x4            // x1 = dst stride minus bytes copied per row
+    sub         x3,  x3,  x4            // x3 = src stride minus bytes copied per row
+1:
+    mov         w8,  w4                 // w8 = bytes left in the current row
+16:
+    tst         w8,  #16
+    b.eq        32f                     // bit 4 clear: row is a multiple of 32 bytes
+    subs        w8,  w8,  #16
+    ldr         q0,  [x2], #16
+    str         q0,  [x0], #16
+    b.eq        0f                      // flags still from subs: the row was exactly 16 bytes
+32:
+    subs        w8,  w8,  #32
+    ldp         q0,  q1,  [x2], #32
+    stp         q0,  q1,  [x0], #32
+    b.gt        32b
+0:
+    subs        w5,  w5,  #1            // one row done
+    add         x2,  x2,  x3            // step both pointers to the start of the next row
+    add         x0,  x0,  x1
+    b.gt        1b
+
+    ret
+endfunc
+
 function x264_plane_copy_deinterleave_neon, export=1
     add         w9,  w6,  #15
     and         w9,  w9,  #0xfffffff0
@@ -1601,3 +1629,41 @@ function x264_mbtree_propagate_list_internal_neon, export=1
     b.ge        8b
     ret
 endfunc
+
+// x264_memcpy_aligned_neon
+// In:  x0 = destination, x1 = source, x2 = byte count
+//
+// Copies the byte count in three stages: one 16-byte transfer if bit 4 of
+// the count is set, one 32-byte transfer if bit 5 is set, then 64 bytes per
+// loop iteration until the count is used up. The copy is therefore exact
+// only when the count is a multiple of 16; any other remainder is handled by
+// a full 64-byte iteration.
+//
+// On return x0 and x1 have been advanced past the copied data.
+// Clobbers: x0-x2, q0-q3 (flags are written by the 64-byte loop).
+function x264_memcpy_aligned_neon, export=1
+    tbz         x2,  #4,  2f            // count & 16 == 0: no single 16-byte chunk
+    ldr         q0,  [x1], #16
+    sub         x2,  x2,  #16
+    str         q0,  [x0], #16
+2:
+    tbz         x2,  #5,  4f            // count & 32 == 0: no single 32-byte chunk
+    ldp         q0,  q1,  [x1], #32
+    sub         x2,  x2,  #32
+    stp         q0,  q1,  [x0], #32
+4:
+    cbz         x2,  9f                 // nothing left for the 64-byte loop
+6:
+    ldp         q0,  q1,  [x1, #32]     // upper half first, then lower half with post-increment
+    ldp         q2,  q3,  [x1], #64
+    stp         q0,  q1,  [x0, #32]
+    stp         q2,  q3,  [x0], #64
+    subs        x2,  x2,  #64           // loads/stores leave the flags alone for b.gt
+    b.gt        6b
+9:
+    ret
+endfunc
+
+// x264_memzero_aligned_neon
+// In:  x0 = destination, x1 = byte count
+//
+// Stores zeros to the destination 128 bytes per iteration. The counter is
+// tested after the stores, so at least 128 bytes are always written and the
+// total written is the byte count rounded up to a multiple of 128.
+//
+// On return x0 has been advanced past the zeroed area.
+// Clobbers: x0, x1, q0, q1, flags.
+function x264_memzero_aligned_neon, export=1
+    movi        v1.16b,  #0
+    movi        v0.16b,  #0
+128:
+    stp         q0,  q1,  [x0, #96]     // fill the block from its top 32 bytes downwards
+    stp         q0,  q1,  [x0, #64]
+    stp         q0,  q1,  [x0, #32]
+    stp         q0,  q1,  [x0], #128    // lowest 32 bytes, then step to the next block
+    subs        x1,  x1,  #128
+    b.gt        128b
+    ret
+endfunc
index 96582d455a6aaf62b4a453c0f501f75b9d4d37de..25ebea49e8f1b7d0e65c98e54ed74e09693a3d1c 100644 (file)
@@ -49,6 +49,8 @@ void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t
 void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 
+/* Plane copy implemented in assembly: x264_plane_copy_neon in common/aarch64/mc-a.S. */
+void x264_plane_copy_neon( pixel *dst, intptr_t i_dst,
+                           pixel *src, intptr_t i_src, int w, int h );
 void x264_plane_copy_deinterleave_neon(  pixel *dstu, intptr_t i_dstu,
                                          pixel *dstv, intptr_t i_dstv,
                                          pixel *src,  intptr_t i_src, int w, int h );
@@ -304,6 +306,7 @@ void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
     pf->copy[PIXEL_8x8]      = x264_mc_copy_w8_neon;
     pf->copy[PIXEL_4x4]      = x264_mc_copy_w4_neon;
 
+    pf->plane_copy                  = x264_plane_copy_neon;
     pf->plane_copy_deinterleave     = x264_plane_copy_deinterleave_neon;
     pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
     pf->plane_copy_interleave       = x264_plane_copy_interleave_neon;
@@ -340,5 +343,8 @@ void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
 
     pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
     pf->mbtree_propagate_list = x264_mbtree_propagate_list_neon;
+
+    pf->memcpy_aligned  = x264_memcpy_aligned_neon;
+    pf->memzero_aligned = x264_memzero_aligned_neon;
 #endif // !HIGH_BIT_DEPTH
 }