]> granicus.if.org Git - libx264/commitdiff
arm: Add asm for mbtree fixed point conversion
author Janne Grunau <janne-x264@jannau.net>
Sun, 24 Apr 2016 12:38:56 +0000 (14:38 +0200)
committer Henrik Gramner <henrik@gramner.com>
Mon, 13 Jun 2016 20:07:00 +0000 (22:07 +0200)
7-8 times faster on a cortex-a53 vs. gcc-5.3.

mbtree_fix8_pack_c: 44114
mbtree_fix8_pack_neon: 5805
mbtree_fix8_unpack_c: 38924
mbtree_fix8_unpack_neon: 4870

common/arm/mc-a.S
common/arm/mc-c.c

index 1dbd498961f195ed886851b2bcf4926eb0c102be..76295cd49f0a6180204cd2ece43cb6b524c9b4eb 100644 (file)
@@ -1880,3 +1880,60 @@ function x264_mbtree_propagate_list_internal_neon
     bge             8b
     bx              lr
 endfunc
+
+@ void mbtree_fix8_pack( int16_t *dst, float *src, int count )
+@
+@ Converts count floats at src to signed 16-bit fixed point with 8 fractional
+@ bits, swaps the two bytes of every 16-bit result and stores it to dst.
+@   r0 = dst, r1 = src, r2 = count, r3 = remaining-element counter
+@ The 8-wide loop uses :128 alignment hints on both pointers, so src and dst
+@ must be 16-byte aligned whenever count >= 8. The count % 8 leftover elements
+@ are converted one at a time without an alignment hint.
+@ Values that do not fit in 16 bits saturate in both the 8-wide loop and the
+@ scalar tail.
+function x264_mbtree_fix8_pack_neon, export=1
+    subs            r3,  r2,  #8                @ r3 = count - 8
+    blt             2f                          @ fewer than 8 elements: tail only
+1:
+    subs            r3,  r3,  #8                @ flags are consumed by "bge 1b" below
+    vld1.32         {q0,q1}, [r1,:128]!         @ load 8 floats
+    vcvt.s32.f32    q0,  q0,  #8                @ float -> s32 with 8 fractional bits
+    vcvt.s32.f32    q1,  q1,  #8
+    vqmovn.s32      d4,  q0                     @ saturating narrow s32 -> s16
+    vqmovn.s32      d5,  q1
+    vrev16.8        q3,  q2                     @ swap the bytes inside each halfword
+    vst1.16         {q3}, [r0,:128]!            @ store 8 packed values
+    bge             1b                          @ at least 8 more elements remain
+2:
+    adds            r3,  r3,  #8                @ r3 = leftover elements (0..7 for count >= 0)
+    bxeq            lr                          @ nothing left over
+3:
+    subs            r3,  r3,  #1                @ flags are consumed by "bgt 3b" below
+    vld1.32         {d0[0]}, [r1]!              @ load one float into s0
+    vcvt.s32.f32    s0,  s0,  #8                @ float -> s32 with 8 fractional bits
+    @ Saturate exactly like the 8-wide loop. Without this narrowing step the
+    @ store below would keep only the low 16 bits of the s32, so out-of-range
+    @ inputs would wrap in the tail but clamp in the vector loop. The other
+    @ lanes of q0 hold stale data; only lane 0 of the result is stored.
+    vqmovn.s32      d0,  q0
+    vrev16.8        d0,  d0                     @ swap the bytes inside each halfword
+    vst1.16         {d0[0]}, [r0]!              @ store one packed value
+    bgt             3b
+
+    bx              lr
+endfunc
+
+@ void mbtree_fix8_unpack( float *dst, int16_t *src, int count )
+@
+@ Reads count 16-bit values from src, swaps the two bytes of each one,
+@ interprets the result as signed fixed point with 8 fractional bits and
+@ stores the corresponding floats to dst.
+@   r0 = dst, r1 = src, r2 = count, r3 = remaining-element counter
+@ The 8-wide loop uses :128 alignment hints on both pointers, so src and dst
+@ must be 16-byte aligned whenever count >= 8. The count % 8 leftover elements
+@ are converted one at a time without an alignment hint.
+function x264_mbtree_fix8_unpack_neon, export=1
+    subs            r3,  r2,  #8                @ r3 = count - 8
+    blt             2f                          @ fewer than 8 elements: tail only
+1:
+    subs            r3,  r3,  #8                @ flags are consumed by "bge 1b" below
+    vld1.16         {q0}, [r1,:128]!            @ load 8 packed 16-bit values
+    vrev16.8        q1,  q0                     @ swap the bytes inside each halfword
+    vmovl.s16       q0,  d2                     @ sign-extend the low 4 values to s32
+    vmovl.s16       q1,  d3                     @ sign-extend the high 4 values to s32
+    vcvt.f32.s32    q0,  q0,  #8                @ s32 with 8 fractional bits -> float
+    vcvt.f32.s32    q1,  q1,  #8
+    vst1.32         {q0,q1}, [r0,:128]!         @ store 8 floats
+    bge             1b                          @ at least 8 more elements remain
+2:
+    adds            r3,  r3,  #8                @ r3 = leftover elements (0..7 for count >= 0)
+    bxeq            lr                          @ nothing left over
+3:
+    subs            r3,  r3,  #1                @ flags are consumed by "bgt 3b" below
+    vld1.16         {d0[0]}, [r1]!              @ load one packed value into lane 0
+    vrev16.8        d0,  d0                     @ swap the bytes inside each halfword
+    vmovl.s16       q0,  d0                     @ sign-extend; only lane 0 is meaningful
+    vcvt.f32.s32    d0,  d0,  #8                @ s32 with 8 fractional bits -> float
+    vst1.32         {d0[0]}, [r0]!              @ store one float
+    bgt             3b
+
+    bx              lr
+endfunc
index a2ab9a35de554cd0efcb4e42635770e67c3ef8d6..d330bc305b414f46df02dacf1e565b4dc2e99302 100644 (file)
@@ -109,6 +109,9 @@ void integral_init8v_neon( uint16_t *, intptr_t );
 
 void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
 
+// NEON assembly routines (common/arm/mc-a.S) that convert count elements between float
+// and byte-swapped 16-bit fixed point with 8 fractional bits (pack: float -> fixed, unpack: fixed -> float).
+void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count );
+void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count );
+
 #if !HIGH_BIT_DEPTH
 static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
 {
@@ -291,6 +294,8 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
 
     pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
     pf->mbtree_propagate_list = x264_mbtree_propagate_list_neon;
+    pf->mbtree_fix8_pack      = x264_mbtree_fix8_pack_neon;
+    pf->mbtree_fix8_unpack    = x264_mbtree_fix8_unpack_neon;
 #endif // !HIGH_BIT_DEPTH
 
 // Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs