From: Janne Grunau
Date: Sun, 24 Apr 2016 12:38:56 +0000 (+0200)
Subject: arm: Add asm for mbtree fixed point conversion
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=14a58532fea2c5f9e7b93c918476d842091c4268;p=libx264

arm: Add asm for mbtree fixed point conversion

7-8 times faster on a cortex-a53 vs. gcc-5.3.

mbtree_fix8_pack_c: 44114
mbtree_fix8_pack_neon: 5805
mbtree_fix8_unpack_c: 38924
mbtree_fix8_unpack_neon: 4870
---

diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
index 1dbd4989..76295cd4 100644
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
@@ -1880,3 +1880,60 @@ function x264_mbtree_propagate_list_internal_neon
     bge             8b
     bx              lr
 endfunc
+
+@ void mbtree_fix8_pack( int16_t *dst, float *src, int count )
+function x264_mbtree_fix8_pack_neon, export=1   @ r0 = dst, r1 = src, r2 = count; floats -> byte-swapped 16-bit fixed point (8 fractional bits). NOTE(review): the C prototype declares dst as uint16_t *
+    subs            r3,  r2,  #8                @ r3 = count - 8
+    blt             2f                          @ fewer than 8 elements: skip the vector loop
+1:                                              @ vector loop: 8 elements per iteration
+    subs            r3,  r3,  #8                @ flags from this subs are consumed by the bge at the bottom of the loop
+    vld1.32         {q0,q1}, [r1,:128]!         @ load 8 floats, src post-incremented; :128 asserts a 16-byte aligned address
+    vcvt.s32.f32    q0,  q0,  #8                @ float -> signed 32-bit fixed point with 8 fractional bits
+    vcvt.s32.f32    q1,  q1,  #8
+    vqmovn.s32      d4,  q0                     @ saturating narrow 32 -> 16 bits; d4/d5 together form q2
+    vqmovn.s32      d5,  q1
+    vrev16.8        q3,  q2                     @ swap the two bytes of every 16-bit value
+    vst1.16         {q3}, [r0,:128]!            @ store 8 halfwords, dst post-incremented; :128 asserts a 16-byte aligned address
+    bge             1b                          @ repeat while at least 8 more elements remain
+2:
+    adds            r3,  r3,  #8                @ r3 = elements left over after the 8-wide loop
+    bxeq            lr                          @ nothing left over: return
+3:                                              @ scalar tail: one element per iteration
+    subs            r3,  r3,  #1
+    vld1.32         {d0[0]}, [r1]!              @ load one float into lane 0 of d0 (= s0)
+    vcvt.s32.f32    s0,  s0,  #8                @ same conversion as above, on a single value
+    vrev16.8        d0,  d0
+    vst1.16         {d0[0]}, [r0]!              @ stores only the low halfword of the 32-bit result. NOTE(review): no vqmovn here, so unlike the vector loop an out-of-range value is truncated instead of saturated -- confirm inputs always fit in 16 bits
+    bgt             3b
+
+    bx              lr
+endfunc
+
+@ void mbtree_fix8_unpack( float *dst, int16_t *src, int count )
+function x264_mbtree_fix8_unpack_neon, export=1 @ r0 = dst, r1 = src, r2 = count; byte-swapped 16-bit fixed point (8 fractional bits) -> floats. NOTE(review): the C prototype declares src as uint16_t *
+    subs            r3,  r2,  #8                @ r3 = count - 8
+    blt             2f                          @ fewer than 8 elements: skip the vector loop
+1:                                              @ vector loop: 8 elements per iteration
+    subs            r3,  r3,  #8                @ flags from this subs are consumed by the bge at the bottom of the loop
+    vld1.16         {q0}, [r1,:128]!            @ load 8 halfwords, src post-incremented; :128 asserts a 16-byte aligned address
+    vrev16.8        q1,  q0                     @ swap the two bytes of every 16-bit value
+    vmovl.s16       q0,  d2                     @ sign-extend the low half of q1 to 32 bits; must happen before q1 is overwritten below
+    vmovl.s16       q1,  d3                     @ sign-extend the high half of q1 (d3) in place
+    vcvt.f32.s32    q0,  q0,  #8                @ signed fixed point with 8 fractional bits -> float
+    vcvt.f32.s32    q1,  q1,  #8
+    vst1.32         {q0,q1}, [r0,:128]!         @ store 8 floats, dst post-incremented; :128 asserts a 16-byte aligned address
+    bge             1b                          @ repeat while at least 8 more elements remain
+2:
+    adds            r3,  r3,  #8                @ r3 = elements left over after the 8-wide loop
+    bxeq            lr                          @ nothing left over: return
+3:                                              @ scalar tail: one element per iteration
+    subs            r3,  r3,  #1
+    vld1.16         {d0[0]}, [r1]!              @ load one halfword into lane 0 of d0
+    vrev16.8        d0,  d0
+    vmovl.s16       q0,  d0                     @ widens all four halfwords of d0; only 32-bit lane 0 is used afterwards
+    vcvt.f32.s32    d0,  d0,  #8                @ converts both 32-bit lanes of d0; only lane 0 is stored
+    vst1.32         {d0[0]}, [r0]!
+    bgt             3b
+
+    bx              lr
+endfunc
diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
index a2ab9a35..d330bc30 100644
--- a/common/arm/mc-c.c
+++ b/common/arm/mc-c.c
@@ -109,6 +109,9 @@ void integral_init8v_neon( uint16_t *, intptr_t );
 
 void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
 
+void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count );   // defined in common/arm/mc-a.S
+void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count ); // defined in common/arm/mc-a.S
+
 #if !HIGH_BIT_DEPTH
 static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
 {
@@ -291,6 +294,8 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
 
     pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
     pf->mbtree_propagate_list = x264_mbtree_propagate_list_neon;
+    pf->mbtree_fix8_pack      = x264_mbtree_fix8_pack_neon;   // float -> 16-bit fixed point
+    pf->mbtree_fix8_unpack    = x264_mbtree_fix8_unpack_neon; // 16-bit fixed point -> float
 #endif // !HIGH_BIT_DEPTH
 
 // Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs