bge 8b
bx lr
endfunc
+
+@ void mbtree_fix8_pack( uint16_t *dst, float *src, int count )
+@
+@ Converts count floats at src to signed fixed point with 8 fractional bits,
+@ saturates each result to 16 bits, swaps the two bytes of every 16-bit
+@ value and stores it at dst.
+@ In:  r0 = dst, r1 = src, r2 = count (assumed >= 0; negative is not rejected)
+@ Out: none; r0/r1 are advanced, r3, q0-q3 and the flags are overwritten
+@ The 8-wide loop uses :128 alignment hints on both pointers, so dst and src
+@ must be 16-byte aligned whenever count >= 8.
+function x264_mbtree_fix8_pack_neon, export=1
+ subs r3, r2, #8 @ r3 = count - 8
+ blt 2f @ fewer than 8 elements: scalar tail only
+1:
+ subs r3, r3, #8 @ flags consumed by the bge below (NEON ops leave them alone)
+ vld1.32 {q0,q1}, [r1,:128]! @ load 8 floats
+ vcvt.s32.f32 q0, q0, #8 @ float -> s32 fixed point, 8 fractional bits
+ vcvt.s32.f32 q1, q1, #8
+ vqmovn.s32 d4, q0 @ saturating narrow s32 -> s16 (q2 = d4:d5)
+ vqmovn.s32 d5, q1
+ vrev16.8 q3, q2 @ swap the bytes of each 16-bit value
+ vst1.16 {q3}, [r0,:128]! @ store 8 packed values
+ bge 1b @ at least 8 more elements remain
+2:
+ adds r3, r3, #8 @ r3 = elements left over (0..7)
+ bxeq lr @ nothing left
+3:
+ subs r3, r3, #1
+ vld1.32 {d0[0]}, [r1]! @ load one float into lane 0
+ vcvt.s32.f32 s0, s0, #8
+ vqmovn.s32 d0, q0 @ saturate like the 8-wide loop; only lane 0 is used
+ vrev16.8 d0, d0
+ vst1.16 {d0[0]}, [r0]! @ store one packed value
+ bgt 3b
+
+ bx lr
+endfunc
+
+@ void mbtree_fix8_unpack( float *dst, uint16_t *src, int count )
+@
+@ Reads count 16-bit values from src, swaps the two bytes of each, sign-extends
+@ it to 32 bits, interprets it as fixed point with 8 fractional bits and
+@ stores the resulting float at dst.
+@ In:  r0 = dst, r1 = src, r2 = count (assumed >= 0; negative is not rejected)
+@ Out: none; r0/r1 are advanced, r3, q0, q1 and the flags are overwritten
+@ The 8-wide loop uses :128 alignment hints on both pointers, so dst and src
+@ must be 16-byte aligned whenever count >= 8.
+function x264_mbtree_fix8_unpack_neon, export=1
+ subs r3, r2, #8 @ r3 = count - 8
+ blt 2f @ fewer than 8 elements: scalar tail only
+1:
+ subs r3, r3, #8 @ flags consumed by the bge below (NEON ops leave them alone)
+ vld1.16 {q0}, [r1,:128]! @ load 8 packed values
+ vrev16.8 q1, q0 @ swap the bytes of each 16-bit value
+ vmovl.s16 q0, d2 @ sign-extend low 4 values to s32 (q1 = d2:d3)
+ vmovl.s16 q1, d3 @ sign-extend high 4 values; d3 is read before q1 is written
+ vcvt.f32.s32 q0, q0, #8 @ s32 fixed point (8 fractional bits) -> float
+ vcvt.f32.s32 q1, q1, #8
+ vst1.32 {q0,q1}, [r0,:128]! @ store 8 floats
+ bge 1b @ at least 8 more elements remain
+2:
+ adds r3, r3, #8 @ r3 = elements left over (0..7)
+ bxeq lr @ nothing left
+3:
+ subs r3, r3, #1
+ vld1.16 {d0[0]}, [r1]! @ load one packed value into lane 0
+ vrev16.8 d0, d0
+ vmovl.s16 q0, d0 @ widens all four lanes; only lane 0 is meaningful
+ vcvt.f32.s32 d0, d0, #8
+ vst1.32 {d0[0]}, [r0]! @ store one float
+ bgt 3b
+
+ bx lr
+endfunc
void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
+// NEON assembly routines converting count elements between float and
+// byte-swapped 16-bit fixed point with 8 fractional bits.
+void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count );
+void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count );
+
#if !HIGH_BIT_DEPTH
static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
{
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
pf->mbtree_propagate_list = x264_mbtree_propagate_list_neon;
+ pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_neon;
+ pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_neon;
#endif // !HIGH_BIT_DEPTH
// Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs