From: Martin Storsjö
Date: Tue, 25 Aug 2015 11:38:14 +0000 (+0300)
Subject: arm: Implement integral_init4/8h/v_neon
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=5265b927b0f2e043dd39cbbbf3909da0862d60e6;p=libx264

arm: Implement integral_init4/8h/v_neon

checkasm timing       Cortex-A7      A8     A9
integral_init4h_c         10466    8590   6161
integral_init4h_neon       3021    1494   1800
integral_init4v_c         16250   13590  13628
integral_init4v_neon       3473    2073   3291
integral_init8h_c         10100    8275   5705
integral_init8h_neon       4403    2344   2751
integral_init8v_c          6403    4632   4999
integral_init8v_neon       1184     783   1306
---

diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
index 695a6ca8..36ce86fa 100644
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
@@ -1603,3 +1603,128 @@ function x264_store_interleave_chroma_neon
 
     pop {pc}
 endfunc
+
+.macro integral4h p1, p2
+    vext.8 d1, \p1, \p2, #1                 @ d1 = bytes 1..8 of the 16-byte window p1:p2
+    vext.8 d2, \p1, \p2, #2                 @ d2 = bytes 2..9
+    vext.8 d3, \p1, \p2, #3                 @ d3 = bytes 3..10
+    vaddl.u8 q0, \p1, d1                    @ q0[i] = b[i] + b[i+1], widened to 16 bits
+    vaddl.u8 q1, d2, d3                     @ q1[i] = b[i+2] + b[i+3]
+    vadd.u16 q0, q0, q1                     @ q0[i] = sum of the 4 consecutive bytes starting at i
+    vadd.u16 q0, q0, q2                     @ add the 8 values the caller preloaded into q2; result is left in q0
+.endm
+
+function integral_init4h_neon
+    sub r3, r0, r2, lsl #1                  @ r0: uint16_t *, r1: uint8_t *, r2: intptr_t (C prototype in mc-c.c, standard ARM argument registers); r3 = r0 minus r2 uint16_t entries
+    vld1.8 {d6, d7}, [r1, :128]!            @ prime the window d6:d7 with the first 16 input bytes, r1 += 16
+1:
+    subs r2, r2, #16                        @ 16 outputs per iteration; these flags are what bgt tests below
+    vld1.16 {q2}, [r3, :128]!               @ q2 = 8 uint16_t read through r3, added in by integral4h
+    integral4h d6, d7
+    vld1.8 {d6}, [r1, :64]!                 @ refill d6 with the next 8 bytes; the window is now d7:d6
+    vld1.16 {q2}, [r3, :128]!               @ next 8 uint16_t through r3 for the second half
+    vst1.16 {q0}, [r0, :128]!               @ store outputs 0..7 of this iteration
+    integral4h d7, d6
+    vld1.8 {d7}, [r1, :64]!                 @ refill d7; the window is d6:d7 again for the next pass
+    vst1.16 {q0}, [r0, :128]!               @ store outputs 8..15
+    bgt 1b                                  @ loop while the remaining count is positive
+    bx lr
+endfunc
+
+.macro integral8h p1, p2, s
+    vext.8 d1, \p1, \p2, #1                 @ d1..d7 = the 16-byte window p1:p2 shifted by 1..7 bytes
+    vext.8 d2, \p1, \p2, #2
+    vext.8 d3, \p1, \p2, #3
+    vext.8 d4, \p1, \p2, #4
+    vext.8 d5, \p1, \p2, #5
+    vext.8 d6, \p1, \p2, #6
+    vext.8 d7, \p1, \p2, #7
+    vaddl.u8 q0, \p1, d1                    @ q0[i] = b[i] + b[i+1], widened to 16 bits
+    vaddl.u8 q1, d2, d3                     @ q1[i] = b[i+2] + b[i+3]
+    vaddl.u8 q2, d4, d5                     @ q2[i] = b[i+4] + b[i+5]
+    vaddl.u8 q3, d6, d7                     @ q3[i] = b[i+6] + b[i+7]
+    vadd.u16 q0, q0, q1
+    vadd.u16 q2, q2, q3
+    vadd.u16 q0, q0, q2                     @ q0[i] = sum of the 8 consecutive bytes starting at i
+    vadd.u16 q0, q0, \s                     @ add the vector passed as s; q0-q3 are all overwritten by this macro
+.endm
+
+function integral_init8h_neon
+    sub r3, r0, r2, lsl #1                  @ r0: uint16_t *, r1: uint8_t *, r2: intptr_t (C prototype in mc-c.c, standard ARM argument registers); r3 = r0 minus r2 uint16_t entries
+    vld1.8 {d16, d17}, [r1, :128]!          @ prime the window d16:d17 with the first 16 input bytes (d0-d7 are scratch for integral8h)
+1:
+    subs r2, r2, #16                        @ 16 outputs per iteration; these flags are what bgt tests below
+    vld1.16 {q9}, [r3, :128]!               @ q9 = 8 uint16_t read through r3, passed to integral8h as the addend
+    integral8h d16, d17, q9
+    vld1.8 {d16}, [r1, :64]!                @ refill d16 with the next 8 bytes; the window is now d17:d16
+    vld1.16 {q9}, [r3, :128]!               @ next 8 uint16_t through r3 for the second half
+    vst1.16 {q0}, [r0, :128]!               @ store outputs 0..7 of this iteration
+    integral8h d17, d16, q9
+    vld1.8 {d17}, [r1, :64]!                @ refill d17; the window is d16:d17 again for the next pass
+    vst1.16 {q0}, [r0, :128]!               @ store outputs 8..15
+    bgt 1b                                  @ loop while the remaining count is positive
+    bx lr
+endfunc
+
+function integral_init4v_neon
+    push {r4-r5}                            @ r0, r1: uint16_t *, r2: intptr_t (C prototype in mc-c.c, standard ARM argument registers)
+    mov r3, r0                              @ r3 = read cursor over the row starting at r0; r0 itself is used as a write cursor
+    add r4, r0, r2, lsl #3                  @ r4 = r0 + 8*r2 bytes = 4*r2 uint16_t entries further on
+    add r5, r0, r2, lsl #4                  @ r5 = r0 + 16*r2 bytes = 8*r2 uint16_t entries further on
+    sub r2, r2, #8                          @ loop count = r2 - 8
+    vld1.16 {q11, q12}, [r3]!               @ prime q11..q13 with the first 24 entries read through r3
+    vld1.16 {q8, q9}, [r5]!                 @ prime q8..q10 with the first 24 entries read through r5
+    vld1.16 {q13}, [r3]!
+    vld1.16 {q10}, [r5]!
+1:
+    subs r2, r2, #16                        @ 16 outputs per iteration; these flags are what bgt tests below
+    vld1.16 {q14, q15}, [r4]!               @ 16 entries read through r4
+    vext.8 q0, q11, q12, #8                 @ q0, q1 = r3 entries shifted by 4 entries (8 bytes)
+    vext.8 q1, q12, q13, #8
+    vext.8 q2, q8, q9, #8                   @ q2, q3 = r5 entries shifted by 4 entries
+    vext.8 q3, q9, q10, #8
+    vsub.u16 q14, q14, q11                  @ q14, q15 = r4row[x] - r3row[x]
+    vsub.u16 q15, q15, q12
+    vadd.u16 q0, q0, q11                    @ q0, q1 = r3row[x] + r3row[x+4]
+    vadd.u16 q1, q1, q12
+    vadd.u16 q2, q2, q8                     @ q2, q3 = r5row[x] + r5row[x+4]
+    vadd.u16 q3, q3, q9
+    vst1.16 {q14}, [r1]!                    @ write the 16 differences through r1
+    vst1.16 {q15}, [r1]!
+    vmov q11, q13                           @ slide both windows: the last 8 loaded entries become the first 8
+    vmov q8, q10
+    vsub.u16 q0, q2, q0                     @ q0, q1 = (r5row[x] + r5row[x+4]) - (r3row[x] + r3row[x+4])
+    vsub.u16 q1, q3, q1
+    vld1.16 {q12, q13}, [r3]!               @ fetch the next 16 entries through r3 and r5
+    vld1.16 {q9, q10}, [r5]!
+    vst1.16 {q0}, [r0]!                     @ results go back over the row that r3 has already read past
+    vst1.16 {q1}, [r0]!
+    bgt 1b                                  @ loop while the remaining count is positive
+2:
+    pop {r4-r5}
+    bx lr
+endfunc
+
+function integral_init8v_neon
+    add r2, r0, r1, lsl #4                  @ r0: uint16_t *, r1: intptr_t (C prototype in mc-c.c, standard ARM argument registers); r2 = r0 + 16*r1 bytes = 8*r1 uint16_t entries further on
+    sub r1, r1, #8                          @ count = r1 - 8
+    ands r3, r1, #16 - 1                    @ r3 = count mod 16
+    beq 1f                                  @ count is a multiple of 16: go straight to the 16-wide loop
+    subs r1, r1, #8                         @ otherwise peel one group of 8 first; NOTE(review): presumably count is always a multiple of 8 - confirm against callers
+    vld1.16 {q0}, [r0]                      @ no writeback here: r0 advances on the store below
+    vld1.16 {q2}, [r2]!
+    vsub.u16 q8, q2, q0                     @ q8 = entries at r2 minus entries at r0
+    vst1.16 {q8}, [r0]!                     @ stored in place over the entries just read from r0
+    ble 2f                                  @ nothing left after the peeled group (flags still from subs above)
+1:
+    subs r1, r1, #16                        @ 16 outputs per iteration; these flags are what bgt tests below
+    vld1.16 {q0, q1}, [r0]                  @ no writeback here: r0 advances on the two stores below
+    vld1.16 {q2, q3}, [r2]!
+    vsub.u16 q8, q2, q0                     @ q8, q9 = entries at r2 minus entries at r0
+    vsub.u16 q9, q3, q1
+    vst1.16 {q8}, [r0]!                     @ stored in place over the entries just read from r0
+    vst1.16 {q9}, [r0]!
+    bgt 1b                                  @ loop while the remaining count is positive
+2:
+    bx lr
+endfunc
diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
index 2633772e..3fa18ec8 100644
--- a/common/arm/mc-c.c
+++ b/common/arm/mc-c.c
@@ -97,6 +97,11 @@ void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, intptr_t, int );
 void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
 void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );
 
+void integral_init4h_neon( uint16_t *, uint8_t *, intptr_t ); // the four routines below are implemented in common/arm/mc-a.S
+void integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
+void integral_init8h_neon( uint16_t *, uint8_t *, intptr_t );
+void integral_init8v_neon( uint16_t *, intptr_t );
+
 #if !HIGH_BIT_DEPTH
 static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
 {
@@ -268,6 +273,11 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
     pf->get_ref = get_ref_neon;
     pf->hpel_filter = hpel_filter_neon;
     pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;
+
+    pf->integral_init4h = integral_init4h_neon; // installed only in the !HIGH_BIT_DEPTH build (see the #endif just below)
+    pf->integral_init8h = integral_init8h_neon;
+    pf->integral_init4v = integral_init4v_neon;
+    pf->integral_init8v = integral_init8v_neon;
 #endif // !HIGH_BIT_DEPTH
 
 // Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs