From: Janne Grunau
Date: Thu, 14 Aug 2014 13:22:50 +0000 (+0100)
Subject: aarch64: NEON asm for integral init
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=be7e5fa6eee2731abdb1b41bc2a4c1a29e672747;p=libx264

aarch64: NEON asm for integral init

integral_init4h_neon and integral_init8h_neon are 3-4 times faster than C.
integral_init8v_neon is 6 times faster and integral_init4v_neon is 10 times
faster.
---

Review notes (text in this area is not part of the commit message):

* integral_init4h_neon / integral_init8h_neon: the second read pointer is
  now computed as x0 - (x2 << 1) instead of x0 - x2.  x0 is declared as
  uint16_t * in mc-c.c and x2 counts 16-bit outputs (the loops retire 16
  per "subs x2, x2, #16"), so stepping back x2 entries needs a byte offset
  of 2*x2.  This matches integral_init4v_neon / integral_init8v_neon, which
  scale the same argument with lsl #3 (4 rows) and lsl #4 (8 rows).
* The "index" line below still names the blobs of the unfixed patch.
* NOTE(review): this copy of the patch had its whitespace collapsed.
  Indentation, column alignment and the blank context lines were
  reconstructed from the hunk line counts; the e-mail addresses of the
  original headers are missing.  Check the context against the tree (or use
  "git apply --ignore-whitespace") before applying.
* Trailing comments were added to existing lines only, so the hunk line
  counts are unchanged.

diff --git a/common/aarch64/mc-a.S b/common/aarch64/mc-a.S
index 351317e5..83652f2e 100644
--- a/common/aarch64/mc-a.S
+++ b/common/aarch64/mc-a.S
@@ -1363,3 +1363,124 @@ function x264_store_interleave_chroma_neon, export=1
 
     ret
 endfunc
+
+.macro integral4h p1, p2
+    ext         v1.8b,  \p1\().8b,  \p2\().8b,  #1      // source bytes shifted by 1
+    ext         v2.8b,  \p1\().8b,  \p2\().8b,  #2      // ... by 2
+    ext         v3.8b,  \p1\().8b,  \p2\().8b,  #3      // ... by 3
+    uaddl       v0.8h,  \p1\().8b,  v1.8b               // widen to u16: b[i] + b[i+1]
+    uaddl       v4.8h,  v2.8b,  v3.8b                   // b[i+2] + b[i+3]
+    add         v0.8h,  v0.8h,  v4.8h                   // sum of 4 consecutive bytes
+    add         v0.8h,  v0.8h,  v5.8h                   // plus the addend the caller left in v5
+.endm
+
+function integral_init4h_neon, export=1
+    sub         x3,  x0,  x2,  lsl #1       // x3 = x0 - 2*x2 bytes: x2 uint16 entries before x0
+    ld1        {v6.8b,v7.8b}, [x1], #16     // prime the first 16 source bytes
+1:
+    subs        x2,  x2,  #16               // 16 outputs are produced per iteration
+    ld1        {v5.8h},  [x3], #16          // addend for outputs 0-7
+    integral4h  v6, v7
+    ld1        {v6.8b},  [x1], #8           // next 8 source bytes, used as the high half below
+    ld1        {v5.8h},  [x3], #16          // addend for outputs 8-15 (v0 already holds 0-7)
+    st1        {v0.8h},  [x0], #16
+    integral4h  v7, v6
+    ld1        {v7.8b},  [x1], #8
+    st1        {v0.8h},  [x0], #16
+    b.gt        1b                          // flags still come from the subs above
+    ret
+endfunc
+
+.macro integral8h p1, p2, s
+    ext         v1.8b,  \p1\().8b,  \p2\().8b,  #1      // source bytes shifted by 1 ... 7
+    ext         v2.8b,  \p1\().8b,  \p2\().8b,  #2
+    ext         v3.8b,  \p1\().8b,  \p2\().8b,  #3
+    ext         v4.8b,  \p1\().8b,  \p2\().8b,  #4
+    ext         v5.8b,  \p1\().8b,  \p2\().8b,  #5
+    ext         v6.8b,  \p1\().8b,  \p2\().8b,  #6
+    ext         v7.8b,  \p1\().8b,  \p2\().8b,  #7
+    uaddl       v0.8h,  \p1\().8b,  v1.8b               // widen to u16: b[i] + b[i+1]
+    uaddl       v2.8h,  v2.8b,  v3.8b                   // b[i+2] + b[i+3]
+    uaddl       v4.8h,  v4.8b,  v5.8b                   // b[i+4] + b[i+5]
+    uaddl       v6.8h,  v6.8b,  v7.8b                   // b[i+6] + b[i+7]
+    add         v0.8h,  v0.8h,  v2.8h
+    add         v4.8h,  v4.8h,  v6.8h
+    add         v0.8h,  v0.8h,  v4.8h                   // sum of 8 consecutive bytes
+    add         v0.8h,  v0.8h,  \s\().8h                // plus the addend register passed as s
+.endm
+
+function integral_init8h_neon, export=1
+    sub         x3,  x0,  x2,  lsl #1       // x3 = x0 - 2*x2 bytes: x2 uint16 entries before x0
+    ld1        {v16.8b,v17.8b}, [x1], #16   // prime the first 16 source bytes
+1:
+    subs        x2,  x2,  #16               // 16 outputs are produced per iteration
+    ld1        {v18.8h}, [x3], #16          // addend for outputs 0-7
+    integral8h  v16, v17, v18
+    ld1        {v16.8b}, [x1], #8           // next 8 source bytes, used as the high half below
+    ld1        {v18.8h}, [x3], #16          // addend for outputs 8-15 (v0 already holds 0-7)
+    st1        {v0.8h},  [x0], #16
+    integral8h  v17, v16, v18
+    ld1        {v17.8b}, [x1], #8
+    st1        {v0.8h},  [x0], #16
+    b.gt        1b                          // flags still come from the subs above
+    ret
+endfunc
+
+function integral_init4v_neon, export=1
+    mov         x3,  x0                     // x3 reads the x0 row ahead of the stores to x0
+    add         x4,  x0,  x2,  lsl #3       // x4 = x0 + 4*x2 uint16 entries
+    add         x8,  x0,  x2,  lsl #4       // x8 = x0 + 8*x2 uint16 entries
+    sub         x2,  x2,  #8                // the loop runs while x2 - 8 has entries left
+    ld1        {v20.8h,v21.8h,v22.8h}, [x3], #48
+    ld1        {v16.8h,v17.8h,v18.8h}, [x8], #48
+1:
+    subs        x2,  x2,  #16               // 16 entries per iteration
+    ld1        {v24.8h,v25.8h}, [x4], #32
+    ext         v0.16b,  v20.16b, v21.16b, #8       // x3 row shifted by 4 entries
+    ext         v1.16b,  v21.16b, v22.16b, #8
+    ext         v2.16b,  v16.16b, v17.16b, #8       // x8 row shifted by 4 entries
+    ext         v3.16b,  v17.16b, v18.16b, #8
+    sub         v24.8h,  v24.8h,  v20.8h            // x4 row - x3 row, stored through x1
+    sub         v25.8h,  v25.8h,  v21.8h
+    add         v0.8h,   v0.8h,   v20.8h            // x3[i] + x3[i+4]
+    add         v1.8h,   v1.8h,   v21.8h
+    add         v2.8h,   v2.8h,   v16.8h            // x8[i] + x8[i+4]
+    add         v3.8h,   v3.8h,   v17.8h
+    st1        {v24.8h},  [x1], #16
+    st1        {v25.8h},  [x1], #16
+    mov         v20.16b, v22.16b                    // carry the last vector into the next iteration
+    mov         v16.16b, v18.16b
+    sub         v0.8h,   v2.8h,   v0.8h             // x8 pair sum - x3 pair sum
+    sub         v1.8h,   v3.8h,   v1.8h
+    ld1        {v21.8h,v22.8h}, [x3], #32
+    ld1        {v17.8h,v18.8h}, [x8], #32
+    st1        {v0.8h},   [x0], #16                 // written back over the x0 row
+    st1        {v1.8h},   [x0], #16
+    b.gt        1b
+2:
+    ret
+endfunc
+
+function integral_init8v_neon, export=1
+    add         x2,  x0,  x1,  lsl #4       // x2 = x0 + 8*x1 uint16 entries
+    sub         x1,  x1,  #8                // entries to process
+    ands        x3,  x1,  #16 - 1           // zero when the count is a multiple of 16
+    b.eq        1f
+    subs        x1,  x1,  #8                // otherwise handle one group of 8 first
+    ld1        {v0.8h},  [x0]
+    ld1        {v2.8h},  [x2], #16
+    sub         v4.8h,  v2.8h,  v0.8h       // x2 row - x0 row, stored back to x0
+    st1        {v4.8h},  [x0], #16
+    b.le        2f                          // nothing left after the single group
+1:
+    subs        x1,  x1,  #16               // 16 entries per iteration
+    ld1        {v0.8h,v1.8h}, [x0]
+    ld1        {v2.8h,v3.8h}, [x2], #32
+    sub         v4.8h,  v2.8h,  v0.8h
+    sub         v5.8h,  v3.8h,  v1.8h
+    st1        {v4.8h},  [x0], #16
+    st1        {v5.8h},  [x0], #16
+    b.gt        1b
+2:
+    ret
+endfunc
diff --git a/common/aarch64/mc-c.c b/common/aarch64/mc-c.c
index c71606fc..f40fed62 100644
--- a/common/aarch64/mc-c.c
+++ b/common/aarch64/mc-c.c
@@ -4,6 +4,7 @@
  * Copyright (C) 2009-2014 x264 project
  *
  * Authors: David Conrad
+ *          Janne Grunau
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -89,6 +90,10 @@ void x264_mc_copy_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 
 void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int );
+void integral_init4h_neon( uint16_t *, uint8_t *, intptr_t );
+void integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
+void integral_init8h_neon( uint16_t *, uint8_t *, intptr_t );
+void integral_init8v_neon( uint16_t *, intptr_t );
 void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int );
 
 #if !HIGH_BIT_DEPTH
@@ -242,5 +247,10 @@ void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
     pf->get_ref = get_ref_neon;
     pf->hpel_filter = x264_hpel_filter_neon;
     pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;
+
+    pf->integral_init4h = integral_init4h_neon;
+    pf->integral_init8h = integral_init8h_neon;
+    pf->integral_init4v = integral_init4v_neon;
+    pf->integral_init8v = integral_init8v_neon;
 #endif // !HIGH_BIT_DEPTH
 }