From: Janne Grunau
Date: Fri, 10 Oct 2014 08:29:15 +0000 (+0200)
Subject: aarch64: NEON asm for intra chroma deblocking
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=f2e439d113ae86a0a1ef8215d4d4111892aed3f7;p=libx264

aarch64: NEON asm for intra chroma deblocking

deblock_h_chroma_420_intra, deblock_h_chroma_422_intra and
x264_deblock_h_chroma_intra_mbaff_neon are ~3 times faster.
deblock_chroma_intra[1] is ~4 times faster than C.
---
// Reviewer notes (annotation only, not part of the commit message):
// - Line breaks and indentation of this patch were reconstructed from a
//   whitespace-collapsed copy; context-line whitespace may differ from the
//   upstream files.
// - deblock-a.S gains two helper macros and four exported functions. Going by
//   the prototypes added to deblock.c and the alpha/beta comments in the asm,
//   each function receives pix in x0, stride in x1, alpha in w2, beta in w3.
// - Register roles inside h264_loop_filter_chroma_intra, per its own comments:
//   v16 = p0, v17 = q0, v18 = p1, v19 = q1, v30 = alpha, v31 = beta.
// - h264_loop_filter_start_intra returns from the enclosing function when
//   alpha and beta are both zero; it overwrites w4 and the condition flags.
// - transpose4x4.h and transpose4x8.h are defined outside this patch.

diff --git a/common/aarch64/deblock-a.S b/common/aarch64/deblock-a.S
index 00be8e70..9bcd6ade 100644
--- a/common/aarch64/deblock-a.S
+++ b/common/aarch64/deblock-a.S
@@ -275,6 +275,173 @@ function x264_deblock_h_chroma_neon, export=1
     ret
 endfunc
 
+.macro h264_loop_filter_start_intra
+    orr             w4,  w2,  w3                // w4 = alpha | beta
+    cmp             w4,  #0
+    b.ne            1f                          // at least one is non-zero: continue with the setup
+    ret                                         // both are zero: return from the enclosing function
+1:
+    dup             v30.16b, w2                 // alpha
+    dup             v31.16b, w3                 // beta
+.endm
+// Intra chroma filter on v16 (p0), v17 (q0), v18 (p1), v19 (q1); width=8 skips the upper-half arithmetic.
+.macro h264_loop_filter_chroma_intra, width=16
+    uabd            v26.16b, v16.16b, v17.16b   // abs(p0 - q0)
+    uabd            v27.16b, v18.16b, v16.16b   // abs(p1 - p0)
+    uabd            v28.16b, v19.16b, v17.16b   // abs(q1 - q0)
+    cmhi            v26.16b, v30.16b, v26.16b   // < alpha
+    cmhi            v27.16b, v31.16b, v27.16b   // < beta
+    cmhi            v28.16b, v31.16b, v28.16b   // < beta
+    and             v26.16b, v26.16b, v27.16b
+    and             v26.16b, v26.16b, v28.16b   // v26 = per-byte mask, set where all three tests pass
+// Widened to 16 bit: v20/v21 = p0 + q1 + 2 x p1 and v22/v23 = q0 + p1 + 2 x q1 (low/high halves).
+    ushll           v4.8h,   v18.8b,  #1        // 2 x p1, low half
+    ushll           v6.8h,   v19.8b,  #1        // 2 x q1, low half
+.ifc \width, 16
+    ushll2          v5.8h,   v18.16b, #1
+    ushll2          v7.8h,   v19.16b, #1
+    uaddl2          v21.8h,  v16.16b, v19.16b
+    uaddl2          v23.8h,  v17.16b, v18.16b
+.endif
+    uaddl           v20.8h,  v16.8b,  v19.8b    // p0 + q1, low half
+    uaddl           v22.8h,  v17.8b,  v18.8b    // q0 + p1, low half
+    add             v20.8h,  v20.8h,  v4.8h     // mlal?
+    add             v22.8h,  v22.8h,  v6.8h
+.ifc \width, 16
+    add             v21.8h,  v21.8h,  v5.8h
+    add             v23.8h,  v23.8h,  v7.8h
+.endif
+    uqrshrn         v24.8b,  v20.8h,  #2        // new p0 = (p0 + q1 + 2 x p1 + 2) >> 2
+    uqrshrn         v25.8b,  v22.8h,  #2        // new q0 = (q0 + p1 + 2 x q1 + 2) >> 2
+.ifc \width, 16
+    uqrshrn2        v24.16b, v21.8h,  #2
+    uqrshrn2        v25.16b, v23.8h,  #2
+.endif
+    bit             v16.16b, v24.16b, v26.16b   // take the new p0 only where the mask is set
+    bit             v17.16b, v25.16b, v26.16b   // take the new q0 only where the mask is set
+.endm
+// Filters 16 bytes of the two rows adjacent to pix, using the rows at pix - 2 x stride .. pix + stride as input.
+function x264_deblock_v_chroma_intra_neon, export=1
+    h264_loop_filter_start_intra
+// Load rows pix - 2 x stride, pix - stride, pix, pix + stride as p1, p0, q0, q1.
+    sub             x0,  x0,  x1, lsl #1
+    ld1             {v18.16b}, [x0], x1
+    ld1             {v16.16b}, [x0], x1
+    ld1             {v17.16b}, [x0], x1
+    ld1             {v19.16b}, [x0]
+
+    h264_loop_filter_chroma_intra
+// x0 still points at pix + stride; step back to pix - stride and write p0 and q0 back.
+    sub             x0,  x0,  x1, lsl #1
+    st1             {v16.16b}, [x0], x1
+    st1             {v17.16b}, [x0], x1
+
+    ret
+endfunc
+// 4-row variant: filters the bytes left and right of pix within each row (registered as deblock_chroma_420_intra_mbaff in deblock.c).
+function x264_deblock_h_chroma_intra_mbaff_neon, export=1
+    h264_loop_filter_start_intra
+// x4 reads 8 bytes per row starting at pix - 4; x0 is used later to write 4 bytes per row starting at pix - 2.
+    sub             x4,  x0,  #4
+    sub             x0,  x0,  #2
+    ld1             {v18.8b}, [x4], x1
+    ld1             {v16.8b}, [x4], x1
+    ld1             {v17.8b}, [x4], x1
+    ld1             {v19.8b}, [x4], x1
+// NOTE(review): presumably the transpose leaves the p1, p0, q0, q1 columns in v18, v16, v17, v19 with v26-v29 as scratch - confirm against the macro.
+    transpose4x4.h  v18, v16, v17, v19, v26, v27, v28, v29
+
+    h264_loop_filter_chroma_intra, width=8
+// Each st2 writes 16-bit lane i of v16 (p0) and v17 (q0) side by side, then advances x0 by the stride.
+    st2             {v16.h,v17.h}[0], [x0], x1
+    st2             {v16.h,v17.h}[1], [x0], x1
+    st2             {v16.h,v17.h}[2], [x0], x1
+    st2             {v16.h,v17.h}[3], [x0], x1
+
+    ret
+endfunc
+// 8-row variant of the row-wise filter (registered as deblock_h_chroma_420_intra in deblock.c).
+function x264_deblock_h_chroma_intra_neon, export=1
+    h264_loop_filter_start_intra
+// Rows 0-3 fill the low 64-bit lanes and rows 4-7 the high lanes; reads start at pix - 4, writes at pix - 2.
+    sub             x4,  x0,  #4
+    sub             x0,  x0,  #2
+    ld1             {v18.d}[0], [x4], x1
+    ld1             {v16.d}[0], [x4], x1
+    ld1             {v17.d}[0], [x4], x1
+    ld1             {v19.d}[0], [x4], x1
+    ld1             {v18.d}[1], [x4], x1
+    ld1             {v16.d}[1], [x4], x1
+    ld1             {v17.d}[1], [x4], x1
+    ld1             {v19.d}[1], [x4], x1
+
+    transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29
+
+    h264_loop_filter_chroma_intra
+// Write lanes 0-7 of p0/q0 back, one row per st2, advancing x0 by the stride each time.
+    st2             {v16.h,v17.h}[0], [x0], x1
+    st2             {v16.h,v17.h}[1], [x0], x1
+    st2             {v16.h,v17.h}[2], [x0], x1
+    st2             {v16.h,v17.h}[3], [x0], x1
+    st2             {v16.h,v17.h}[4], [x0], x1
+    st2             {v16.h,v17.h}[5], [x0], x1
+    st2             {v16.h,v17.h}[6], [x0], x1
+    st2             {v16.h,v17.h}[7], [x0], x1
+
+    ret
+endfunc
+// 16-row variant: runs the 8-row load/filter/store sequence twice (registered as deblock_h_chroma_422_intra in deblock.c).
+function x264_deblock_h_chroma_422_intra_neon, export=1
+    h264_loop_filter_start_intra
+// First batch, rows 0-7: reads start at pix - 4, writes at pix - 2.
+    sub             x4,  x0,  #4
+    sub             x0,  x0,  #2
+    ld1             {v18.d}[0], [x4], x1
+    ld1             {v16.d}[0], [x4], x1
+    ld1             {v17.d}[0], [x4], x1
+    ld1             {v19.d}[0], [x4], x1
+    ld1             {v18.d}[1], [x4], x1
+    ld1             {v16.d}[1], [x4], x1
+    ld1             {v17.d}[1], [x4], x1
+    ld1             {v19.d}[1], [x4], x1
+
+    transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29
+
+    h264_loop_filter_chroma_intra
+
+    st2             {v16.h,v17.h}[0], [x0], x1
+    st2             {v16.h,v17.h}[1], [x0], x1
+    st2             {v16.h,v17.h}[2], [x0], x1
+    st2             {v16.h,v17.h}[3], [x0], x1
+    st2             {v16.h,v17.h}[4], [x0], x1
+    st2             {v16.h,v17.h}[5], [x0], x1
+    st2             {v16.h,v17.h}[6], [x0], x1
+    st2             {v16.h,v17.h}[7], [x0], x1
+// Second batch, rows 8-15: x4 and x0 were post-incremented past the first eight rows, so no pointer reset is needed.
+    ld1             {v18.d}[0], [x4], x1
+    ld1             {v16.d}[0], [x4], x1
+    ld1             {v17.d}[0], [x4], x1
+    ld1             {v19.d}[0], [x4], x1
+    ld1             {v18.d}[1], [x4], x1
+    ld1             {v16.d}[1], [x4], x1
+    ld1             {v17.d}[1], [x4], x1
+    ld1             {v19.d}[1], [x4], x1
+
+    transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29
+
+    h264_loop_filter_chroma_intra
+
+    st2             {v16.h,v17.h}[0], [x0], x1
+    st2             {v16.h,v17.h}[1], [x0], x1
+    st2             {v16.h,v17.h}[2], [x0], x1
+    st2             {v16.h,v17.h}[3], [x0], x1
+    st2             {v16.h,v17.h}[4], [x0], x1
+    st2             {v16.h,v17.h}[5], [x0], x1
+    st2             {v16.h,v17.h}[6], [x0], x1
+    st2             {v16.h,v17.h}[7], [x0], x1
+
+    ret
+endfunc
 
 //static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE],
 // int8_t ref[2][X264_SCAN8_LUMA_SIZE],
diff --git a/common/deblock.c b/common/deblock.c
index 51f7782b..101d0bbd 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -737,6 +737,12 @@ void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int b
 void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                  int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
                                  int mvy_limit, int bframe );
+#if ARCH_AARCH64 // prototypes for the routines this commit adds to common/aarch64/deblock-a.S
+void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); // 4 rows per call
+void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); // 8 rows per call
+void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); // 16 rows per call
+void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); // 16 bytes across the rows around pix
+#endif
 #endif
 
 void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
@@ -845,6 +851,12 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
         pf->deblock_luma[0] = x264_deblock_h_luma_neon;
         pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
         pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
+#if ARCH_AARCH64 // the intra chroma NEON routines are only declared under this same guard
+        pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_neon;
+        pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_neon;
+        pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon;
+        pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_neon;
+#endif
         pf->deblock_strength = x264_deblock_strength_neon;
     }
 #endif