From 44cb1dcdbdaafeddd98d2ebe3d02408bc380713e Mon Sep 17 00:00:00 2001
From: Janne Grunau
Date: Mon, 13 Oct 2014 12:43:50 +0200
Subject: [PATCH] aarch64: x264_deblock_h_chroma_mbaff_neon

deblock_chroma_420_mbaff_neon 2 times faster
---
 common/aarch64/deblock-a.S | 54 ++++++++++++++++++++++++++++++++++++++
 common/deblock.c           |  2 ++
 2 files changed, 56 insertions(+)

diff --git a/common/aarch64/deblock-a.S b/common/aarch64/deblock-a.S
index 9bcd6ade..9618665d 100644
--- a/common/aarch64/deblock-a.S
+++ b/common/aarch64/deblock-a.S
@@ -275,6 +275,60 @@ function x264_deblock_h_chroma_neon, export=1
     ret
 endfunc
 
+.macro h264_loop_filter_chroma8 // in: v18=p1 v16=p0 v17=q0 v19=q1 (8 bytes each), w2=alpha, w3=beta, v24=clip limit; out: v16=p0', v17=q0'
+    dup             v22.8b, w2 // alpha
+    uxtl            v24.8h, v24.8b          // widen v24 to 16 bit (v24 is set up by the caller; presumably tc - confirm)
+    uabd            v26.8b, v16.8b, v17.8b  // abs(p0 - q0)
+    uxtl            v4.8h,  v17.8b          // v4 = q0, widened to 16 bit
+    uabd            v28.8b, v18.8b, v16.8b  // abs(p1 - p0)
+    usubw           v4.8h,  v4.8h,  v16.8b  // v4 = q0 - p0
+    sli             v24.8h, v24.8h, #8      // copy each low byte into the high byte of its halfword: one limit per byte pair
+    shl             v4.8h,  v4.8h,  #2      // v4 = (q0 - p0) * 4
+    uabd            v30.8b, v19.8b, v17.8b  // abs(q1 - q0)
+    uaddw           v4.8h,  v4.8h,  v18.8b  // v4 += p1
+    cmhi            v26.8b, v22.8b, v26.8b  // < alpha
+    usubw           v4.8h,  v4.8h,  v19.8b  // v4 -= q1
+    dup             v22.8b, w3              // beta
+    rshrn           v4.8b,  v4.8h,  #3      // delta = (v4 + 4) >> 3, narrowed to 8 bit
+    cmhi            v28.8b, v22.8b, v28.8b  // < beta
+    cmhi            v30.8b, v22.8b, v30.8b  // < beta
+    smin            v4.8b,  v4.8b,  v24.8b  // delta = min(delta, v24), signed
+    neg             v25.8b, v24.8b          // v25 = -v24
+    and             v26.8b, v26.8b, v28.8b  // mask = (|p0-q0| < alpha) & (|p1-p0| < beta)
+    smax            v4.8b,  v4.8b,  v25.8b  // delta = max(delta, -v24), signed
+    and             v26.8b, v26.8b, v30.8b  // mask &= (|q1-q0| < beta)
+    uxtl            v22.8h, v17.8b          // v22 = q0, widened to 16 bit
+    and             v4.8b,  v4.8b,  v26.8b  // delta = 0 wherever one of the three tests failed
+    uxtl            v28.8h, v16.8b          // v28 = p0, widened to 16 bit
+    saddw           v28.8h, v28.8h, v4.8b   // p0 + delta
+    ssubw           v22.8h, v22.8h, v4.8b   // q0 - delta
+    sqxtun          v16.8b, v28.8h          // p0' = result saturated to unsigned 8 bit
+    sqxtun          v17.8b, v22.8h          // q0' = result saturated to unsigned 8 bit
+.endm
+
+function x264_deblock_h_chroma_mbaff_neon, export=1 // C prototype (common/deblock.c): (uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0)
+    h264_loop_filter_start                  // macro defined outside this hunk; presumably loads tc0 into v24 and may return early - confirm
+
+    sub             x4,  x0,  #4            // x4 = x0 - 4: load address (x4 is overwritten here)
+    sub             x0,  x0,  #2            // x0 = x0 - 2: store address
+
+    ld1            {v18.8b}, [x4], x1       // row 0: 8 bytes, then advance x4 by x1
+    ld1            {v16.8b}, [x4], x1       // row 1
+    ld1            {v17.8b}, [x4], x1       // row 2
+    ld1            {v19.8b}, [x4]           // row 3 (no post-increment)
+
+    transpose4x4.h  v18, v16, v17, v19, v28, v29, v30, v31 // 4x4 transpose of 16-bit elements (macro defined elsewhere; v28-v31 presumably scratch): v18/v16/v17/v19 are then used as p1/p0/q0/q1
+
+    h264_loop_filter_chroma8                // filters p0/q0 in place in v16/v17
+
+    st2            {v16.h,v17.h}[0], [x0], x1 // row 0: store halfword lane 0 of v16 then v17 (4 bytes), advance x0 by x1
+    st2            {v16.h,v17.h}[1], [x0], x1 // row 1
+    st2            {v16.h,v17.h}[2], [x0], x1 // row 2
+    st2            {v16.h,v17.h}[3], [x0]     // row 3 (no post-increment)
+
+    ret
+endfunc
+
 .macro h264_loop_filter_start_intra
     orr             w4,  w2,  w3
     cmp             w4,  #0
diff --git a/common/deblock.c b/common/deblock.c
index 101d0bbd..b0b8d2b6 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -738,6 +738,7 @@ void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X26
                                  int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
                                  int mvy_limit, int bframe );
 #if ARCH_AARCH64
+void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
 void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
 void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
@@ -852,6 +853,7 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
         pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
         pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
 #if ARCH_AARCH64
+        pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_neon;
         pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_neon;
         pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_neon;
         pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon;
-- 
2.40.0