From c570be3ea9f24942c362e1c2402ec7fccbb5c330 Mon Sep 17 00:00:00 2001
From: Janne Grunau
Date: Sat, 15 Mar 2014 13:29:41 +0100
Subject: [PATCH] arm: implement deblock_strength_neon

Based on deblock_strength_avx.

checkasm --bench on a cortex-a9:
deblock_strength_c: 14611
deblock_strength_neon: 1848
---
 common/arm/deblock-a.S | 106 +++++++++++++++++++++++++++++++++++++++++
 common/deblock.c       |   4 ++
 2 files changed, 110 insertions(+)

diff --git a/common/arm/deblock-a.S b/common/arm/deblock-a.S
index 7cfecb7e..21f44a7b 100644
--- a/common/arm/deblock-a.S
+++ b/common/arm/deblock-a.S
@@ -304,3 +304,109 @@ function x264_deblock_h_chroma_neon
 
     bx              lr
 .endfunc
+
+function x264_deblock_strength_neon     @ fills bs (r3) from nnz (r0), ref (r1) and mv (r2)
+    ldr             ip,  [sp]           @ ip = mvy_limit (first stack argument)
+    vmov.i8         q8,  #0             @ q8 accumulates the flags that are stored to bs[0] at the end
+    lsl             ip,  ip,  #8
+    add             r3,  r3,  #32       @ r3 = bs + 32, the address of the bs[1] store below
+    sub             ip,  ip,  #(1<<8)-3 @ ip = ((mvy_limit - 1) << 8) + 3
+    vmov.i8         q9,  #0             @ q9 accumulates the flags that are stored to bs[1] at the end
+    vdup.16         q10, ip             @ thresholds per 16-bit lane: low byte 3, high byte mvy_limit - 1
+    ldr             ip,  [sp, #4]       @ ip = bframe (second stack argument), used as the loop counter
+
+lists:
+    @ load bytes ref
+    vld1.8          {d31}, [r1]!
+    add             r2,  r2,  #16       @ advance r2 by 16 bytes before the mv loads of this pass
+    vld1.8          {q1},  [r1]!
+    vmov.i8         q0,  #0
+    vld1.8          {q2},  [r1]!
+    vext.8          q3,  q0,  q1,  #15
+    vext.8          q0,  q0,  q2,  #15
+    vuzp.32         q1,  q2
+    vuzp.32         q3,  q0
+    vext.8          q1,  q15, q2,  #12
+
+    veor            q0,  q0,  q2        @ xor: nonzero bytes where the two compared ref bytes differ
+    veor            q1,  q1,  q2
+    vorr            q8,  q8,  q0
+    vorr            q9,  q9,  q1
+
+    vld1.16         {q11}, [r2,:128]!   @ mv + 0x10
+    vld1.16         {q3},  [r2,:128]!   @ mv + 0x20
+    vld1.16         {q12}, [r2,:128]!   @ mv + 0x30
+    vld1.16         {q2},  [r2,:128]!   @ mv + 0x40
+    vld1.16         {q13}, [r2,:128]!   @ mv + 0x50
+    vext.8          q3,  q3,  q12, #12  @ q3 = q12 shifted by one 4-byte element, filled from the end of the previous load
+    vext.8          q2,  q2,  q13, #12
+    vabd.s16        q0,  q12, q3        @ absolute difference of each 16-bit component against the shifted vector
+    vld1.16         {q3},  [r2,:128]!   @ mv + 0x60
+    vabd.s16        q1,  q13, q2
+    vld1.16         {q14}, [r2,:128]!   @ mv + 0x70
+    vqmovn.u16      d0,  q0             @ narrow the differences to bytes with unsigned saturation
+    vld1.16         {q2},  [r2,:128]!   @ mv + 0x80
+    vld1.16         {q15}, [r2,:128]!   @ mv + 0x90
+    vqmovn.u16      d1,  q1
+    vext.8          q3,  q3,  q14, #12
+    vext.8          q2,  q2,  q15, #12
+    vabd.s16        q3,  q14, q3
+    vabd.s16        q2,  q15, q2
+    vqmovn.u16      d2,  q3
+    vqmovn.u16      d3,  q2
+
+    vqsub.u8        q0,  q0,  q10       @ saturating subtract of the thresholds: nonzero only where a difference exceeds its threshold
+    vqsub.u8        q1,  q1,  q10
+    vqmovn.u16      d0,  q0             @ merge each byte pair: result is nonzero if either byte of the pair is nonzero
+    vqmovn.u16      d1,  q1
+
+    vabd.s16        q1,  q12, q13
+    vorr            q8,  q8,  q0
+
+    vabd.s16        q0,  q11, q12       @ same test, now between consecutively loaded vectors q11..q15
+    vabd.s16        q2,  q13, q14
+    vabd.s16        q3,  q14, q15
+    vqmovn.u16      d0,  q0
+    vqmovn.u16      d1,  q1
+    vqmovn.u16      d2,  q2
+    vqmovn.u16      d3,  q3
+
+    vqsub.u8        q0,  q0,  q10
+    vqsub.u8        q1,  q1,  q10
+    vqmovn.u16      d0,  q0
+    vqmovn.u16      d1,  q1
+    subs            ip,  ip,  #1
+    vorr            q9,  q9,  q0
+    beq             lists               @ repeat once more only if the decrement reached zero, i.e. bframe was 1
+
+    mov             ip,  #-32           @ post-index step that moves r3 from bs + 32 back to bs
+    @ load bytes nnz
+    vld1.8          {d31}, [r0]!
+    vld1.8          {q1},  [r0]!
+    vmov.i8         q0,  #0
+    vld1.8          {q2},  [r0]
+    vext.8          q3,  q0,  q1,  #15
+    vext.8          q0,  q0,  q2,  #15
+    vuzp.32         q1,  q2
+    vuzp.32         q3,  q0
+    vext.8          q1,  q15, q2,  #12
+
+    vorr            q0,  q0,  q2        @ or: nonzero where either of the two combined nnz bytes is nonzero
+    vorr            q1,  q1,  q2
+    vmov.u8         q10, #1
+    vmin.u8         q0,  q0,  q10
+    vmin.u8         q1,  q1,  q10
+    vmin.u8         q8,  q8,  q10       @ mv ? 1 : 0
+    vmin.u8         q9,  q9,  q10
+    vadd.u8         q0,  q0,  q0        @ nnz ? 2 : 0
+    vadd.u8         q1,  q1,  q1
+    vmax.u8         q8,  q8,  q0        @ final value per byte is the larger of the mv/ref flag and the nnz value
+    vmax.u8         q9,  q9,  q1
+    vzip.16         d16, d17            @ NOTE(review): lane shuffle of q8 before the bs[0] store, presumably a transpose - confirm
+    vst1.8          {q9}, [r3,:128], ip @ bs[1]
+    vtrn.8          d16, d17
+    vtrn.32         d16, d17
+
+    vst1.8          {q8}, [r3,:128]     @ bs[0]
+    bx              lr
+.endfunc
diff --git a/common/deblock.c b/common/deblock.c
index e6af417f..6b369f2a 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -734,6 +734,9 @@ void x264_deblock_v_luma_neon  ( uint8_t *pix, intptr_t stride, int alpha, int b
 void x264_deblock_h_luma_neon  ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+                                 int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+                                 int mvy_limit, int bframe );
 #endif
 
 void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
@@ -842,6 +845,7 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
         pf->deblock_luma[0] = x264_deblock_h_luma_neon;
         pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
         pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
+        pf->deblock_strength = x264_deblock_strength_neon;
     }
 #endif
 #endif // !HIGH_BIT_DEPTH
-- 
2.40.0