From: Martin Storsjö
Date: Thu, 13 Aug 2015 20:59:28 +0000 (+0300)
Subject: arm: Optimize x264_deblock_h_chroma_neon
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=89439b2c604c81e13eb3da9e692d2cdae5a18b53;p=libx264

arm: Optimize x264_deblock_h_chroma_neon

Shuffle both chroma components together as a 16 bit unit, and
don't write the unchanged columns (like in x264_deblock_h_luma_neon
and in the aarch64 version of the function).

This causes a minor slowdown for x264_deblock_v_chroma_neon, but it
is negligible compared to the speedup.

checkasm timing              Cortex-A7   A8      A9
deblock_chroma[1]_c          4817        4057    3601
deblock_chroma[1]_neon       1249        716     817   (before)
deblock_chroma[1]_neon       1249        766     845   (after)
deblock_h_chroma_420_c       3699        3275    2830
deblock_h_chroma_420_neon    2068        1414    1400  (before)
deblock_h_chroma_420_neon    1838        1355    1291  (after)
---

diff --git a/common/arm/deblock-a.S b/common/arm/deblock-a.S
index 079c654d..446e6780 100644
--- a/common/arm/deblock-a.S
+++ b/common/arm/deblock-a.S
@@ -205,11 +205,13 @@ endfunc
     vshl.i16        q2,  q2,  #2
     vshl.i16        q3,  q3,  #2
     vabd.u8         q15, q1,  q0   // abs(q1 - q0)
+    vmovl.u8        q12, d24
     vaddw.u8        q2,  q2,  d18
     vaddw.u8        q3,  q3,  d19
     vclt.u8         q13, q13, q11  // < alpha
     vsubw.u8        q2,  q2,  d2
     vsubw.u8        q3,  q3,  d3
+    vsli.16         q12, q12, #8
     vdup.8          q11, r3        // beta
     vclt.s8         q10, q12, #0
     vrshrn.i16      d4,  q2,  #3
@@ -241,16 +243,16 @@ function x264_deblock_v_chroma_neon
     h264_loop_filter_start
 
     sub             r0,  r0,  r1, lsl #1
-    vld2.8          {d18,d19}, [r0,:128], r1
-    vld2.8          {d16,d17}, [r0,:128], r1
-    vld2.8          {d0, d1}, [r0,:128], r1
-    vld2.8          {d2, d3}, [r0,:128]
+    vld1.8          {d18,d19}, [r0,:128], r1
+    vld1.8          {d16,d17}, [r0,:128], r1
+    vld1.8          {d0, d1}, [r0,:128], r1
+    vld1.8          {d2, d3}, [r0,:128]
 
     h264_loop_filter_chroma
 
     sub             r0,  r0,  r1, lsl #1
-    vst2.8          {d16,d17}, [r0,:128], r1
-    vst2.8          {d0, d1}, [r0,:128], r1
+    vst1.8          {d16,d17}, [r0,:128], r1
+    vst1.8          {d0, d1}, [r0,:128], r1
 
     bx              lr
 endfunc
@@ -268,37 +270,22 @@ function x264_deblock_h_chroma_neon
     vld1.8          {d1}, [r0], r1
     vld1.8          {d3}, [r0], r1
 
-    vuzp.8          d18, d19
-    vuzp.8          d16, d17
-    vuzp.8          d0, d1
-    vuzp.8          d2, d3
-
-    vtrn.16         q9, q0
-    vtrn.16         q8, q1
-    vtrn.8          q9, q8
-    vtrn.8          q0, q1
+    TRANSPOSE4x4_16 q9, q8, q0, q1
 
     h264_loop_filter_chroma
 
-    vtrn.16         q9, q0
-    vtrn.16         q8, q1
-    vtrn.8          q9, q8
-    vtrn.8          q0, q1
-
-    vzip.8          d18, d19
-    vzip.8          d16, d17
-    vzip.8          d0, d1
-    vzip.8          d2, d3
+    vtrn.16         q8, q0
 
     sub             r0, r0, r1, lsl #3
-    vst1.8          {d18}, [r0], r1
-    vst1.8          {d16}, [r0], r1
-    vst1.8          {d0}, [r0], r1
-    vst1.8          {d2}, [r0], r1
-    vst1.8          {d19}, [r0], r1
-    vst1.8          {d17}, [r0], r1
-    vst1.8          {d1}, [r0], r1
-    vst1.8          {d3}, [r0], r1
+    add             r0, r0, #2
+    vst1.32         {d16[0]}, [r0], r1
+    vst1.32         {d0[0]}, [r0], r1
+    vst1.32         {d16[1]}, [r0], r1
+    vst1.32         {d0[1]}, [r0], r1
+    vst1.32         {d17[0]}, [r0], r1
+    vst1.32         {d1[0]}, [r0], r1
+    vst1.32         {d17[1]}, [r0], r1
+    vst1.32         {d1[1]}, [r0], r1
 
     bx              lr
 endfunc