From f8f8d13d5978b13fc831e041e52aa617550bbdf3 Mon Sep 17 00:00:00 2001
From: Janne Grunau
Date: Thu, 7 Aug 2014 16:49:12 +0200
Subject: [PATCH] aarch64: NEON asm for 4x16 sad, satd and ssd

pixel_sad_4x16_neon: 33% faster than C
pixel_satd_4x16_neon: 5 times faster
pixel_ssd_4x16_neon: 4 times faster
---
 common/aarch64/pixel-a.S | 57 ++++++++++++++++++++++++++++++++++++++++
 common/aarch64/pixel.h   |  1 +
 common/pixel.c           |  8 +++---
 3 files changed, 62 insertions(+), 4 deletions(-)

diff --git a/common/aarch64/pixel-a.S b/common/aarch64/pixel-a.S
index d2c3de65..92912edd 100644
--- a/common/aarch64/pixel-a.S
+++ b/common/aarch64/pixel-a.S
@@ -114,6 +114,7 @@ endfunc
 
 SAD_FUNC 4, 4
 SAD_FUNC 4, 8
+SAD_FUNC 4, 16
 SAD_FUNC 8, 4
 SAD_FUNC 8, 8
 SAD_FUNC 8, 16
@@ -367,6 +368,7 @@ endfunc
 
 SSD_FUNC 4, 4
 SSD_FUNC 4, 8
+SSD_FUNC 4, 16
 SSD_FUNC 8, 4
 SSD_FUNC 8, 8
 SSD_FUNC 8, 16
@@ -895,6 +897,61 @@ function x264_satd_16x4_neon
 
     b           x264_satd_8x4v_8x8h_neon
 endfunc
+function x264_pixel_satd_4x16_neon, export=1
+    mov         x4, x30
+    ld1         {v1.s}[0], [x2], x3
+    ld1         {v0.s}[0], [x0], x1
+    ld1         {v3.s}[0], [x2], x3
+    ld1         {v2.s}[0], [x0], x1
+    ld1         {v5.s}[0], [x2], x3
+    ld1         {v4.s}[0], [x0], x1
+    ld1         {v7.s}[0], [x2], x3
+    ld1         {v6.s}[0], [x0], x1
+    ld1         {v1.s}[1], [x2], x3
+    ld1         {v0.s}[1], [x0], x1
+    ld1         {v3.s}[1], [x2], x3
+    ld1         {v2.s}[1], [x0], x1
+    ld1         {v5.s}[1], [x2], x3
+    ld1         {v4.s}[1], [x0], x1
+    ld1         {v7.s}[1], [x2], x3
+    ld1         {v6.s}[1], [x0], x1
+    usubl       v16.8h, v0.8b, v1.8b
+    usubl       v17.8h, v2.8b, v3.8b
+    usubl       v18.8h, v4.8b, v5.8b
+    usubl       v19.8h, v6.8b, v7.8b
+    ld1         {v1.s}[0], [x2], x3
+    ld1         {v0.s}[0], [x0], x1
+    ld1         {v3.s}[0], [x2], x3
+    ld1         {v2.s}[0], [x0], x1
+    ld1         {v5.s}[0], [x2], x3
+    ld1         {v4.s}[0], [x0], x1
+    ld1         {v7.s}[0], [x2], x3
+    ld1         {v6.s}[0], [x0], x1
+    ld1         {v1.s}[1], [x2], x3
+    ld1         {v0.s}[1], [x0], x1
+    ld1         {v3.s}[1], [x2], x3
+    ld1         {v2.s}[1], [x0], x1
+    ld1         {v5.s}[1], [x2], x3
+    ld1         {v4.s}[1], [x0], x1
+    ld1         {v7.s}[1], [x2], x3
+    ld1         {v6.s}[1], [x0], x1
+    usubl       v20.8h, v0.8b, v1.8b
+    usubl       v21.8h, v2.8b, v3.8b
+    usubl       v22.8h, v4.8b, v5.8b
+    usubl       v23.8h, v6.8b, v7.8b
+
+    SUMSUB_AB   v0.8h, v1.8h, v16.8h, v17.8h
+    SUMSUB_AB   v2.8h, v3.8h, v18.8h, v19.8h
+
+    bl          x264_satd_8x4v_8x8h_neon
+
+    add         v30.8h, v0.8h, v1.8h
+    add         v31.8h, v2.8h, v3.8h
+    add         v0.8h, v30.8h, v31.8h
+    uaddlv      s0, v0.8h
+    mov         w0, v0.s[0]
+    ret         x4
+endfunc
 
 function x264_pixel_sa8d_8x8_neon, export=1
     mov         x4, x30
diff --git a/common/aarch64/pixel.h b/common/aarch64/pixel.h
index c7c386ae..7d519644 100644
--- a/common/aarch64/pixel.h
+++ b/common/aarch64/pixel.h
@@ -33,6 +33,7 @@
     ret x264_pixel_##name##_8x16_##suffix args;\
     ret x264_pixel_##name##_8x8_##suffix args;\
     ret x264_pixel_##name##_8x4_##suffix args;\
+    ret x264_pixel_##name##_4x16_##suffix args;\
     ret x264_pixel_##name##_4x8_##suffix args;\
     ret x264_pixel_##name##_4x4_##suffix args;\
 
diff --git a/common/pixel.c b/common/pixel.c
index bb1894a0..6bdbbca3 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1409,13 +1409,13 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
 #if ARCH_AARCH64
     if( cpu&X264_CPU_NEON )
     {
-        INIT7( sad, _neon );
+        INIT8( sad, _neon );
         // AArch64 has no distinct instructions for aligned load/store
-        INIT7_NAME( sad_aligned, sad, _neon );
+        INIT8_NAME( sad_aligned, sad, _neon );
         INIT7( sad_x3, _neon );
         INIT7( sad_x4, _neon );
-        INIT7( ssd, _neon );
-        INIT7( satd, _neon );
+        INIT8( ssd, _neon );
+        INIT8( satd, _neon );
         INIT7( satd_x3, _neon );
         INIT7( satd_x4, _neon );
         INIT4( hadamard_ac, _neon );
-- 
2.40.0
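
Note (not part of the patch): the plain-C sketch below illustrates the value the new satd_4x16 routine computes, assuming x264's usual SATD definition: the residual of each of the four vertically stacked 4x4 blocks is put through a 4x4 Hadamard transform, the absolute transform coefficients are summed, and each 4x4 sum is halved. The names satd_4x4_ref and satd_4x16_ref are placeholders for illustration, not x264's actual C reference implementation.

#include <stdint.h>
#include <stdlib.h>

/* SATD of one 4x4 block: sum of absolute 2-D Hadamard coefficients of the
 * residual, halved as in x264's SATD convention. */
static int satd_4x4_ref( const uint8_t *pix1, intptr_t stride1,
                         const uint8_t *pix2, intptr_t stride2 )
{
    int d[4][4], t[4][4];
    int sum = 0;

    /* residual between the two 4x4 blocks */
    for( int y = 0; y < 4; y++ )
        for( int x = 0; x < 4; x++ )
            d[y][x] = pix1[y*stride1 + x] - pix2[y*stride2 + x];

    /* horizontal 4-point Hadamard butterfly per row */
    for( int y = 0; y < 4; y++ )
    {
        int s01 = d[y][0] + d[y][1], d01 = d[y][0] - d[y][1];
        int s23 = d[y][2] + d[y][3], d23 = d[y][2] - d[y][3];
        t[y][0] = s01 + s23; t[y][1] = s01 - s23;
        t[y][2] = d01 + d23; t[y][3] = d01 - d23;
    }

    /* vertical 4-point Hadamard per column, accumulating absolute values */
    for( int x = 0; x < 4; x++ )
    {
        int s01 = t[0][x] + t[1][x], d01 = t[0][x] - t[1][x];
        int s23 = t[2][x] + t[3][x], d23 = t[2][x] - t[3][x];
        sum += abs(s01 + s23) + abs(s01 - s23) + abs(d01 + d23) + abs(d01 - d23);
    }
    return sum >> 1;
}

/* SATD of a 4x16 block: sum over four vertically stacked 4x4 blocks. */
static int satd_4x16_ref( const uint8_t *pix1, intptr_t stride1,
                          const uint8_t *pix2, intptr_t stride2 )
{
    int sum = 0;
    for( int i = 0; i < 4; i++ )
        sum += satd_4x4_ref( pix1 + 4*i*stride1, stride1,
                             pix2 + 4*i*stride2, stride2 );
    return sum;
}

The patch's NEON routine arrives at the same sum by packing pairs of 4-pixel rows into vector registers, reusing the shared x264_satd_8x4v_8x8h_neon transform helper, and reducing the per-lane results with uaddlv.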