From 90f0b5c1c881f345c9da15bc482055f2a92f8ceb Mon Sep 17 00:00:00 2001
From: Janne Grunau
Date: Thu, 7 Aug 2014 19:46:07 +0200
Subject: [PATCH] aarch64: implement x264_pixel_asd8_neon

7 times faster than C.
---
/*
 * Review notes on x264_pixel_asd8_neon (added to common/aarch64/pixel-a.S).
 * This text sits in the free-form notes area after the "---" separator; it
 * is neither part of the commit message nor part of any hunk below.
 *
 * What the routine does, as written:
 *   - Every ld1 reads 8 bytes from [x0] or [x2] and then post-increments
 *     that pointer by x1 or x3 respectively, i.e. one 8-byte row per load.
 *   - usubl widens each byte pair to 16 bits and subtracts the [x2] row
 *     from the [x0] row; the differences are accumulated lane-wise in
 *     v16.8h.
 *   - saddlv adds the eight 16-bit lanes as signed values into s0, abs
 *     takes the absolute value, and fmov moves the result into w0.
 *   The returned value is therefore the absolute value of the sum of all
 *   (a - b) byte differences over the rows that were read.
 *
 * Loop structure:
 *   - Two rows are loaded before the loop and w4 is pre-decremented by 2.
 *   - Each iteration loads two more rows and decrements w4 by 2; the loop
 *     exits once the counter is no longer positive. The difference of the
 *     last row loaded is added after the loop.
 *   - The loop body always runs at least once, so at least four rows are
 *     read. For an even w4 >= 4 exactly w4 rows are read; an odd w4 reads
 *     one row more than w4.
 *
 * Accumulator range: each difference lies in [-255, 255], so the 16-bit
 * lanes of v16 cannot overflow for up to 128 rows.
 *
 * NOTE(review): presumably x0/x1 and x2/x3 are the two pixel pointers with
 * their strides and w4 is the block height, matching the five-argument
 * prototype added to pixel.h -- confirm against the C reference.
 * NOTE(review): this copy of the patch had its whitespace collapsed. Line
 * breaks follow the hunk line counts; indentation and column alignment
 * inside lines were restored by hand. Verify the context lines against the
 * target tree before applying.
 */
 common/aarch64/pixel-a.S | 26 ++++++++++++++++++++++++++
 common/aarch64/pixel.h   |  2 ++
 common/pixel.c           |  1 +
 3 files changed, 29 insertions(+)

diff --git a/common/aarch64/pixel-a.S b/common/aarch64/pixel-a.S
index 92912edd..83ef1249 100644
--- a/common/aarch64/pixel-a.S
+++ b/common/aarch64/pixel-a.S
@@ -273,6 +273,32 @@ function x264_pixel_vsad_neon, export=1
     ret
 endfunc
 
+function x264_pixel_asd8_neon, export=1
+    sub         w4,  w4,  #2
+    ld1        {v0.8b}, [x0], x1
+    ld1        {v1.8b}, [x2], x3
+    ld1        {v2.8b}, [x0], x1
+    ld1        {v3.8b}, [x2], x3
+    usubl       v16.8h, v0.8b,  v1.8b
+1:
+    subs        w4,  w4,  #2
+    ld1        {v4.8b}, [x0], x1
+    ld1        {v5.8b}, [x2], x3
+    usubl       v17.8h, v2.8b,  v3.8b
+    usubl       v18.8h, v4.8b,  v5.8b
+    add         v16.8h, v16.8h, v17.8h
+    ld1        {v2.8b}, [x0], x1
+    ld1        {v3.8b}, [x2], x3
+    add         v16.8h, v16.8h, v18.8h
+    b.gt        1b
+    usubl       v17.8h, v2.8b,  v3.8b
+    add         v16.8h, v16.8h, v17.8h
+    saddlv      s0,  v16.8h
+    abs         v0.2s,  v0.2s
+    fmov        w0,  s0
+    ret
+endfunc
+
 .macro SSD_START_4
     ld1        {v16.s}[0], [x0], x1
     ld1        {v17.s}[0], [x2], x3
diff --git a/common/aarch64/pixel.h b/common/aarch64/pixel.h
index 7d519644..6ef43af4 100644
--- a/common/aarch64/pixel.h
+++ b/common/aarch64/pixel.h
@@ -74,4 +74,6 @@ void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, intptr_t,
                                       int sums[2][4] );
 float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
 
+int x264_pixel_asd8_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+
 #endif
diff --git a/common/pixel.c b/common/pixel.c
index 6bdbbca3..f84618d2 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1430,6 +1430,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon;
         pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon;
         pixf->vsad = x264_pixel_vsad_neon;
+        pixf->asd8 = x264_pixel_asd8_neon;
 
         pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_neon;
         pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_neon;
-- 
2.40.0