From 99a1ca1f1a62d51e47d1ac2c92ee9c3bf3b5712b Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Tue, 29 Jul 2014 18:26:11 +0100 Subject: [PATCH] aarch64: implement x264_pixel_vsad_neon 35 times faster than C. --- common/aarch64/pixel-a.S | 26 +++++++++++++++++++++++++- common/aarch64/pixel.h | 2 ++ common/pixel.c | 1 + 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/common/aarch64/pixel-a.S b/common/aarch64/pixel-a.S index 8c7b9279..efa708a7 100644 --- a/common/aarch64/pixel-a.S +++ b/common/aarch64/pixel-a.S @@ -148,7 +148,7 @@ SAD_FUNC 16, 16 \first v17.8h, v2.8b, v0.8b ld1 {v3.8b}, [x3], x5 ld1 {v1.8b}, [x1], x5 - \first v18.8h, v3.8b, v0.8b + \first v18.8h, v3.8b, v0.8b uabal v16.8h, v1.8b, v5.8b ld1 {v2.8b}, [x2], x5 ld1 {v3.8b}, [x3], x5 @@ -248,6 +248,30 @@ SAD_X_FUNC 4, 16, 8 SAD_X_FUNC 4, 16, 16 +function x264_pixel_vsad_neon, export=1 /* Sums the absolute differences between each pair of consecutively loaded 16-byte rows and returns the total in w0. x0 is the row pointer and x1 the byte step added to it after every load. w2 is presumably the row count (TODO confirm against callers). */ + subs w2, w2, #2 /* w2 -= 2, setting the flags tested by b.le below */ + ld1 {v0.16b}, [x0], x1 /* v0 = 16 bytes at x0, then x0 += x1 */ + ld1 {v1.16b}, [x0], x1 /* v1 = next 16 bytes, then x0 += x1 */ + uabdl v6.8h, v0.8b, v1.8b /* v6 = |v0 - v1| for the low 8 bytes, widened to 16 bits */ + uabdl2 v7.8h, v0.16b, v1.16b /* v7 = the same for the high 8 bytes */ + b.le 2f /* remaining count <= 0: only this first pair is summed */ +1: /* loop body: v0 and v1 alternate as previous and current row */ + subs w2, w2, #2 /* these flags feed both b.lt and b.gt below */ + ld1 {v0.16b}, [x0], x1 + uabal v6.8h, v1.8b, v0.8b /* accumulate |v1 - v0| into v6 (low half) */ + uabal2 v7.8h, v1.16b, v0.16b /* and into v7 (high half) */ + ld1 {v1.16b}, [x0], x1 /* issued before the count test, so this row is read even when b.lt exits */ + b.lt 2f /* count went negative: the row just loaded into v1 is not accumulated */ + uabal v6.8h, v0.8b, v1.8b /* accumulate |v0 - v1| (low half) */ + uabal2 v7.8h, v0.16b, v1.16b /* accumulate |v0 - v1| (high half) */ + b.gt 1b /* repeat while the count is still positive */ +2: /* final reduction of the two accumulators */ + add v5.8h, v6.8h, v7.8h /* lane-wise 16-bit add without widening. NOTE(review): relies on per-lane totals fitting in 16 bits, confirm the maximum row count */ + uaddlv s0, v5.8h /* add all 8 lanes into a 32-bit scalar */ + fmov w0, s0 /* move the sum into the integer return register */ + ret +endfunc + .macro SSD_START_4 ld1 {v16.s}[0], [x0], x1 ld1 {v17.s}[0], [x2], x3 diff --git a/common/aarch64/pixel.h b/common/aarch64/pixel.h index d4097edd..c7cc6c98 100644 --- a/common/aarch64/pixel.h +++ b/common/aarch64/pixel.h @@ -48,6 +48,8 @@ DECL_X4( sad, neon ) DECL_X1( satd, neon ) DECL_X1( ssd, neon ) +int x264_pixel_vsad_neon( uint8_t *, intptr_t, int ); + int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t ); int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t ); uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t ); diff --git a/common/pixel.c b/common/pixel.c index 421a67c9..d467151e 100644 --- a/common/pixel.c +++ 
b/common/pixel.c @@ -1429,6 +1429,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon; pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon; + pixf->vsad = x264_pixel_vsad_neon; pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_neon; pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_neon; -- 2.50.1