From 35b91f2410dcf4fc5191dd85ccda7a42eb01eae8 Mon Sep 17 00:00:00 2001
From: Janne Grunau
Date: Wed, 30 Jul 2014 15:48:25 +0100
Subject: [PATCH] aarch64: implement x264_pixel_ssd_nv12_core_neon

13 times faster than C.
---
/* NOTE(review): the line structure of this patch was rebuilt from a copy in
   which every newline had been collapsed. Indentation, column alignment and
   the position of blank context lines are best guesses, and the inline
   comments on added lines change the post-image, so the blob hash on the
   first "index" line below no longer describes it. Regenerate from the
   original commit before applying. */
 common/aarch64/pixel-a.S | 71 ++++++++++++++++++++++++++++++++++++++++
 common/aarch64/pixel.h   |  3 ++
 common/pixel.c           |  1 +
 3 files changed, 75 insertions(+)

diff --git a/common/aarch64/pixel-a.S b/common/aarch64/pixel-a.S
index efa708a7..d2c3de65 100644
--- a/common/aarch64/pixel-a.S
+++ b/common/aarch64/pixel-a.S
@@ -373,6 +373,77 @@ SSD_FUNC 8, 16
 SSD_FUNC 16, 8
 SSD_FUNC 16, 16
 
+
+function x264_pixel_ssd_nv12_core_neon, export=1  /* sums of squared byte differences between two buffers of interleaved byte pairs, kept separately for even and odd bytes (presumably NV12 U and V - confirm). Register use: x0,x2 = buffers; x1,x3 = their row strides; w4 = pairs per row; w5 = rows; x6,x7 = where the two 64-bit totals are stored */
+    sxtw        x8,  w4                     /* x8 = width, sign-extended to 64 bits */
+    add         x8,  x8,  #8
+    and         x8,  x8,  #~15              /* x8 = (width + 8) & ~15; equals the pairs consumed per row when width is a multiple of 8 (assumed - TODO confirm with callers) */
+    movi        v6.2d,  #0                  /* 64-bit running total, even bytes */
+    movi        v7.2d,  #0                  /* 64-bit running total, odd bytes */
+    sub         x1,  x1,  x8, lsl #1        /* strides minus the 2 * x8 bytes a row consumes: the loads below post-increment x0 and x2 */
+    sub         x3,  x3,  x8, lsl #1
+1:                                          /* start of one row */
+    subs        w8,  w4,  #16               /* w8 = width - 16; these flags are tested by b.lt and b.eq below */
+    ld2         {v0.8b,v1.8b},   [x0], #16  /* de-interleave 8 pairs: even bytes into v0, odd bytes into v1 */
+    ld2         {v2.8b,v3.8b},   [x2], #16
+    ld2         {v24.8b,v25.8b}, [x0], #16  /* pairs 8..15 are loaded before the width is checked */
+    ld2         {v26.8b,v27.8b}, [x2], #16
+
+    usubl       v16.8h, v0.8b,  v2.8b       /* 16-bit differences, even bytes; wraps modulo 2^16, so the signed multiplies below see the signed difference */
+    usubl       v17.8h, v1.8b,  v3.8b       /* 16-bit differences, odd bytes */
+    smull       v20.4s, v16.4h, v16.4h      /* smull (not smlal) restarts the per-row 32-bit sums: v20 = even, v21 = odd */
+    smull       v21.4s, v17.4h, v17.4h
+    usubl       v18.8h, v24.8b, v26.8b      /* differences for pairs 8..15; only accumulated if the row is that wide */
+    usubl       v19.8h, v25.8b, v27.8b
+    smlal2      v20.4s, v16.8h, v16.8h      /* add squares of the upper four differences */
+    smlal2      v21.4s, v17.8h, v17.8h
+
+    b.lt        4f                          /* width < 16: only the first 8 pairs are counted */
+    b.eq        3f                          /* width == 16: add pairs 8..15 and finish the row */
+2:                                          /* width > 16: each pass loads 16 more pairs */
+    smlal       v20.4s, v18.4h, v18.4h      /* accumulate the pending v18 and v19 differences ... */
+    smlal       v21.4s, v19.4h, v19.4h
+    ld2         {v0.8b,v1.8b},   [x0], #16  /* ... interleaved with the loads for the next 8 pairs */
+    ld2         {v2.8b,v3.8b},   [x2], #16
+    smlal2      v20.4s, v18.8h, v18.8h
+    smlal2      v21.4s, v19.8h, v19.8h
+
+    subs        w8,  w8,  #16               /* account for the 16 pairs of this pass */
+    usubl       v16.8h, v0.8b,  v2.8b
+    usubl       v17.8h, v1.8b,  v3.8b
+    smlal       v20.4s, v16.4h, v16.4h
+    smlal       v21.4s, v17.4h, v17.4h
+    ld2         {v24.8b,v25.8b}, [x0], #16
+    ld2         {v26.8b,v27.8b}, [x2], #16
+    smlal2      v20.4s, v16.8h, v16.8h
+    smlal2      v21.4s, v17.8h, v17.8h
+    b.lt        4f                          /* row ended with the first 8 pairs of this pass; the 8 just loaded are not counted */
+
+    usubl       v18.8h, v24.8b, v26.8b
+    usubl       v19.8h, v25.8b, v27.8b
+    b.gt        2b                          /* flags are still those of the subs above; on equality fall through to 3 */
+3:
+    smlal       v20.4s, v18.4h, v18.4h      /* add the final pending 8 pairs */
+    smlal       v21.4s, v19.4h, v19.4h
+    smlal2      v20.4s, v18.8h, v18.8h
+    smlal2      v21.4s, v19.8h, v19.8h
+4:                                          /* end of row */
+    subs        w5,  w5,  #1                /* one row done; these flags are consumed by b.gt 1b */
+    uaddw       v6.2d,  v6.2d,  v20.2s      /* widen the 32-bit row sums into the 64-bit totals */
+    uaddw       v7.2d,  v7.2d,  v21.2s
+    add         x0,  x0,  x1                /* step to the next row using the adjusted strides */
+    add         x2,  x2,  x3
+    uaddw2      v6.2d,  v6.2d,  v20.4s
+    uaddw2      v7.2d,  v7.2d,  v21.4s
+    b.gt        1b
+
+    addp        v6.2d,  v6.2d,  v7.2d       /* v6.d[0] = even-byte total, v6.d[1] = odd-byte total */
+    st1         {v6.d}[0], [x6]             /* even-byte total goes to the address in x6 */
+    st1         {v6.d}[1], [x7]             /* odd-byte total goes to the address in x7 */
+
+    ret
+endfunc
+
 .macro pixel_var_8 h
 function x264_pixel_var_8x\h\()_neon, export=1
     ld1 {v16.8b}, [x0], x1
diff --git a/common/aarch64/pixel.h b/common/aarch64/pixel.h
index c7cc6c98..c7c386ae 100644
--- a/common/aarch64/pixel.h
+++ b/common/aarch64/pixel.h
@@ -48,6 +48,9 @@ DECL_X4( sad, neon )
 DECL_X1( satd, neon )
 DECL_X1( ssd, neon )
 
+
+void x264_pixel_ssd_nv12_core_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, uint64_t *, uint64_t * );
+
 int x264_pixel_vsad_neon( uint8_t *, intptr_t, int );
 
 int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t );
diff --git a/common/pixel.c b/common/pixel.c
index d467151e..bb1894a0 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1442,6 +1442,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_neon;
         pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_neon;
 
+        pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_neon;
         pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon;
         pixf->ssim_end4 = x264_pixel_ssim_end4_neon;
     }
--
2.40.0