From 98e9543b4c39360326e6d5bf266c0c634cb9ee2e Mon Sep 17 00:00:00 2001
From: =?utf8?q?Martin=20Storsj=C3=B6?=
Date: Mon, 29 May 2017 12:13:03 +0300
Subject: [PATCH] aarch64: Update the var2 functions to the new signature

The existing functions could easily be used by just calling them twice -
this would give the following cycle numbers from checkasm:

var2_8x8_c: 4110
var2_8x8_neon: 1505
var2_8x16_c: 8019
var2_8x16_neon: 2545

However, by merging both passes into the same function, we get the
following speedup:

var2_8x8_neon: 1205
var2_8x16_neon: 2327
---
 common/aarch64/pixel-a.S | 72 ++++++++++++++++++++++------------------
 common/aarch64/pixel.h   |  4 +--
 common/pixel.c           |  4 +--
 3 files changed, 44 insertions(+), 36 deletions(-)

diff --git a/common/aarch64/pixel-a.S b/common/aarch64/pixel-a.S
index 48209b21..047d3db1 100644
--- a/common/aarch64/pixel-a.S
+++ b/common/aarch64/pixel-a.S
@@ -569,57 +569,65 @@ endfunc

 .macro pixel_var2_8 h
 function x264_pixel_var2_8x\h\()_neon, export=1
-    ld1 {v16.8b}, [x0], x1
-    ld1 {v18.8b}, [x2], x3
-    ld1 {v17.8b}, [x0], x1
-    ld1 {v19.8b}, [x2], x3
-    mov x5, \h - 4
-    usubl v6.8h, v16.8b, v18.8b
-    usubl v7.8h, v17.8b, v19.8b
-    ld1 {v16.8b}, [x0], x1
-    ld1 {v18.8b}, [x2], x3
-    smull v2.4s, v6.4h, v6.4h
-    smull2 v3.4s, v6.8h, v6.8h
-    add v0.8h, v6.8h, v7.8h
-    smlal v2.4s, v7.4h, v7.4h
-    smlal2 v3.4s, v7.8h, v7.8h
+    mov x3, #16
+    ld1 {v16.8b}, [x0], #8
+    ld1 {v18.8b}, [x1], x3
+    ld1 {v17.8b}, [x0], #8
+    ld1 {v19.8b}, [x1], x3
+    mov x5, \h - 2
+    usubl v0.8h, v16.8b, v18.8b
+    usubl v1.8h, v17.8b, v19.8b
+    ld1 {v16.8b}, [x0], #8
+    ld1 {v18.8b}, [x1], x3
+    smull v2.4s, v0.4h, v0.4h
+    smull2 v3.4s, v0.8h, v0.8h
+    smull v4.4s, v1.4h, v1.4h
+    smull2 v5.4s, v1.8h, v1.8h

     usubl v6.8h, v16.8b, v18.8b

-1: subs x5, x5, #2
-    ld1 {v17.8b}, [x0], x1
-    ld1 {v19.8b}, [x2], x3
+1: subs x5, x5, #1
+    ld1 {v17.8b}, [x0], #8
+    ld1 {v19.8b}, [x1], x3
     smlal v2.4s, v6.4h, v6.4h
     smlal2 v3.4s, v6.8h, v6.8h
     usubl v7.8h, v17.8b, v19.8b
     add v0.8h, v0.8h, v6.8h
-    ld1 {v16.8b}, [x0], x1
-    ld1 {v18.8b}, [x2], x3
-    smlal v2.4s, v7.4h, v7.4h
-    smlal2 v3.4s, v7.8h, v7.8h
+    ld1 {v16.8b}, [x0], #8
+    ld1 {v18.8b}, [x1], x3
+    smlal v4.4s, v7.4h, v7.4h
+    smlal2 v5.4s, v7.8h, v7.8h
     usubl v6.8h, v16.8b, v18.8b
-    add v0.8h, v0.8h, v7.8h
+    add v1.8h, v1.8h, v7.8h
     b.gt 1b

-    ld1 {v17.8b}, [x0], x1
-    ld1 {v19.8b}, [x2], x3
+    ld1 {v17.8b}, [x0], #8
+    ld1 {v19.8b}, [x1], x3
     smlal v2.4s, v6.4h, v6.4h
     smlal2 v3.4s, v6.8h, v6.8h
     usubl v7.8h, v17.8b, v19.8b
     add v0.8h, v0.8h, v6.8h
-    smlal v2.4s, v7.4h, v7.4h
-    add v0.8h, v0.8h, v7.8h
-    smlal2 v3.4s, v7.8h, v7.8h
+    smlal v4.4s, v7.4h, v7.4h
+    add v1.8h, v1.8h, v7.8h
+    smlal2 v5.4s, v7.8h, v7.8h

     saddlv s0, v0.8h
+    saddlv s1, v1.8h
     add v2.4s, v2.4s, v3.4s
+    add v4.4s, v4.4s, v5.4s
     mov w0, v0.s[0]
-    addv s1, v2.4s
-    sxtw x0, w0
     mov w1, v1.s[0]
-    mul x0, x0, x0
-    str w1, [x4]
-    sub x0, x1, x0, lsr # 6 + (\h >> 4)
+    addv s2, v2.4s
+    addv s4, v4.4s
+    mul w0, w0, w0
+    mul w1, w1, w1
+    mov w3, v2.s[0]
+    mov w4, v4.s[0]
+    sub w0, w3, w0, lsr # 6 + (\h >> 4)
+    sub w1, w4, w1, lsr # 6 + (\h >> 4)
+    str w3, [x2]
+    add w0, w0, w1
+    str w4, [x2, #4]

     ret
 endfunc
diff --git a/common/aarch64/pixel.h b/common/aarch64/pixel.h
index 8a7b83e9..5206a0c7 100644
--- a/common/aarch64/pixel.h
+++ b/common/aarch64/pixel.h
@@ -61,8 +61,8 @@ uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr
 uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t );
 uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
 uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
-int x264_pixel_var2_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
-int x264_pixel_var2_8x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x8_neon ( uint8_t *, uint8_t *, int * );
+int x264_pixel_var2_8x16_neon( uint8_t *, uint8_t *, int * );

 uint64_t x264_pixel_hadamard_ac_8x8_neon ( uint8_t *, intptr_t );
 uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
diff --git a/common/pixel.c b/common/pixel.c
index aeadd7cc..00c14125 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1452,8 +1452,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon;
         pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_neon;
         pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon;
-        //pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon;
-        //pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon;
+        pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon;
+        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon;
         pixf->vsad = x264_pixel_vsad_neon;
         pixf->asd8 = x264_pixel_asd8_neon;

--
2.40.0