From 98e9543b4c39360326e6d5bf266c0c634cb9ee2e Mon Sep 17 00:00:00 2001
From: =?utf8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Mon, 29 May 2017 12:13:03 +0300
Subject: [PATCH] aarch64: Update the var2 functions to the new signature

The existing functions could easily be used by just calling them
twice - this would give the following cycle numbers from checkasm:

var2_8x8_c:      4110
var2_8x8_neon:   1505
var2_8x16_c:     8019
var2_8x16_neon:  2545

However, by merging both passes into the same function, we get the
following lower cycle counts:
var2_8x8_neon:   1205
var2_8x16_neon:  2327
---
 common/aarch64/pixel-a.S | 72 ++++++++++++++++++++++------------------
 common/aarch64/pixel.h   |  4 +--
 common/pixel.c           |  4 +--
 3 files changed, 44 insertions(+), 36 deletions(-)

diff --git a/common/aarch64/pixel-a.S b/common/aarch64/pixel-a.S
index 48209b21..047d3db1 100644
--- a/common/aarch64/pixel-a.S
+++ b/common/aarch64/pixel-a.S
@@ -569,57 +569,65 @@ endfunc
 
 .macro pixel_var2_8 h
 function x264_pixel_var2_8x\h\()_neon, export=1
-    ld1            {v16.8b}, [x0], x1
-    ld1            {v18.8b}, [x2], x3
-    ld1            {v17.8b}, [x0], x1
-    ld1            {v19.8b}, [x2], x3
-    mov             x5,  \h - 4
-    usubl           v6.8h,  v16.8b, v18.8b
-    usubl           v7.8h,  v17.8b, v19.8b
-    ld1            {v16.8b}, [x0], x1
-    ld1            {v18.8b}, [x2], x3
-    smull           v2.4s,  v6.4h,  v6.4h
-    smull2          v3.4s,  v6.8h,  v6.8h
-    add             v0.8h,  v6.8h,  v7.8h
-    smlal           v2.4s,  v7.4h,  v7.4h
-    smlal2          v3.4s,  v7.8h,  v7.8h
+    mov             x3,  #16
+    ld1            {v16.8b}, [x0], #8
+    ld1            {v18.8b}, [x1], x3
+    ld1            {v17.8b}, [x0], #8
+    ld1            {v19.8b}, [x1], x3
+    mov             x5,  \h - 2
+    usubl           v0.8h,  v16.8b, v18.8b
+    usubl           v1.8h,  v17.8b, v19.8b
+    ld1            {v16.8b}, [x0], #8
+    ld1            {v18.8b}, [x1], x3
+    smull           v2.4s,  v0.4h,  v0.4h
+    smull2          v3.4s,  v0.8h,  v0.8h
+    smull           v4.4s,  v1.4h,  v1.4h
+    smull2          v5.4s,  v1.8h,  v1.8h
 
     usubl           v6.8h,  v16.8b, v18.8b
 
-1:  subs            x5,  x5,  #2
-    ld1            {v17.8b}, [x0], x1
-    ld1            {v19.8b}, [x2], x3
+1:  subs            x5,  x5,  #1
+    ld1            {v17.8b}, [x0], #8
+    ld1            {v19.8b}, [x1], x3
     smlal           v2.4s,  v6.4h,  v6.4h
     smlal2          v3.4s,  v6.8h,  v6.8h
     usubl           v7.8h,  v17.8b, v19.8b
     add             v0.8h,  v0.8h,  v6.8h
-    ld1            {v16.8b}, [x0], x1
-    ld1            {v18.8b}, [x2], x3
-    smlal           v2.4s,  v7.4h,  v7.4h
-    smlal2          v3.4s,  v7.8h,  v7.8h
+    ld1            {v16.8b}, [x0], #8
+    ld1            {v18.8b}, [x1], x3
+    smlal           v4.4s,  v7.4h,  v7.4h
+    smlal2          v5.4s,  v7.8h,  v7.8h
     usubl           v6.8h,  v16.8b, v18.8b
-    add             v0.8h,  v0.8h,  v7.8h
+    add             v1.8h,  v1.8h,  v7.8h
     b.gt            1b
 
-    ld1            {v17.8b}, [x0], x1
-    ld1            {v19.8b}, [x2], x3
+    ld1            {v17.8b}, [x0], #8
+    ld1            {v19.8b}, [x1], x3
     smlal           v2.4s,  v6.4h,  v6.4h
     smlal2          v3.4s,  v6.8h,  v6.8h
     usubl           v7.8h,  v17.8b, v19.8b
     add             v0.8h,  v0.8h,  v6.8h
-    smlal           v2.4s,  v7.4h,  v7.4h
-    add             v0.8h,  v0.8h,  v7.8h
-    smlal2          v3.4s,  v7.8h,  v7.8h
+    smlal           v4.4s,  v7.4h,  v7.4h
+    add             v1.8h,  v1.8h,  v7.8h
+    smlal2          v5.4s,  v7.8h,  v7.8h
 
     saddlv          s0,  v0.8h
+    saddlv          s1,  v1.8h
     add             v2.4s,  v2.4s,  v3.4s
+    add             v4.4s,  v4.4s,  v5.4s
     mov             w0,  v0.s[0]
-    addv            s1,  v2.4s
-    sxtw            x0,  w0
     mov             w1,  v1.s[0]
-    mul             x0,  x0,  x0
-    str             w1,  [x4]
-    sub             x0,  x1,  x0,  lsr # 6 + (\h >> 4)
+    addv            s2,  v2.4s
+    addv            s4,  v4.4s
+    mul             w0,  w0,  w0
+    mul             w1,  w1,  w1
+    mov             w3,  v2.s[0]
+    mov             w4,  v4.s[0]
+    sub             w0,  w3,  w0,  lsr # 6 + (\h >> 4)
+    sub             w1,  w4,  w1,  lsr # 6 + (\h >> 4)
+    str             w3,  [x2]
+    add             w0,  w0,  w1
+    str             w4,  [x2, #4]
 
     ret
 endfunc
diff --git a/common/aarch64/pixel.h b/common/aarch64/pixel.h
index 8a7b83e9..5206a0c7 100644
--- a/common/aarch64/pixel.h
+++ b/common/aarch64/pixel.h
@@ -61,8 +61,8 @@ uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr
 uint64_t x264_pixel_var_8x8_neon  ( uint8_t *, intptr_t );
 uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
 uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
-int x264_pixel_var2_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
-int x264_pixel_var2_8x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x8_neon ( uint8_t *, uint8_t *, int * );
+int x264_pixel_var2_8x16_neon( uint8_t *, uint8_t *, int * );
 
 uint64_t x264_pixel_hadamard_ac_8x8_neon  ( uint8_t *, intptr_t );
 uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
diff --git a/common/pixel.c b/common/pixel.c
index aeadd7cc..00c14125 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1452,8 +1452,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->var[PIXEL_8x8]    = x264_pixel_var_8x8_neon;
         pixf->var[PIXEL_8x16]   = x264_pixel_var_8x16_neon;
         pixf->var[PIXEL_16x16]  = x264_pixel_var_16x16_neon;
-      //pixf->var2[PIXEL_8x8]   = x264_pixel_var2_8x8_neon;
-      //pixf->var2[PIXEL_8x16]  = x264_pixel_var2_8x16_neon;
+        pixf->var2[PIXEL_8x8]   = x264_pixel_var2_8x8_neon;
+        pixf->var2[PIXEL_8x16]  = x264_pixel_var2_8x16_neon;
         pixf->vsad = x264_pixel_vsad_neon;
         pixf->asd8 = x264_pixel_asd8_neon;
 
-- 
2.40.0