aarch64: Update the var2 functions to the new signature
author    Martin Storsjö <martin@martin.st>
          Mon, 29 May 2017 09:13:03 +0000 (12:13 +0300)
committer Anton Mitrofanov <BugMaster@narod.ru>
          Wed, 14 Jun 2017 20:24:38 +0000 (23:24 +0300)
The existing functions could easily have been reused for the new signature
by simply calling them twice; that would give the following cycle numbers
from checkasm:

var2_8x8_c:      4110
var2_8x8_neon:   1505
var2_8x16_c:     8019
var2_8x16_neon:  2545

However, merging both passes into a single function gives a further
speedup:
var2_8x8_neon:   1205
var2_8x16_neon:  2327
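
For reference, the merged function computes, for each 8-pixel row, the sum
and the sum of squares of fenc - fdec for both chroma planes at once. Below
is a minimal C model of what the NEON code in this patch does: the signature
matches the new prototype in pixel.h, but the row layout (16-byte fenc
stride with the second plane 8 bytes in, 32-byte fdec stride with the second
plane 16 bytes in) is inferred from the pointer increments in the assembly,
and the function and variable names are illustrative, not upstream code.

    #include <stdint.h>

    /* Sketch of the merged var2: both planes in one pass, per-plane SSDs
       written through ssd[], combined "variance" value returned. */
    static int var2_8xh_model( uint8_t *fenc, uint8_t *fdec,
                               int ssd[2], int h )
    {
        int shift = 6 + (h >> 4);      /* log2 of the 8*h samples per plane */
        int sum[2] = {0, 0}, sqr[2] = {0, 0};
        for( int y = 0; y < h; y++ )
        {
            for( int p = 0; p < 2; p++ )   /* both chroma planes per row */
                for( int x = 0; x < 8; x++ )
                {
                    int d = fenc[8*p + x] - fdec[16*p + x];
                    sum[p] += d;
                    sqr[p] += d * d;
                }
            fenc += 16;                    /* inferred fenc row stride */
            fdec += 32;                    /* inferred fdec row stride */
        }
        ssd[0] = sqr[0];
        ssd[1] = sqr[1];
        return (sqr[0] - ((sum[0] * sum[0]) >> shift)) +
               (sqr[1] - ((sum[1] * sum[1]) >> shift));
    }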

common/aarch64/pixel-a.S
common/aarch64/pixel.h
common/pixel.c

diff --git a/common/aarch64/pixel-a.S b/common/aarch64/pixel-a.S
index 48209b214d344d62e030314857c29466d7787735..047d3db1d42abf2e5f9309c85a0a0d1b591b6640 100644
@@ -569,57 +569,65 @@ endfunc
 
 .macro pixel_var2_8 h
 function x264_pixel_var2_8x\h\()_neon, export=1
-    ld1            {v16.8b}, [x0], x1
-    ld1            {v18.8b}, [x2], x3
-    ld1            {v17.8b}, [x0], x1
-    ld1            {v19.8b}, [x2], x3
-    mov             x5,  \h - 4
-    usubl           v6.8h,  v16.8b, v18.8b
-    usubl           v7.8h,  v17.8b, v19.8b
-    ld1            {v16.8b}, [x0], x1
-    ld1            {v18.8b}, [x2], x3
-    smull           v2.4s,  v6.4h,  v6.4h
-    smull2          v3.4s,  v6.8h,  v6.8h
-    add             v0.8h,  v6.8h,  v7.8h
-    smlal           v2.4s,  v7.4h,  v7.4h
-    smlal2          v3.4s,  v7.8h,  v7.8h
+    mov             x3,  #16
+    ld1            {v16.8b}, [x0], #8
+    ld1            {v18.8b}, [x1], x3
+    ld1            {v17.8b}, [x0], #8
+    ld1            {v19.8b}, [x1], x3
+    mov             x5,  \h - 2
+    usubl           v0.8h,  v16.8b, v18.8b
+    usubl           v1.8h,  v17.8b, v19.8b
+    ld1            {v16.8b}, [x0], #8
+    ld1            {v18.8b}, [x1], x3
+    smull           v2.4s,  v0.4h,  v0.4h
+    smull2          v3.4s,  v0.8h,  v0.8h
+    smull           v4.4s,  v1.4h,  v1.4h
+    smull2          v5.4s,  v1.8h,  v1.8h
 
     usubl           v6.8h,  v16.8b, v18.8b
 
-1:  subs            x5,  x5,  #2
-    ld1            {v17.8b}, [x0], x1
-    ld1            {v19.8b}, [x2], x3
+1:  subs            x5,  x5,  #1
+    ld1            {v17.8b}, [x0], #8
+    ld1            {v19.8b}, [x1], x3
     smlal           v2.4s,  v6.4h,  v6.4h
     smlal2          v3.4s,  v6.8h,  v6.8h
     usubl           v7.8h,  v17.8b, v19.8b
     add             v0.8h,  v0.8h,  v6.8h
-    ld1            {v16.8b}, [x0], x1
-    ld1            {v18.8b}, [x2], x3
-    smlal           v2.4s,  v7.4h,  v7.4h
-    smlal2          v3.4s,  v7.8h,  v7.8h
+    ld1            {v16.8b}, [x0], #8
+    ld1            {v18.8b}, [x1], x3
+    smlal           v4.4s,  v7.4h,  v7.4h
+    smlal2          v5.4s,  v7.8h,  v7.8h
     usubl           v6.8h,  v16.8b, v18.8b
-    add             v0.8h,  v0.8h,  v7.8h
+    add             v1.8h,  v1.8h,  v7.8h
     b.gt            1b
 
-    ld1            {v17.8b}, [x0], x1
-    ld1            {v19.8b}, [x2], x3
+    ld1            {v17.8b}, [x0], #8
+    ld1            {v19.8b}, [x1], x3
     smlal           v2.4s,  v6.4h,  v6.4h
     smlal2          v3.4s,  v6.8h,  v6.8h
     usubl           v7.8h,  v17.8b, v19.8b
     add             v0.8h,  v0.8h,  v6.8h
-    smlal           v2.4s,  v7.4h,  v7.4h
-    add             v0.8h,  v0.8h,  v7.8h
-    smlal2          v3.4s,  v7.8h,  v7.8h
+    smlal           v4.4s,  v7.4h,  v7.4h
+    add             v1.8h,  v1.8h,  v7.8h
+    smlal2          v5.4s,  v7.8h,  v7.8h
 
     saddlv          s0,  v0.8h
+    saddlv          s1,  v1.8h
     add             v2.4s,  v2.4s,  v3.4s
+    add             v4.4s,  v4.4s,  v5.4s
     mov             w0,  v0.s[0]
-    addv            s1,  v2.4s
-    sxtw            x0,  w0
     mov             w1,  v1.s[0]
-    mul             x0,  x0,  x0
-    str             w1,  [x4]
-    sub             x0,  x1,  x0,  lsr # 6 + (\h >> 4)
+    addv            s2,  v2.4s
+    addv            s4,  v4.4s
+    mul             w0,  w0,  w0
+    mul             w1,  w1,  w1
+    mov             w3,  v2.s[0]
+    mov             w4,  v4.s[0]
+    sub             w0,  w3,  w0,  lsr # 6 + (\h >> 4)
+    sub             w1,  w4,  w1,  lsr # 6 + (\h >> 4)
+    str             w3,  [x2]
+    add             w0,  w0,  w1
+    str             w4,  [x2, #4]
 
     ret
 endfunc
diff --git a/common/aarch64/pixel.h b/common/aarch64/pixel.h
index 8a7b83e9502a7ea565c1be993b5edea44ec03192..5206a0c7f02bb29289679570daab91c41cd83060 100644
@@ -61,8 +61,8 @@ uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr
 uint64_t x264_pixel_var_8x8_neon  ( uint8_t *, intptr_t );
 uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
 uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
-int x264_pixel_var2_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
-int x264_pixel_var2_8x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x8_neon ( uint8_t *, uint8_t *, int * );
+int x264_pixel_var2_8x16_neon( uint8_t *, uint8_t *, int * );
 
 uint64_t x264_pixel_hadamard_ac_8x8_neon  ( uint8_t *, intptr_t );
 uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
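
The stride arguments are gone from the prototypes; both strides are now
implicit in the fixed buffer layout. A sketch of a call site under the new
signature (buffer names and contents illustrative, assuming the layout
inferred above):

    uint8_t fenc[16*8];  /* 8 rows; plane 0 at +0, plane 1 at +8 per row  */
    uint8_t fdec[32*8];  /* 8 rows; plane 0 at +0, plane 1 at +16 per row */
    int ssd[2];          /* per-plane SSDs, written by the callee */
    int var = x264_pixel_var2_8x8_neon( fenc, fdec, ssd );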
diff --git a/common/pixel.c b/common/pixel.c
index aeadd7cc5785b3185bf27206334cbbadaec79654..00c14125bc30c70d43573773cea51c8f30f4d34c 100644
@@ -1452,8 +1452,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->var[PIXEL_8x8]    = x264_pixel_var_8x8_neon;
         pixf->var[PIXEL_8x16]   = x264_pixel_var_8x16_neon;
         pixf->var[PIXEL_16x16]  = x264_pixel_var_16x16_neon;
-      //pixf->var2[PIXEL_8x8]   = x264_pixel_var2_8x8_neon;
-      //pixf->var2[PIXEL_8x16]  = x264_pixel_var2_8x16_neon;
+        pixf->var2[PIXEL_8x8]   = x264_pixel_var2_8x8_neon;
+        pixf->var2[PIXEL_8x16]  = x264_pixel_var2_8x16_neon;
         pixf->vsad = x264_pixel_vsad_neon;
         pixf->asd8 = x264_pixel_asd8_neon;
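
With the table entries re-enabled, the NEON versions are reached through the
usual CPU dispatch rather than called directly. A sketch, with cpu and the
input buffers illustrative:

    x264_pixel_function_t pixf;
    x264_pixel_init( cpu, &pixf );   /* cpu includes the NEON flag */
    int ssd[2];
    int var = pixf.var2[PIXEL_8x8]( fenc, fdec, ssd );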