aarch64: Update the var2 functions to the new signature
author    Martin Storsjö <martin@martin.st>
          Mon, 29 May 2017 09:13:03 +0000 (12:13 +0300)
committer Anton Mitrofanov <BugMaster@narod.ru>
          Wed, 14 Jun 2017 20:24:38 +0000 (23:24 +0300)
The existing functions could easily have been reused for the new signature
by simply calling them twice; that would give the following cycle numbers
from checkasm:

var2_8x8_c:      4110
var2_8x8_neon:   1505
var2_8x16_c:     8019
var2_8x16_neon:  2545

However, merging both passes into a single function gives a further
speedup:
var2_8x8_neon:   1205
var2_8x16_neon:  2327
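
For reference, the merged function computes, for each 8-pixel row, the sum
and the sum of squares of fenc - fdec for both chroma planes at once. Below
is a minimal C model of what the NEON code in this patch does: the signature
matches the new prototype in pixel.h, but the row layout (16-byte fenc
stride with the second plane 8 bytes in, 32-byte fdec stride with the second
plane 16 bytes in) is inferred from the pointer increments in the assembly,
and the function and variable names are illustrative, not upstream code.

    #include <stdint.h>

    /* Sketch of the merged var2: both planes in one pass, per-plane SSDs
       written through ssd[], combined "variance" value returned. */
    static int var2_8xh_model( uint8_t *fenc, uint8_t *fdec,
                               int ssd[2], int h )
    {
        int shift = 6 + (h >> 4);      /* log2 of the 8*h samples per plane */
        int sum[2] = {0, 0}, sqr[2] = {0, 0};
        for( int y = 0; y < h; y++ )
        {
            for( int p = 0; p < 2; p++ )   /* both chroma planes per row */
                for( int x = 0; x < 8; x++ )
                {
                    int d = fenc[8*p + x] - fdec[16*p + x];
                    sum[p] += d;
                    sqr[p] += d * d;
                }
            fenc += 16;                    /* inferred fenc row stride */
            fdec += 32;                    /* inferred fdec row stride */
        }
        ssd[0] = sqr[0];
        ssd[1] = sqr[1];
        return (sqr[0] - ((sum[0] * sum[0]) >> shift)) +
               (sqr[1] - ((sum[1] * sum[1]) >> shift));
    }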

common/aarch64/pixel-a.S
common/aarch64/pixel.h
common/pixel.c

diff --git a/common/aarch64/pixel-a.S b/common/aarch64/pixel-a.S
index 48209b214d344d62e030314857c29466d7787735..047d3db1d42abf2e5f9309c85a0a0d1b591b6640 100644
@@ -569,57 +569,65 @@ endfunc
 
 .macro pixel_var2_8 h
 function x264_pixel_var2_8x\h\()_neon, export=1
-    ld1            {v16.8b}, [x0], x1
-    ld1            {v18.8b}, [x2], x3
-    ld1            {v17.8b}, [x0], x1
-    ld1            {v19.8b}, [x2], x3
-    mov             x5,  \h - 4
-    usubl           v6.8h,  v16.8b, v18.8b
-    usubl           v7.8h,  v17.8b, v19.8b
-    ld1            {v16.8b}, [x0], x1
-    ld1            {v18.8b}, [x2], x3
-    smull           v2.4s,  v6.4h,  v6.4h
-    smull2          v3.4s,  v6.8h,  v6.8h
-    add             v0.8h,  v6.8h,  v7.8h
-    smlal           v2.4s,  v7.4h,  v7.4h
-    smlal2          v3.4s,  v7.8h,  v7.8h
+    mov             x3,  #16
+    ld1            {v16.8b}, [x0], #8
+    ld1            {v18.8b}, [x1], x3
+    ld1            {v17.8b}, [x0], #8
+    ld1            {v19.8b}, [x1], x3
+    mov             x5,  \h - 2
+    usubl           v0.8h,  v16.8b, v18.8b
+    usubl           v1.8h,  v17.8b, v19.8b
+    ld1            {v16.8b}, [x0], #8
+    ld1            {v18.8b}, [x1], x3
+    smull           v2.4s,  v0.4h,  v0.4h
+    smull2          v3.4s,  v0.8h,  v0.8h
+    smull           v4.4s,  v1.4h,  v1.4h
+    smull2          v5.4s,  v1.8h,  v1.8h
 
     usubl           v6.8h,  v16.8b, v18.8b
 
-1:  subs            x5,  x5,  #2
-    ld1            {v17.8b}, [x0], x1
-    ld1            {v19.8b}, [x2], x3
+1:  subs            x5,  x5,  #1
+    ld1            {v17.8b}, [x0], #8
+    ld1            {v19.8b}, [x1], x3
     smlal           v2.4s,  v6.4h,  v6.4h
     smlal2          v3.4s,  v6.8h,  v6.8h
     usubl           v7.8h,  v17.8b, v19.8b
     add             v0.8h,  v0.8h,  v6.8h
-    ld1            {v16.8b}, [x0], x1
-    ld1            {v18.8b}, [x2], x3
-    smlal           v2.4s,  v7.4h,  v7.4h
-    smlal2          v3.4s,  v7.8h,  v7.8h
+    ld1            {v16.8b}, [x0], #8
+    ld1            {v18.8b}, [x1], x3
+    smlal           v4.4s,  v7.4h,  v7.4h
+    smlal2          v5.4s,  v7.8h,  v7.8h
     usubl           v6.8h,  v16.8b, v18.8b
-    add             v0.8h,  v0.8h,  v7.8h
+    add             v1.8h,  v1.8h,  v7.8h
     b.gt            1b
 
-    ld1            {v17.8b}, [x0], x1
-    ld1            {v19.8b}, [x2], x3
+    ld1            {v17.8b}, [x0], #8
+    ld1            {v19.8b}, [x1], x3
     smlal           v2.4s,  v6.4h,  v6.4h
     smlal2          v3.4s,  v6.8h,  v6.8h
     usubl           v7.8h,  v17.8b, v19.8b
     add             v0.8h,  v0.8h,  v6.8h
-    smlal           v2.4s,  v7.4h,  v7.4h
-    add             v0.8h,  v0.8h,  v7.8h
-    smlal2          v3.4s,  v7.8h,  v7.8h
+    smlal           v4.4s,  v7.4h,  v7.4h
+    add             v1.8h,  v1.8h,  v7.8h
+    smlal2          v5.4s,  v7.8h,  v7.8h
 
     saddlv          s0,  v0.8h
+    saddlv          s1,  v1.8h
     add             v2.4s,  v2.4s,  v3.4s
+    add             v4.4s,  v4.4s,  v5.4s
     mov             w0,  v0.s[0]
-    addv            s1,  v2.4s
-    sxtw            x0,  w0
     mov             w1,  v1.s[0]
-    mul             x0,  x0,  x0
-    str             w1,  [x4]
-    sub             x0,  x1,  x0,  lsr # 6 + (\h >> 4)
+    addv            s2,  v2.4s
+    addv            s4,  v4.4s
+    mul             w0,  w0,  w0
+    mul             w1,  w1,  w1
+    mov             w3,  v2.s[0]
+    mov             w4,  v4.s[0]
+    sub             w0,  w3,  w0,  lsr # 6 + (\h >> 4)
+    sub             w1,  w4,  w1,  lsr # 6 + (\h >> 4)
+    str             w3,  [x2]
+    add             w0,  w0,  w1
+    str             w4,  [x2, #4]
 
     ret
 endfunc
diff --git a/common/aarch64/pixel.h b/common/aarch64/pixel.h
index 8a7b83e9502a7ea565c1be993b5edea44ec03192..5206a0c7f02bb29289679570daab91c41cd83060 100644
@@ -61,8 +61,8 @@ uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr
 uint64_t x264_pixel_var_8x8_neon  ( uint8_t *, intptr_t );
 uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
 uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
-int x264_pixel_var2_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
-int x264_pixel_var2_8x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x8_neon ( uint8_t *, uint8_t *, int * );
+int x264_pixel_var2_8x16_neon( uint8_t *, uint8_t *, int * );
 
 uint64_t x264_pixel_hadamard_ac_8x8_neon  ( uint8_t *, intptr_t );
 uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
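
The stride arguments are gone from the prototypes; both strides are now
implicit in the fixed buffer layout. A sketch of a call site under the new
signature (buffer names and contents illustrative, assuming the layout
inferred above):

    uint8_t fenc[16*8];  /* 8 rows; plane 0 at +0, plane 1 at +8 per row  */
    uint8_t fdec[32*8];  /* 8 rows; plane 0 at +0, plane 1 at +16 per row */
    int ssd[2];          /* per-plane SSDs, written by the callee */
    int var = x264_pixel_var2_8x8_neon( fenc, fdec, ssd );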
diff --git a/common/pixel.c b/common/pixel.c
index aeadd7cc5785b3185bf27206334cbbadaec79654..00c14125bc30c70d43573773cea51c8f30f4d34c 100644
@@ -1452,8 +1452,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->var[PIXEL_8x8]    = x264_pixel_var_8x8_neon;
         pixf->var[PIXEL_8x16]   = x264_pixel_var_8x16_neon;
         pixf->var[PIXEL_16x16]  = x264_pixel_var_16x16_neon;
-      //pixf->var2[PIXEL_8x8]   = x264_pixel_var2_8x8_neon;
-      //pixf->var2[PIXEL_8x16]  = x264_pixel_var2_8x16_neon;
+        pixf->var2[PIXEL_8x8]   = x264_pixel_var2_8x8_neon;
+        pixf->var2[PIXEL_8x16]  = x264_pixel_var2_8x16_neon;
         pixf->vsad = x264_pixel_vsad_neon;
         pixf->asd8 = x264_pixel_asd8_neon;
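
With the table entries re-enabled, the NEON versions are reached through the
usual CPU dispatch rather than called directly. A sketch, with cpu and the
input buffers illustrative:

    x264_pixel_function_t pixf;
    x264_pixel_init( cpu, &pixf );   /* cpu includes the NEON flag */
    int ssd[2];
    int var = pixf.var2[PIXEL_8x8]( fenc, fdec, ssd );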