arm/aarch64: use plane_copy wrapper macros

author Janne Grunau <janne-x264@jannau.net>

Fri, 26 Aug 2016 17:26:55 +0000 (20:26 +0300)

committer Henrik Gramner <henrik@gramner.com>

Sat, 17 Sep 2016 13:10:14 +0000 (15:10 +0200)
author Janne Grunau <janne-x264@jannau.net>
Fri, 26 Aug 2016 17:26:55 +0000 (20:26 +0300)
committer Henrik Gramner <henrik@gramner.com>
Sat, 17 Sep 2016 13:10:14 +0000 (15:10 +0200)
diff --git a/common/aarch64/mc-a.S b/common/aarch64/mc-a.S

index 915d8c01b236bafe8f0fecabb2d360d8ca665187..fe0f870447359b72592256e0eaa334cbc2ddddf1 100644 (file)
--- a/common/aarch64/mc-a.S
+++ b/common/aarch64/mc-a.S
@@ -1253,7 +1253,7 @@ load_deinterleave_chroma:
      ret
  endfunc
  
-function x264_plane_copy_neon, export=1
+function x264_plane_copy_core_neon, export=1
      add         x8,  x4,  #15
      and         x4,  x8,  #~15
      sub         x1,  x1,  x4
@@ -1352,7 +1352,7 @@ function x264_plane_copy_deinterleave_rgb_neon, export=1
      ret
  endfunc
  
-function x264_plane_copy_interleave_neon, export=1
+function x264_plane_copy_interleave_core_neon, export=1
      add         w9,  w6,  #15
      and         w9,  w9,  #0xfffffff0
      sub         x1,  x1,  x9,  lsl #1
diff --git a/common/aarch64/mc-c.c b/common/aarch64/mc-c.c

index 717820f0dc07538db77339f6b6ee7c739c14930c..4f93965e10a5dfacd183af79f692f735c3c8b5a4 100644 (file)
--- a/common/aarch64/mc-c.c
+++ b/common/aarch64/mc-c.c
@@ -49,8 +49,8 @@ void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t
  void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
  void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
  
-void x264_plane_copy_neon( pixel *dst, intptr_t i_dst,
-                           pixel *src, intptr_t i_src, int w, int h );
+void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst,
+                                pixel *src, intptr_t i_src, int w, int h );
  void x264_plane_copy_deinterleave_neon(  pixel *dstu, intptr_t i_dstu,
                                           pixel *dstv, intptr_t i_dstv,
                                           pixel *src,  intptr_t i_src, int w, int h );
@@ -58,9 +58,9 @@ void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
                                              pixel *dstb, intptr_t i_dstb,
                                              pixel *dstc, intptr_t i_dstc,
                                              pixel *src,  intptr_t i_src, int pw, int w, int h );
-void x264_plane_copy_interleave_neon( pixel *dst,  intptr_t i_dst,
-                                      pixel *srcu, intptr_t i_srcu,
-                                      pixel *srcv, intptr_t i_srcv, int w, int h );
+void x264_plane_copy_interleave_core_neon( pixel *dst,  intptr_t i_dst,
+                                           pixel *srcu, intptr_t i_srcu,
+                                           pixel *srcv, intptr_t i_srcv, int w, int h );
  
  void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
  void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
@@ -206,6 +206,9 @@ static uint8_t *get_ref_neon( uint8_t *dst,   intptr_t *i_dst_stride,
  void x264_hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
                              uint8_t *src, intptr_t stride, int width,
                              int height, int16_t *buf );
+
+PLANE_COPY(16, neon)
+PLANE_INTERLEAVE(neon)
  #endif // !HIGH_BIT_DEPTH
  
  PROPAGATE_LIST(neon)
diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S

index 76295cd49f0a6180204cd2ece43cb6b524c9b4eb..165c1fa92979f316190cfdc09bf0d729494c9cb0 100644 (file)
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
@@ -1468,7 +1468,7 @@ function x264_load_deinterleave_chroma_fenc_neon
      bx              lr
  endfunc
  
-function x264_plane_copy_neon
+function x264_plane_copy_core_neon
      push            {r4,lr}
      ldr             r4,  [sp, #8]
      ldr             lr,  [sp, #12]
@@ -1577,7 +1577,7 @@ block4:
      pop             {r4-r8, r10, r11, pc}
  endfunc
  
-function x264_plane_copy_interleave_neon
+function x264_plane_copy_interleave_core_neon
      push            {r4-r7, lr}
      ldrd            r6, r7, [sp, #28]
      ldrd            r4, r5, [sp, #20]
@@ -1604,7 +1604,7 @@ blocki:
      pop             {r4-r7, pc}
  endfunc
  
-function x264_plane_copy_swap_neon
+function x264_plane_copy_swap_core_neon
      push            {r4-r5, lr}
      ldrd            r4, r5, [sp, #12]
      add             lr,  r4,  #15
diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c

index d330bc305b414f46df02dacf1e565b4dc2e99302..ae1a68616b83e8215111f52c576c2175ed30179e 100644 (file)
--- a/common/arm/mc-c.c
+++ b/common/arm/mc-c.c
@@ -48,8 +48,8 @@ void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t
  void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
  void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
  
-void x264_plane_copy_neon( pixel *dst, intptr_t i_dst,
-                           pixel *src, intptr_t i_src, int w, int h );
+void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst,
+                                pixel *src, intptr_t i_src, int w, int h );
  void x264_plane_copy_deinterleave_neon(  pixel *dstu, intptr_t i_dstu,
                                           pixel *dstv, intptr_t i_dstv,
                                           pixel *src,  intptr_t i_src, int w, int h );
@@ -57,11 +57,11 @@ void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
                                              pixel *dstb, intptr_t i_dstb,
                                              pixel *dstc, intptr_t i_dstc,
                                              pixel *src,  intptr_t i_src, int pw, int w, int h );
-void x264_plane_copy_interleave_neon( pixel *dst,  intptr_t i_dst,
-                                      pixel *srcu, intptr_t i_srcu,
-                                      pixel *srcv, intptr_t i_srcv, int w, int h );
-void x264_plane_copy_swap_neon( pixel *dst, intptr_t i_dst,
-                                pixel *src, intptr_t i_src, int w, int h );
+void x264_plane_copy_interleave_core_neon( pixel *dst,  intptr_t i_dst,
+                                           pixel *srcu, intptr_t i_srcu,
+                                           pixel *srcv, intptr_t i_srcv, int w, int h );
+void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst,
+                                     pixel *src, intptr_t i_src, int w, int h );
  
  void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
  void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
@@ -232,6 +232,10 @@ static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8
          src  += stride;
      }
  }
+
+PLANE_COPY(16, neon)
+PLANE_COPY_SWAP(16, neon)
+PLANE_INTERLEAVE(neon)
  #endif // !HIGH_BIT_DEPTH
  
  PROPAGATE_LIST(neon)
diff --git a/common/mc.h b/common/mc.h

index cebdb55745a33f25aa0b1e75192a12835242a8fc..5a83ec228b7b2f9a9f5f600adf0baa2f44a6ff27 100644 (file)
--- a/common/mc.h
+++ b/common/mc.h
@@ -100,6 +100,98 @@ static void x264_mbtree_propagate_list_##cpu( x264_t *h, uint16_t *ref_costs, in
      }\
  }
  
+void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
+
+#define PLANE_COPY(align, cpu) /* defines x264_plane_copy_<cpu>, a wrapper around x264_plane_copy_core_<cpu> */\
+static void x264_plane_copy_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
+{\
+    int c_w = (align) / sizeof(pixel) - 1; /* pixels per align bytes, minus one; used below as a low-bit mask on w */\
+    if( w < 256 ) /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\
+        x264_plane_copy_c( dst, i_dst, src, i_src, w, h );\
+    else if( !(w&c_w) ) /* no masked low bits set in w: hand it to the core unchanged */\
+        x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\
+    else\
+    {\
+        if( --h > 0 ) /* hold one row back for the memcpy below */\
+        {\
+            if( i_src > 0 ) /* positive stride: core copies the first h rows, then dst/src step to the held-back row */\
+            {\
+                x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h ); /* width rounded up to clear the masked bits */\
+                dst += i_dst * h;\
+                src += i_src * h;\
+            }\
+            else /* otherwise row 0 is the held-back row; the core starts one stride further on */\
+                x264_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
+        }\
+        /* use plain memcpy on the last line (in memory order) to avoid overreading src. */\
+        memcpy( dst, src, w*sizeof(pixel) );\
+    }\
+}
+
+void x264_plane_copy_swap_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
+
+#define PLANE_COPY_SWAP(align, cpu) /* defines x264_plane_copy_swap_<cpu>, a wrapper around x264_plane_copy_swap_core_<cpu> */\
+static void x264_plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
+{\
+    int c_w = (align>>1) / sizeof(pixel) - 1; /* half of align, in pixels, minus one; used below as a low-bit mask on w */\
+    if( !(w&c_w) ) /* no masked low bits set in w: hand it to the core unchanged */\
+        x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, w, h );\
+    else if( w > c_w ) /* unaligned width, but larger than the mask */\
+    {\
+        if( --h > 0 ) /* hold one row back; it is handled separately below */\
+        {\
+            if( i_src > 0 ) /* positive stride: core handles the first h rows, then dst/src step to the held-back row */\
+            {\
+                x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h ); /* width rounded up to clear the masked bits */\
+                dst += i_dst * h;\
+                src += i_src * h;\
+            }\
+            else /* otherwise row 0 is the held-back row; the core starts one stride further on */\
+                x264_plane_copy_swap_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
+        }\
+        x264_plane_copy_swap_core_##cpu( dst, 0, src, 0, w&~c_w, 1 ); /* held-back row: core call with width rounded down, one row, zero strides */\
+        for( int x = 2*(w&~c_w); x < 2*w; x += 2 ) /* remaining element pairs of that row are swapped in C */\
+        {\
+            dst[x]   = src[x+1];\
+            dst[x+1] = src[x];\
+        }\
+    }\
+    else /* w <= c_w: everything goes through the C implementation */\
+        x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\
+}
+
+void x264_plane_copy_interleave_c( pixel *dst,  intptr_t i_dst,
+                                   pixel *srcu, intptr_t i_srcu,
+                                   pixel *srcv, intptr_t i_srcv, int w, int h );
+
+#define PLANE_INTERLEAVE(cpu) /* defines x264_plane_copy_interleave_<cpu>, a wrapper around x264_plane_copy_interleave_core_<cpu> */ \
+static void x264_plane_copy_interleave_##cpu( pixel *dst,  intptr_t i_dst,\
+                                              pixel *srcu, intptr_t i_srcu,\
+                                              pixel *srcv, intptr_t i_srcv, int w, int h )\
+{\
+    int c_w = 16 / sizeof(pixel) - 1; /* pixels per 16 bytes, minus one; used below as a low-bit mask on w */\
+    if( !(w&c_w) ) /* no masked low bits set in w: hand it to the core unchanged */\
+        x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
+    else if( w > c_w && (i_srcu ^ i_srcv) >= 0 ) /* only works correctly for strides with identical signs */\
+    {\
+        if( --h > 0 ) /* hold one row back; it is interleaved in C below */\
+        {\
+            if( i_srcu > 0 ) /* positive stride: core handles the first h rows, then all pointers step to the held-back row */\
+            {\
+                x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+c_w)&~c_w, h ); /* width rounded up to clear the masked bits */\
+                dst  += i_dst  * h;\
+                srcu += i_srcu * h;\
+                srcv += i_srcv * h;\
+            }\
+            else /* otherwise row 0 is the held-back row; the core starts one stride further on */\
+                x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+c_w)&~c_w, h );\
+        }\
+        x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 ); /* held-back row: exact width w, one row, zero strides */\
+    }\
+    else /* w <= c_w, or source strides of differing sign: everything goes through the C implementation */\
+        x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
+}
+
  struct x264_weight_t;
  typedef void (* weight_fn_t)( pixel *, intptr_t, pixel *,intptr_t, const struct x264_weight_t *, int );
  typedef struct x264_weight_t
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c

index 11e481e1f1b429b1d4889cd03f3c102c1489d000..21acdebd917e6b1bd284501982aa70e8c0de2ef8 100644 (file)
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -88,10 +88,8 @@ void x264_prefetch_fenc_422_mmx2( pixel *, intptr_t, pixel *, intptr_t, int );
  void x264_prefetch_ref_mmx2( pixel *, intptr_t, int );
  void x264_plane_copy_core_sse( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
  void x264_plane_copy_core_avx( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
-void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
  void x264_plane_copy_swap_core_ssse3( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
  void x264_plane_copy_swap_core_avx2 ( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
-void x264_plane_copy_swap_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
  void x264_plane_copy_interleave_core_mmx2( pixel *dst,  intptr_t i_dst,
                                             pixel *srcu, intptr_t i_srcu,
                                             pixel *srcv, intptr_t i_srcv, int w, int h );
@@ -101,9 +99,6 @@ void x264_plane_copy_interleave_core_sse2( pixel *dst,  intptr_t i_dst,
  void x264_plane_copy_interleave_core_avx( pixel *dst,  intptr_t i_dst,
                                            pixel *srcu, intptr_t i_srcu,
                                            pixel *srcv, intptr_t i_srcv, int w, int h );
-void x264_plane_copy_interleave_c( pixel *dst,  intptr_t i_dst,
-                                   pixel *srcu, intptr_t i_srcu,
-                                   pixel *srcv, intptr_t i_srcv, int w, int h );
  void x264_plane_copy_deinterleave_mmx( pixel *dstu, intptr_t i_dstu,
                                         pixel *dstv, intptr_t i_dstv,
                                         pixel *src,  intptr_t i_src, int w, int h );
@@ -493,96 +488,12 @@ HPEL(32, avx2, avx2, avx2, avx2)
  #endif
  #endif // HIGH_BIT_DEPTH
  
-#define PLANE_COPY(align, cpu)\
-static void x264_plane_copy_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
-{\
-    int c_w = (align) / sizeof(pixel) - 1;\
-    if( w < 256 ) /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\
-        x264_plane_copy_c( dst, i_dst, src, i_src, w, h );\
-    else if( !(w&c_w) )\
-        x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\
-    else\
-    {\
-        if( --h > 0 )\
-        {\
-            if( i_src > 0 )\
-            {\
-                x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
-                dst += i_dst * h;\
-                src += i_src * h;\
-            }\
-            else\
-                x264_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
-        }\
-        /* use plain memcpy on the last line (in memory order) to avoid overreading src. */\
-        memcpy( dst, src, w*sizeof(pixel) );\
-    }\
-}
-
  PLANE_COPY(16, sse)
  PLANE_COPY(32, avx)
  
-#define PLANE_COPY_SWAP(align, cpu)\
-static void x264_plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
-{\
-    int c_w = (align>>1) / sizeof(pixel) - 1;\
-    if( !(w&c_w) )\
-        x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, w, h );\
-    else if( w > c_w )\
-    {\
-        if( --h > 0 )\
-        {\
-            if( i_src > 0 )\
-            {\
-                x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
-                dst += i_dst * h;\
-                src += i_src * h;\
-            }\
-            else\
-                x264_plane_copy_swap_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
-        }\
-        x264_plane_copy_swap_core_##cpu( dst, 0, src, 0, w&~c_w, 1 );\
-        for( int x = 2*(w&~c_w); x < 2*w; x += 2 )\
-        {\
-            dst[x]   = src[x+1];\
-            dst[x+1] = src[x];\
-        }\
-    }\
-    else\
-        x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\
-}
-
  PLANE_COPY_SWAP(16, ssse3)
  PLANE_COPY_SWAP(32, avx2)
  
-#define PLANE_INTERLEAVE(cpu) \
-static void x264_plane_copy_interleave_##cpu( pixel *dst,  intptr_t i_dst,\
-                                              pixel *srcu, intptr_t i_srcu,\
-                                              pixel *srcv, intptr_t i_srcv, int w, int h )\
-{\
-    int c_w = 16 / sizeof(pixel) - 1;\
-    if( !(w&c_w) )\
-        x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
-    else if( w > c_w && (i_srcu ^ i_srcv) >= 0 ) /* only works correctly for strides with identical signs */\
-    {\
-        if( --h > 0 )\
-        {\
-            if( i_srcu > 0 )\
-            {\
-                x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+c_w)&~c_w, h );\
-                dst  += i_dst  * h;\
-                srcu += i_srcu * h;\
-                srcv += i_srcv * h;\
-            }\
-            else\
-                x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+c_w)&~c_w, h );\
-        }\
-        x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\
-    }\
-    else\
-        x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
-}
-
  PLANE_INTERLEAVE(mmx2)
  PLANE_INTERLEAVE(sse2)
  #if HIGH_BIT_DEPTH
author	Janne Grunau <janne-x264@jannau.net>
	Fri, 26 Aug 2016 17:26:55 +0000 (20:26 +0300)
committer	Henrik Gramner <henrik@gramner.com>
	Sat, 17 Sep 2016 13:10:14 +0000 (15:10 +0200)
common/aarch64/mc-a.S		patch \| blob \| history
common/aarch64/mc-c.c		patch \| blob \| history
common/arm/mc-a.S		patch \| blob \| history
common/arm/mc-c.c		patch \| blob \| history
common/mc.h		patch \| blob \| history
common/x86/mc-c.c		patch \| blob \| history