%undef sfence
%endif ; !HIGH_BIT_DEPTH
+%macro PREFETCHNT_ITER 2 ; src, bytes/iteration
+ %assign %%i 4*(%2) ; prefetch 4 iterations ahead. is this optimal?
+ %rep (%2+63) / 64 ; assume 64-byte cache lines
+ prefetchnta [%1+%%i]
+ %assign %%i %%i + 64
+ %endrep
+%endmacro
+
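For readers who don't speak NASM macros, here is roughly what one PREFETCHNT_ITER invocation does, expressed with SSE intrinsics (a sketch under the same assumptions; prefetchnt_iter is a hypothetical name, not part of x264):

    #include <xmmintrin.h>

    /* Sketch of one PREFETCHNT_ITER expansion: one non-temporal prefetch
     * per 64-byte cache line, targeting the data the copy loop will read
     * 4 iterations from now. */
    static inline void prefetchnt_iter( const char *src, int bytes_per_iter )
    {
        const char *ahead = src + 4*bytes_per_iter; /* 4 iterations ahead */
        for( int i = 0; i < bytes_per_iter; i += 64 )
            _mm_prefetch( ahead + i, _MM_HINT_NTA );
    }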
;-----------------------------------------------------------------------------
; void plane_copy_core( pixel *dst, intptr_t i_dst,
; pixel *src, intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
-; assumes i_dst and w are multiples of 16, and i_dst>w
-INIT_MMX
-cglobal plane_copy_core_mmx2, 6,7
+; assumes i_dst and w are multiples of mmsize, and i_dst>w
+%macro PLANE_COPY_CORE 0
+cglobal plane_copy_core, 6,7
FIX_STRIDES r1, r3, r4d
%if HIGH_BIT_DEPTH == 0
movsxdifnidn r4, r4d
%endif
- sub r1, r4
- sub r3, r4
+ add r0, r4
+ add r2, r4
+ neg r4
.loopy:
- lea r6d, [r4-63]
+ lea r6, [r4+4*mmsize]
.loopx:
- prefetchnta [r2+256]
- movq m0, [r2 ]
- movq m1, [r2+ 8]
- movntq [r0 ], m0
- movntq [r0+ 8], m1
- movq m2, [r2+16]
- movq m3, [r2+24]
- movntq [r0+16], m2
- movntq [r0+24], m3
- movq m4, [r2+32]
- movq m5, [r2+40]
- movntq [r0+32], m4
- movntq [r0+40], m5
- movq m6, [r2+48]
- movq m7, [r2+56]
- movntq [r0+48], m6
- movntq [r0+56], m7
- add r2, 64
- add r0, 64
- sub r6d, 64
- jg .loopx
- prefetchnta [r2+256]
- add r6d, 63
- jle .end16
-.loop16:
- movq m0, [r2 ]
- movq m1, [r2+8]
- movntq [r0 ], m0
- movntq [r0+8], m1
- add r2, 16
- add r0, 16
- sub r6d, 16
- jg .loop16
-.end16:
+ PREFETCHNT_ITER r2+r6, 4*mmsize
+ movu m0, [r2+r6-4*mmsize]
+ movu m1, [r2+r6-3*mmsize]
+ movu m2, [r2+r6-2*mmsize]
+ movu m3, [r2+r6-1*mmsize]
+ movnta [r0+r6-4*mmsize], m0
+ movnta [r0+r6-3*mmsize], m1
+ movnta [r0+r6-2*mmsize], m2
+ movnta [r0+r6-1*mmsize], m3
+ add r6, 4*mmsize
+ jle .loopx
+ PREFETCHNT_ITER r2+r6, 4*mmsize
+ sub r6, 4*mmsize
+ jz .end
+.loop_end:
+ movu m0, [r2+r6]
+ movnta [r0+r6], m0
+ add r6, mmsize
+ jl .loop_end
+.end:
add r0, r1
add r2, r3
- dec r5d
+ dec r5d
jg .loopy
sfence
- emms
RET
+%endmacro
+INIT_XMM sse
+PLANE_COPY_CORE
+INIT_YMM avx
+PLANE_COPY_CORE
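The rewritten loop replaces the unrolled MMX version with the usual negative-offset idiom: r0 and r2 are advanced to the end of each row, r6 counts up from -width toward zero, and the loop condition falls out of the add for free. A C analog of that control flow (illustrative only; plane_copy_row is a made-up name, and w is assumed to be a multiple of mmsize, as the comment above requires):

    #include <stdint.h>
    #include <string.h>

    /* Illustrative analog of the loop structure above: index from the end
     * of the row with a negative offset that rises to zero. Each 16-byte
     * memcpy stands in for one movu/movnta pair (mmsize == 16 for SSE). */
    static void plane_copy_row( uint8_t *dst, const uint8_t *src, intptr_t w )
    {
        dst += w;
        src += w;
        for( intptr_t x = -w; x < 0; x += 16 )
            memcpy( dst + x, src + x, 16 );
    }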
%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
%if HIGH_BIT_DEPTH
void x264_prefetch_fenc_420_mmx2( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_prefetch_fenc_422_mmx2( pixel *, intptr_t, pixel *, intptr_t, int );
void x264_prefetch_ref_mmx2( pixel *, intptr_t, int );
-void x264_plane_copy_core_mmx2( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
+void x264_plane_copy_core_sse( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
+void x264_plane_copy_core_avx( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
void x264_plane_copy_interleave_core_mmx2( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
#endif
#endif // HIGH_BIT_DEPTH
-static void x264_plane_copy_mmx2( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )
-{
- int c_w = 16/sizeof(pixel) - 1;
- if( w < 256 ) { // tiny resolutions don't want non-temporal hints. dunno the exact threshold.
- x264_plane_copy_c( dst, i_dst, src, i_src, w, h );
- } else if( !(w&c_w) ) {
- x264_plane_copy_core_mmx2( dst, i_dst, src, i_src, w, h );
- } else if( i_src > 0 ) {
- // have to use plain memcpy on the last line (in memory order) to avoid overreading src
- x264_plane_copy_core_mmx2( dst, i_dst, src, i_src, (w+c_w)&~c_w, h-1 );
- memcpy( dst+i_dst*(h-1), src+i_src*(h-1), w*sizeof(pixel) );
- } else {
- memcpy( dst, src, w*sizeof(pixel) );
- x264_plane_copy_core_mmx2( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h-1 );
- }
+#define PLANE_COPY(align, cpu)\
+static void x264_plane_copy_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
+{\
+ int c_w = (align) / sizeof(pixel) - 1;\
+ if( w < 256 ) /* tiny resolutions don't want non-temporal hints; the exact threshold is unknown. */\
+ x264_plane_copy_c( dst, i_dst, src, i_src, w, h );\
+ else if( !(w&c_w) )\
+ x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\
+ else\
+ {\
+ if( --h > 0 )\
+ {\
+ if( i_src > 0 )\
+ {\
+ x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
+ dst += i_dst * h;\
+ src += i_src * h;\
+ }\
+ else\
+ x264_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
+ }\
+ /* use plain memcpy on the last line (in memory order) to avoid overreading src. */\
+ memcpy( dst, src, w*sizeof(pixel) );\
+ }\
}
+PLANE_COPY(16, sse)
+PLANE_COPY(32, avx)
+
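Concrete numbers make the tail logic easier to follow. With 8-bit pixels and the SSE variant (align 16), c_w is 15 and (w+c_w)&~c_w rounds the width up to the next multiple of 16, so the SIMD core may overread up to 15 bytes of src per row; that is harmless while another row follows it in memory, which is why only the last line (in memory order) falls back to memcpy. A tiny standalone check of the rounding (hypothetical widths):

    #include <stdio.h>

    int main( void )
    {
        int c_w = 16 - 1; /* align 16, sizeof(pixel) == 1 */
        for( int w = 96; w <= 112; w += 4 )
            printf( "w=%3d -> core width %3d (overread %2d bytes)\n",
                    w, (w+c_w)&~c_w, ((w+c_w)&~c_w) - w );
        return 0;
    }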
#define PLANE_INTERLEAVE(cpu) \
static void x264_plane_copy_interleave_##cpu( pixel *dst, intptr_t i_dst,\
pixel *srcu, intptr_t i_srcu,\
pf->prefetch_fenc_422 = x264_prefetch_fenc_422_mmx2;
pf->prefetch_ref = x264_prefetch_ref_mmx2;
- pf->plane_copy = x264_plane_copy_mmx2;
pf->plane_copy_interleave = x264_plane_copy_interleave_mmx2;
pf->store_interleave_chroma = x264_store_interleave_chroma_mmx2;
{
pf->memcpy_aligned = x264_memcpy_aligned_sse;
pf->memzero_aligned = x264_memzero_aligned_sse;
+ pf->plane_copy = x264_plane_copy_sse;
}
#if HIGH_BIT_DEPTH
if( !(cpu&X264_CPU_AVX) )
return;
pf->memzero_aligned = x264_memzero_aligned_avx;
+ pf->plane_copy = x264_plane_copy_avx;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx;