x86: AVX-512 memzero_aligned

author Henrik Gramner <henrik@gramner.com>

Sun, 9 Apr 2017 18:34:28 +0000 (20:34 +0200)

committer Henrik Gramner <henrik@gramner.com>

Sun, 21 May 2017 21:14:24 +0000 (23:14 +0200)
author Henrik Gramner <henrik@gramner.com>
Sun, 9 Apr 2017 18:34:28 +0000 (20:34 +0200)
committer Henrik Gramner <henrik@gramner.com>
Sun, 21 May 2017 21:14:24 +0000 (23:14 +0200)
diff --git a/common/common.h b/common/common.h

index c20b584032b0da4882e5c21d1ff2e7253792e4f7..8931529c2796532edac4e4896199d76305866bf8 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -788,16 +788,17 @@ struct x264_t
              ALIGNED_64( dctcoef i4x4_dct_buf[15][16] );
              uint32_t i4x4_nnz_buf[4];
              uint32_t i8x8_nnz_buf[4];
-            int i4x4_cbp;
-            int i8x8_cbp;
  
              /* Psy trellis DCT data */
              ALIGNED_16( dctcoef fenc_dct8[4][64] );
              ALIGNED_16( dctcoef fenc_dct4[16][16] );
  
              /* Psy RD SATD/SA8D scores cache */
-            ALIGNED_32( uint64_t fenc_hadamard_cache[9] );
-            ALIGNED_32( uint32_t fenc_satd_cache[32] );
+            ALIGNED_64( uint32_t fenc_satd_cache[32] );
+            ALIGNED_16( uint64_t fenc_hadamard_cache[9] );
+
+            int i4x4_cbp;
+            int i8x8_cbp;
  
              /* pointer over mb of the frame to be compressed */
              pixel *p_fenc[3]; /* y,u,v */
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm

index 5d04ba971403a4710e4d0f5b38dd83102b8d9c7f..45692ff58181fad0324ee8a797dd2f44b3079c48 100644 (file)
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -1507,12 +1507,32 @@ cglobal memcpy_aligned, 3,3
      RET
  %endmacro
  
+;-----------------------------------------------------------------------------
+; void memzero_aligned( void *dst, size_t n );
+;-----------------------------------------------------------------------------
+%macro MEMZERO 0
+cglobal memzero_aligned, 2,2
+    xorps m0, m0                  ; m0 = 0, the value written by every store below
+.loop:
+%assign %%i mmsize
+%rep 128 / mmsize
+    movaps [r0 + r1 - %%i], m0    ; one of 128/mmsize stores filling the 128 bytes that end at r0+r1
+%assign %%i %%i+mmsize
+%endrep
+    sub r1d, 128                  ; move the end offset down by the 128 bytes just cleared
+    jg .loop                      ; repeat while the remaining count is positive; NOTE(review): presumably n is a non-zero multiple of 128 - confirm with callers
+    RET
+%endmacro
+
  INIT_XMM sse
  MEMCPY
+MEMZERO
  INIT_YMM avx
  MEMCPY
-
+MEMZERO
  INIT_ZMM avx512
+MEMZERO
+
  cglobal memcpy_aligned, 3,4
      dec      r2d           ; offset of the last byte
      rorx     r3d, r2d, 2
@@ -1533,36 +1553,6 @@ cglobal memcpy_aligned, 3,4
  .ret:
      RET
  
-;-----------------------------------------------------------------------------
-; void *memzero_aligned( void *dst, size_t n );
-;-----------------------------------------------------------------------------
-%macro MEMZERO 1
-cglobal memzero_aligned, 2,2
-    add  r0, r1
-    neg  r1
-%if mmsize == 8
-    pxor m0, m0
-%else
-    xorps m0, m0
-%endif
-.loop:
-%assign i 0
-%rep %1
-    mova [r0 + r1 + i], m0
-%assign i i+mmsize
-%endrep
-    add r1, mmsize*%1
-    jl .loop
-    RET
-%endmacro
-
-INIT_MMX mmx
-MEMZERO 8
-INIT_XMM sse
-MEMZERO 8
-INIT_YMM avx
-MEMZERO 4
-
  %if HIGH_BIT_DEPTH == 0
  ;-----------------------------------------------------------------------------
  ; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride )
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c

index d4f37f5477ac3cdec58aa52995c7f88dc3189b35..b7f508a279f9352470310f9929c45a00c0d1d694 100644 (file)
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -146,9 +146,9 @@ void x264_load_deinterleave_chroma_fdec_avx2( uint16_t *dst, uint16_t *src, intp
  void *x264_memcpy_aligned_sse   ( void *dst, const void *src, size_t n );
  void *x264_memcpy_aligned_avx   ( void *dst, const void *src, size_t n );
  void *x264_memcpy_aligned_avx512( void *dst, const void *src, size_t n );
-void x264_memzero_aligned_mmx( void *dst, size_t n );
-void x264_memzero_aligned_sse( void *dst, size_t n );
-void x264_memzero_aligned_avx( void *dst, size_t n );
+void x264_memzero_aligned_sse   ( void *dst, size_t n );
+void x264_memzero_aligned_avx   ( void *dst, size_t n );
+void x264_memzero_aligned_avx512( void *dst, size_t n );
  void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
  void x264_integral_init4h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride );
  void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
@@ -559,7 +559,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
      pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx;
      pf->copy[PIXEL_8x8]   = x264_mc_copy_w8_mmx;
      pf->copy[PIXEL_4x4]   = x264_mc_copy_w4_mmx;
-    pf->memzero_aligned = x264_memzero_aligned_mmx;
      pf->integral_init4v = x264_integral_init4v_mmx;
      pf->integral_init8v = x264_integral_init8v_mmx;
  
@@ -871,5 +870,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
      if( !(cpu&X264_CPU_AVX512) )
          return;
      pf->memcpy_aligned = x264_memcpy_aligned_avx512;
+    pf->memzero_aligned = x264_memzero_aligned_avx512;
      pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx512;
  }
diff --git a/encoder/analyse.c b/encoder/analyse.c

index d19cbbbd57e3ac5ae3b95c9e4841763f62f8c3fc..8bb83fbf9fd4c441625d607b0bb5517dc7f71c95 100644 (file)
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -695,8 +695,12 @@ static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
          x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
      if( !h->mb.i_psy_rd )
          return;
-    /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
-    h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
+
+    M128( &h->mb.pic.fenc_hadamard_cache[0] ) = M128_ZERO;
+    M128( &h->mb.pic.fenc_hadamard_cache[2] ) = M128_ZERO;
+    M128( &h->mb.pic.fenc_hadamard_cache[4] ) = M128_ZERO;
+    M128( &h->mb.pic.fenc_hadamard_cache[6] ) = M128_ZERO;
+    h->mb.pic.fenc_hadamard_cache[8] = 0;
      if( b_satd )
          h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
  }
diff --git a/encoder/me.c b/encoder/me.c

index 58a39dcf74fb1977a775e521d3ddccb3a42cf8a4..094fc5da87c4bdb5fd977e74bafff74e5f5a1ea2 100644 (file)
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -1059,7 +1059,7 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
      uint64_t bcostrd = COST_MAX64;
      uint16_t amvd;
      /* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
-    ALIGNED_ARRAY_32( uint8_t, visited,[8],[8][8] );
+    ALIGNED_ARRAY_64( uint8_t, visited,[8],[8][8] );
      /* all permutations of an offset in up to 2 of the dimensions */
      ALIGNED_4( static const int8_t dia4d[33][4] ) =
      {
diff --git a/tools/checkasm.c b/tools/checkasm.c

index 8879b10566b35939c65f87ca8297b2efe2775d77..4c84ade921ce3d4e59dcc742eaa935cc82c94919 100644 (file)
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1878,10 +1878,10 @@ static int check_mc( int cpu_ref, int cpu_new )
          ok = 1; used_asm = 1;
          for( size_t size = 128; size < 1024; size += 128 )
          {
-            memset( buf4, 0xAA, size + 1 );
+            memset( buf4-1, 0xAA, size + 2 );
              call_c( mc_c.memzero_aligned, buf3, size );
              call_a( mc_a.memzero_aligned, buf4, size );
-            if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA )
+            if( memcmp( buf3, buf4, size ) || buf4[-1] != 0xAA || buf4[size] != 0xAA )
              {
                  ok = 0;
                  fprintf( stderr, "memzero_aligned FAILED: size=%d\n", (int)size );
author	Henrik Gramner <henrik@gramner.com>
	Sun, 9 Apr 2017 18:34:28 +0000 (20:34 +0200)
committer	Henrik Gramner <henrik@gramner.com>
	Sun, 21 May 2017 21:14:24 +0000 (23:14 +0200)
common/common.h		patch \| blob \| history
common/x86/mc-a2.asm		patch \| blob \| history
common/x86/mc-c.c		patch \| blob \| history
encoder/analyse.c		patch \| blob \| history
encoder/me.c		patch \| blob \| history
tools/checkasm.c		patch \| blob \| history