x86: AVX-512 support

author Henrik Gramner <henrik@gramner.com>
Sat, 25 Mar 2017 09:16:09 +0000 (10:16 +0100)

committer Henrik Gramner <henrik@gramner.com>
Sun, 21 May 2017 20:42:15 +0000 (22:42 +0200)
diff --git a/common/cpu.c b/common/cpu.c

index 172957ea0bb99db39ffa159139b4369f633ad45e..863818628b4c027c78fcff55838ec96e4af8fdad 100644 (file)
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -47,8 +47,7 @@ const x264_cpu_name_t x264_cpu_names[] =
  {
  #if HAVE_MMX
  //  {"MMX",         X264_CPU_MMX},  // we don't support asm on mmx1 cpus anymore
-//  {"CMOV",        X264_CPU_CMOV}, // we require this unconditionally, so don't print it
-#define MMX2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_CMOV
+#define MMX2 X264_CPU_MMX|X264_CPU_MMX2
      {"MMX2",        MMX2},
      {"MMXEXT",      MMX2},
      {"SSE",         MMX2|X264_CPU_SSE},
@@ -71,13 +70,13 @@ const x264_cpu_name_t x264_cpu_names[] =
      {"BMI2",        AVX|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2},
  #define AVX2 AVX|X264_CPU_FMA3|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2|X264_CPU_AVX2
      {"AVX2",        AVX2},
+    {"AVX512",      AVX2|X264_CPU_AVX512},
  #undef AVX2
  #undef AVX
  #undef SSE2
  #undef MMX2
      {"Cache32",         X264_CPU_CACHELINE_32},
      {"Cache64",         X264_CPU_CACHELINE_64},
-    {"SlowCTZ",         X264_CPU_SLOW_CTZ},
      {"SlowAtom",        X264_CPU_SLOW_ATOM},
      {"SlowPshufb",      X264_CPU_SLOW_PSHUFB},
      {"SlowPalignr",     X264_CPU_SLOW_PALIGNR},
@@ -120,7 +119,7 @@ static void sigill_handler( int sig )
  #if HAVE_MMX
  int x264_cpu_cpuid_test( void );
  void x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
-void x264_cpu_xgetbv( uint32_t op, uint32_t *eax, uint32_t *edx );
+uint64_t x264_cpu_xgetbv( int xcr );
  
  uint32_t x264_cpu_detect( void )
  {
@@ -128,15 +127,14 @@ uint32_t x264_cpu_detect( void )
      uint32_t eax, ebx, ecx, edx;
      uint32_t vendor[4] = {0};
      uint32_t max_extended_cap, max_basic_cap;
-    int cache;
+    uint64_t xcr0 = 0;
  
  #if !ARCH_X86_64
      if( !x264_cpu_cpuid_test() )
          return 0;
  #endif
  
-    x264_cpu_cpuid( 0, &eax, vendor+0, vendor+2, vendor+1 );
-    max_basic_cap = eax;
+    x264_cpu_cpuid( 0, &max_basic_cap, vendor+0, vendor+2, vendor+1 );
      if( max_basic_cap == 0 )
          return 0;
  
@@ -147,28 +145,24 @@ uint32_t x264_cpu_detect( void )
          return cpu;
      if( edx&0x02000000 )
          cpu |= X264_CPU_MMX2|X264_CPU_SSE;
-    if( edx&0x00008000 )
-        cpu |= X264_CPU_CMOV;
-    else
-        return cpu;
      if( edx&0x04000000 )
          cpu |= X264_CPU_SSE2;
      if( ecx&0x00000001 )
          cpu |= X264_CPU_SSE3;
      if( ecx&0x00000200 )
-        cpu |= X264_CPU_SSSE3;
+        cpu |= X264_CPU_SSSE3|X264_CPU_SSE2_IS_FAST;
      if( ecx&0x00080000 )
          cpu |= X264_CPU_SSE4;
      if( ecx&0x00100000 )
          cpu |= X264_CPU_SSE42;
-    /* Check OXSAVE and AVX bits */
-    if( (ecx&0x18000000) == 0x18000000 )
+
+    if( ecx&0x08000000 ) /* XGETBV supported and XSAVE enabled by OS */
      {
-        /* Check for OS support */
-        x264_cpu_xgetbv( 0, &eax, &edx );
-        if( (eax&0x6) == 0x6 )
+        xcr0 = x264_cpu_xgetbv( 0 );
+        if( (xcr0&0x6) == 0x6 ) /* XMM/YMM state */
          {
-            cpu |= X264_CPU_AVX;
+            if( ecx&0x10000000 )
+                cpu |= X264_CPU_AVX;
              if( ecx&0x00001000 )
                  cpu |= X264_CPU_FMA3;
          }
@@ -177,20 +171,25 @@ uint32_t x264_cpu_detect( void )
      if( max_basic_cap >= 7 )
      {
          x264_cpu_cpuid( 7, &eax, &ebx, &ecx, &edx );
-        /* AVX2 requires OS support, but BMI1/2 don't. */
-        if( (cpu&X264_CPU_AVX) && (ebx&0x00000020) )
-            cpu |= X264_CPU_AVX2;
+
          if( ebx&0x00000008 )
-        {
              cpu |= X264_CPU_BMI1;
-            if( ebx&0x00000100 )
-                cpu |= X264_CPU_BMI2;
+        if( ebx&0x00000100 )
+            cpu |= X264_CPU_BMI2;
+
+        if( (xcr0&0x6) == 0x6 ) /* XMM/YMM state */
+        {
+            if( ebx&0x00000020 )
+                cpu |= X264_CPU_AVX2;
+
+            if( (xcr0&0xE0) == 0xE0 ) /* OPMASK/ZMM state */
+            {
+                if( (ebx&0xD0030000) == 0xD0030000 )
+                    cpu |= X264_CPU_AVX512;
+            }
          }
      }
  
-    if( cpu & X264_CPU_SSSE3 )
-        cpu |= X264_CPU_SSE2_IS_FAST;
-
      x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
      max_extended_cap = eax;
  
@@ -230,8 +229,6 @@ uint32_t x264_cpu_detect( void )
          {
              if( edx&0x00400000 )
                  cpu |= X264_CPU_MMX2;
-            if( !(cpu&X264_CPU_LZCNT) )
-                cpu |= X264_CPU_SLOW_CTZ;
              if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_FAST) )
                  cpu |= X264_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
          }
@@ -256,7 +253,6 @@ uint32_t x264_cpu_detect( void )
              else if( model == 28 )
              {
                  cpu |= X264_CPU_SLOW_ATOM;
-                cpu |= X264_CPU_SLOW_CTZ;
                  cpu |= X264_CPU_SLOW_PSHUFB;
              }
              /* Conroe has a slow shuffle unit. Check the model number to make sure not
@@ -270,7 +266,7 @@ uint32_t x264_cpu_detect( void )
      {
          /* cacheline size is specified in 3 places, any of which may be missing */
          x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
-        cache = (ebx&0xff00)>>5; // cflush size
+        int cache = (ebx&0xff00)>>5; // cflush size
          if( !cache && max_extended_cap >= 0x80000006 )
          {
              x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx );
diff --git a/common/cpu.h b/common/cpu.h

index eec1be29ef973f7d11b49eb157f3eddfd8d54849..845034c40e74d00c46169095a22ad884fad9c3bf 100644 (file)
--- a/common/cpu.h
+++ b/common/cpu.h
@@ -56,7 +56,7 @@ void     x264_cpu_sfence( void );
   * alignment between functions (osdep.h handles manual alignment of arrays
   * if it doesn't).
   */
-#if (ARCH_X86 || STACK_ALIGNMENT > 16) && HAVE_MMX
+#if HAVE_MMX && (STACK_ALIGNMENT > 16 || (ARCH_X86 && STACK_ALIGNMENT > 4))
  intptr_t x264_stack_align( void (*func)(), ... );
  #define x264_stack_align(func,...) x264_stack_align((void (*)())func, __VA_ARGS__)
  #else
@@ -65,7 +65,7 @@ intptr_t x264_stack_align( void (*func)(), ... );
  
  typedef struct
  {
-    const char name[16];
+    const char *name;
      uint32_t flags;
  } x264_cpu_name_t;
  extern const x264_cpu_name_t x264_cpu_names[];
diff --git a/common/osdep.h b/common/osdep.h

index 3ff86fcb89c4a4dfbdd28a84a8f18f2e251b3436..ca2455d00449d62aa469d62d579ae767e46902bf 100644 (file)
--- a/common/osdep.h
+++ b/common/osdep.h
@@ -139,17 +139,23 @@ int x264_is_pipe( const char *path );
  #define EXPAND(x) x
  
  #if ARCH_X86 || ARCH_X86_64
-#define NATIVE_ALIGN 32
+#define NATIVE_ALIGN 64
  #define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 )
+#define ALIGNED_64( var ) DECLARE_ALIGNED( var, 64 )
  #if STACK_ALIGNMENT >= 32
  #define ALIGNED_ARRAY_32( type, name, sub1, ... ) ALIGNED_32( type name sub1 __VA_ARGS__ )
  #else
  #define ALIGNED_ARRAY_32( ... ) EXPAND( ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ ) )
  #endif
+#if STACK_ALIGNMENT >= 64
+#define ALIGNED_ARRAY_64( type, name, sub1, ... ) ALIGNED_64( type name sub1 __VA_ARGS__ )
+#else
  #define ALIGNED_ARRAY_64( ... ) EXPAND( ALIGNED_ARRAY_EMU( 63, __VA_ARGS__ ) )
+#endif
  #else
  #define NATIVE_ALIGN 16
  #define ALIGNED_32 ALIGNED_16
+#define ALIGNED_64 ALIGNED_16
  #define ALIGNED_ARRAY_32 ALIGNED_ARRAY_16
  #define ALIGNED_ARRAY_64 ALIGNED_ARRAY_16
  #endif
diff --git a/common/x86/cpu-a.asm b/common/x86/cpu-a.asm

index c961903ac1ccde5d90859191ce411d4ca193e544..4692f653d0f2f11da28a2f3152db1139e2bf0228 100644 (file)
--- a/common/x86/cpu-a.asm
+++ b/common/x86/cpu-a.asm
@@ -53,18 +53,16 @@ cglobal cpu_cpuid, 5,7
      RET
  
  ;-----------------------------------------------------------------------------
-; void cpu_xgetbv( int op, int *eax, int *edx )
+; uint64_t cpu_xgetbv( int xcr )
  ;-----------------------------------------------------------------------------
-cglobal cpu_xgetbv, 3,7
-    push  r2
-    push  r1
-    mov  ecx, r0d
+cglobal cpu_xgetbv
+    movifnidn ecx, r0m
      xgetbv
-    pop   r4
-    mov [r4], eax
-    pop   r4
-    mov [r4], edx
-    RET
+%if ARCH_X86_64
+    shl       rdx, 32
+    or        rax, rdx
+%endif
+    ret
  
  %if ARCH_X86_64
  
@@ -77,7 +75,7 @@ cglobal stack_align
  %if WIN64
      sub  rsp, 32 ; shadow space
  %endif
-    and  rsp, ~31
+    and  rsp, ~(STACK_ALIGNMENT-1)
      mov  rax, r0
      mov   r0, r1
      mov   r1, r2
@@ -118,7 +116,7 @@ cglobal stack_align
      push ebp
      mov  ebp, esp
      sub  esp, 12
-    and  esp, ~31
+    and  esp, ~(STACK_ALIGNMENT-1)
      mov  ecx, [ebp+8]
      mov  edx, [ebp+12]
      mov  [esp], edx
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm

index f7ab79381bf9b30621e744132ffc4ce2e465e7eb..856b021f90903b05d2e99cf11b67e54448da1a22 100644 (file)
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -323,6 +323,8 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
  %endmacro
  
  %define required_stack_alignment ((mmsize + 15) & ~15)
+%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
+%define high_mm_regs (16*cpuflag(avx512))
  
  %macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
      %ifnum %1
@@ -436,15 +438,16 @@ DECLARE_REG 14, R13, 120
  
  %macro WIN64_PUSH_XMM 0
      ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
-    %if xmm_regs_used > 6
+    %if xmm_regs_used > 6 + high_mm_regs
          movaps [rstk + stack_offset +  8], xmm6
      %endif
-    %if xmm_regs_used > 7
+    %if xmm_regs_used > 7 + high_mm_regs
          movaps [rstk + stack_offset + 24], xmm7
      %endif
-    %if xmm_regs_used > 8
+    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+    %if %%xmm_regs_on_stack > 0
          %assign %%i 8
-        %rep xmm_regs_used-8
+        %rep %%xmm_regs_on_stack
              movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
              %assign %%i %%i+1
          %endrep
@@ -453,10 +456,11 @@ DECLARE_REG 14, R13, 120
  
  %macro WIN64_SPILL_XMM 1
      %assign xmm_regs_used %1
-    ASSERT xmm_regs_used <= 16
-    %if xmm_regs_used > 8
+    ASSERT xmm_regs_used <= 16 + high_mm_regs
+    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+    %if %%xmm_regs_on_stack > 0
          ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
-        %assign %%pad (xmm_regs_used-8)*16 + 32
+        %assign %%pad %%xmm_regs_on_stack*16 + 32
          %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
          SUB rsp, stack_size_padded
      %endif
@@ -465,9 +469,10 @@ DECLARE_REG 14, R13, 120
  
  %macro WIN64_RESTORE_XMM_INTERNAL 0
      %assign %%pad_size 0
-    %if xmm_regs_used > 8
-        %assign %%i xmm_regs_used
-        %rep xmm_regs_used-8
+    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+    %if %%xmm_regs_on_stack > 0
+        %assign %%i xmm_regs_used - high_mm_regs
+        %rep %%xmm_regs_on_stack
              %assign %%i %%i-1
              movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
          %endrep
@@ -480,10 +485,10 @@ DECLARE_REG 14, R13, 120
              %assign %%pad_size stack_size_padded
          %endif
      %endif
-    %if xmm_regs_used > 7
+    %if xmm_regs_used > 7 + high_mm_regs
          movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
      %endif
-    %if xmm_regs_used > 6
+    %if xmm_regs_used > 6 + high_mm_regs
          movaps xmm6, [rsp + stack_offset - %%pad_size +  8]
      %endif
  %endmacro
@@ -495,12 +500,12 @@ DECLARE_REG 14, R13, 120
      %assign xmm_regs_used 0
  %endmacro
  
-%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs
  
  %macro RET 0
      WIN64_RESTORE_XMM_INTERNAL
      POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
-    %if mmsize == 32
+    %if vzeroupper_required
          vzeroupper
      %endif
      AUTO_REP_RET
@@ -524,9 +529,10 @@ DECLARE_REG 12, R15, 56
  DECLARE_REG 13, R12, 64
  DECLARE_REG 14, R13, 72
  
-%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
      %assign num_args %1
      %assign regs_used %2
+    %assign xmm_regs_used %3
      ASSERT regs_used >= num_args
      SETUP_STACK_POINTER %4
      ASSERT regs_used <= 15
@@ -536,7 +542,7 @@ DECLARE_REG 14, R13, 72
      DEFINE_ARGS_INTERNAL %0, %4, %5
  %endmacro
  
-%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
  
  %macro RET 0
      %if stack_size_padded > 0
@@ -547,7 +553,7 @@ DECLARE_REG 14, R13, 72
          %endif
      %endif
      POP_IF_USED 14, 13, 12, 11, 10, 9
-    %if mmsize == 32
+    %if vzeroupper_required
          vzeroupper
      %endif
      AUTO_REP_RET
@@ -592,7 +598,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
      DEFINE_ARGS_INTERNAL %0, %4, %5
  %endmacro
  
-%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
  
  %macro RET 0
      %if stack_size_padded > 0
@@ -603,7 +609,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
          %endif
      %endif
      POP_IF_USED 6, 5, 4, 3
-    %if mmsize == 32
+    %if vzeroupper_required
          vzeroupper
      %endif
      AUTO_REP_RET
@@ -713,7 +719,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
      %assign stack_offset 0      ; stack pointer offset relative to the return address
      %assign stack_size 0        ; amount of stack space that can be freely used inside a function
      %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
-    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
+    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
      %ifnidn %3, ""
          PROLOGUE %3
      %endif
@@ -788,10 +794,10 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
  %assign cpuflags_bmi1     (1<<16)| cpuflags_avx|cpuflags_lzcnt
  %assign cpuflags_bmi2     (1<<17)| cpuflags_bmi1
  %assign cpuflags_avx2     (1<<18)| cpuflags_fma3|cpuflags_bmi2
+%assign cpuflags_avx512   (1<<19)| cpuflags_avx2 ; F, CD, BW, DQ, VL
  
-%assign cpuflags_cache32  (1<<19)
-%assign cpuflags_cache64  (1<<20)
-%assign cpuflags_slowctz  (1<<21)
+%assign cpuflags_cache32  (1<<20)
+%assign cpuflags_cache64  (1<<21)
  %assign cpuflags_aligned  (1<<22) ; not a cpu feature, but a function variant
  %assign cpuflags_atom     (1<<23)
  
@@ -849,11 +855,12 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
      %endif
  %endmacro
  
-; Merge mmx and sse*
+; Merge mmx, sse*, and avx*
  ; m# is a simd register of the currently selected size
  ; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
  ; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
-; (All 3 remain in sync through SWAP.)
+; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
+; (All 4 remain in sync through SWAP.)
  
  %macro CAT_XDEFINE 3
      %xdefine %1%2 %3
@@ -863,6 +870,18 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
      %undef %1%2
  %endmacro
  
+; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
+%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
+    %if ARCH_X86_64 && cpuflag(avx512)
+        %assign %%i %1
+        %rep 16-%1
+            %assign %%i_high %%i+16
+            SWAP %%i, %%i_high
+            %assign %%i %%i+1
+        %endrep
+    %endif
+%endmacro
+
  %macro INIT_MMX 0-1+
      %assign avx_enabled 0
      %define RESET_MM_PERMUTATION INIT_MMX %1
@@ -878,7 +897,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
          CAT_XDEFINE nnmm, %%i, %%i
          %assign %%i %%i+1
      %endrep
-    %rep 8
+    %rep 24
          CAT_UNDEF m, %%i
          CAT_UNDEF nnmm, %%i
          %assign %%i %%i+1
@@ -892,7 +911,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
      %define mmsize 16
      %define num_mmregs 8
      %if ARCH_X86_64
-        %define num_mmregs 16
+        %define num_mmregs 32
      %endif
      %define mova movdqa
      %define movu movdqu
@@ -905,6 +924,10 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
          %assign %%i %%i+1
      %endrep
      INIT_CPUFLAGS %1
+    %if WIN64
+        ; Swap callee-saved registers with volatile registers
+        AVX512_MM_PERMUTATION 6
+    %endif
  %endmacro
  
  %macro INIT_YMM 0-1+
@@ -913,7 +936,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
      %define mmsize 32
      %define num_mmregs 8
      %if ARCH_X86_64
-        %define num_mmregs 16
+        %define num_mmregs 32
      %endif
      %define mova movdqa
      %define movu movdqu
@@ -926,6 +949,29 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
          %assign %%i %%i+1
      %endrep
      INIT_CPUFLAGS %1
+    AVX512_MM_PERMUTATION
+%endmacro
+
+%macro INIT_ZMM 0-1+
+    %assign avx_enabled 1
+    %define RESET_MM_PERMUTATION INIT_ZMM %1
+    %define mmsize 64
+    %define num_mmregs 8
+    %if ARCH_X86_64
+        %define num_mmregs 32
+    %endif
+    %define mova movdqa
+    %define movu movdqu
+    %undef movh
+    %define movnta movntdq
+    %assign %%i 0
+    %rep num_mmregs
+        CAT_XDEFINE m, %%i, zmm %+ %%i
+        CAT_XDEFINE nnzmm, %%i, %%i
+        %assign %%i %%i+1
+    %endrep
+    INIT_CPUFLAGS %1
+    AVX512_MM_PERMUTATION
  %endmacro
  
  INIT_XMM
@@ -934,18 +980,26 @@ INIT_XMM
      %define  mmmm%1   mm%1
      %define  mmxmm%1  mm%1
      %define  mmymm%1  mm%1
+    %define  mmzmm%1  mm%1
      %define xmmmm%1   mm%1
      %define xmmxmm%1 xmm%1
      %define xmmymm%1 xmm%1
+    %define xmmzmm%1 xmm%1
      %define ymmmm%1   mm%1
      %define ymmxmm%1 xmm%1
      %define ymmymm%1 ymm%1
+    %define ymmzmm%1 ymm%1
+    %define zmmmm%1   mm%1
+    %define zmmxmm%1 xmm%1
+    %define zmmymm%1 ymm%1
+    %define zmmzmm%1 zmm%1
      %define xm%1 xmm %+ m%1
      %define ym%1 ymm %+ m%1
+    %define zm%1 zmm %+ m%1
  %endmacro
  
  %assign i 0
-%rep 16
+%rep 32
      DECLARE_MMCAST i
      %assign i i+1
  %endrep
@@ -1080,12 +1134,17 @@ INIT_XMM
  ;=============================================================================
  
  %assign i 0
-%rep 16
+%rep 32
      %if i < 8
          CAT_XDEFINE sizeofmm, i, 8
+        CAT_XDEFINE regnumofmm, i, i
      %endif
      CAT_XDEFINE sizeofxmm, i, 16
      CAT_XDEFINE sizeofymm, i, 32
+    CAT_XDEFINE sizeofzmm, i, 64
+    CAT_XDEFINE regnumofxmm, i, i
+    CAT_XDEFINE regnumofymm, i, i
+    CAT_XDEFINE regnumofzmm, i, i
      %assign i i+1
  %endrep
  %undef i
@@ -1202,7 +1261,7 @@ INIT_XMM
      %endmacro
  %endmacro
  
-; Instructions with both VEX and non-VEX encodings
+; Instructions with both VEX/EVEX and legacy encodings
  ; Non-destructive instructions are written without parameters
  AVX_INSTR addpd, sse2, 1, 0, 1
  AVX_INSTR addps, sse, 1, 0, 1
@@ -1533,3 +1592,49 @@ FMA4_INSTR fmsub,    pd, ps, sd, ss
  FMA4_INSTR fmsubadd, pd, ps
  FMA4_INSTR fnmadd,   pd, ps, sd, ss
  FMA4_INSTR fnmsub,   pd, ps, sd, ss
+
+; Macros for converting VEX instructions to equivalent EVEX ones.
+%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
+    %macro %1 2-7 fnord, fnord, %1, %2, %3
+        %ifidn %3, fnord
+            %define %%args %1, %2
+        %elifidn %4, fnord
+            %define %%args %1, %2, %3
+        %else
+            %define %%args %1, %2, %3, %4
+        %endif
+        %assign %%evex_required cpuflag(avx512) & %7
+        %ifnum regnumof%1
+            %if regnumof%1 >= 16 || sizeof%1 > 32
+                %assign %%evex_required 1
+            %endif
+        %endif
+        %ifnum regnumof%2
+            %if regnumof%2 >= 16 || sizeof%2 > 32
+                %assign %%evex_required 1
+            %endif
+        %endif
+        %if %%evex_required
+            %6 %%args
+        %else
+            %5 %%args ; Prefer VEX over EVEX due to shorter instruction length
+        %endif
+    %endmacro
+%endmacro
+
+EVEX_INSTR vbroadcastf128, vbroadcastf32x4
+EVEX_INSTR vbroadcasti128, vbroadcasti32x4
+EVEX_INSTR vextractf128,   vextractf32x4
+EVEX_INSTR vextracti128,   vextracti32x4
+EVEX_INSTR vinsertf128,    vinsertf32x4
+EVEX_INSTR vinserti128,    vinserti32x4
+EVEX_INSTR vmovdqa,        vmovdqa32
+EVEX_INSTR vmovdqu,        vmovdqu32
+EVEX_INSTR vpand,          vpandd
+EVEX_INSTR vpandn,         vpandnd
+EVEX_INSTR vpor,           vpord
+EVEX_INSTR vpxor,          vpxord
+EVEX_INSTR vrcpps,         vrcp14ps,   1 ; EVEX versions have higher precision
+EVEX_INSTR vrcpss,         vrcp14ss,   1
+EVEX_INSTR vrsqrtps,       vrsqrt14ps, 1
+EVEX_INSTR vrsqrtss,       vrsqrt14ss, 1
diff --git a/configure b/configure

index 4ebaf57217069fb3360f11db6ab85f250ad3a887..52871c43c7959cf2ab02c06d6d443238a7137aab 100755 (executable)
--- a/configure
+++ b/configure
@@ -863,7 +863,10 @@ if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o
  fi
  
  if [ $compiler = GNU -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
-    if cc_check '' -mpreferred-stack-boundary=5 ; then
+    if cc_check '' -mpreferred-stack-boundary=6 ; then
+        CFLAGS="$CFLAGS -mpreferred-stack-boundary=6"
+        stack_alignment=64
+    elif cc_check '' -mpreferred-stack-boundary=5 ; then
          CFLAGS="$CFLAGS -mpreferred-stack-boundary=5"
          stack_alignment=32
      elif [ $stack_alignment -lt 16 ] && cc_check '' -mpreferred-stack-boundary=4 ; then
diff --git a/encoder/encoder.c b/encoder/encoder.c

index a4771c986bdf6b0383cc7beee0065506980f9319..4b7dcdac8ef0c83d60e9b3a4d06920a435079757 100644 (file)
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -444,11 +444,6 @@ static int x264_validate_parameters( x264_t *h, int b_open )
              fail = 1;
          }
  #endif
-        if( !fail && !(cpuflags & X264_CPU_CMOV) )
-        {
-            x264_log( h, X264_LOG_ERROR, "your cpu does not support CMOV, but x264 was compiled with asm\n");
-            fail = 1;
-        }
          if( fail )
          {
              x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm (configure --disable-asm)\n");
diff --git a/tools/checkasm.c b/tools/checkasm.c

index a72768e5b78317d35d4e828d484f2d9bda365be1..a2c2e492a6de69a7bea1ee8d18467491cdcf40b3 100644 (file)
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -57,8 +57,7 @@ int quiet = 0;
      if( !ok ) ret = -1; \
  }
  
-#define BENCH_RUNS 100  // tradeoff between accuracy and speed
-#define BENCH_ALIGNS 16 // number of stack+heap data alignments (another accuracy vs speed tradeoff)
+#define BENCH_RUNS 2000 // tradeoff between accuracy and speed
  #define MAX_FUNCS 1000  // just has to be big enough to hold all the existing functions
  #define MAX_CPUS 30     // number of different combinations of cpu flags
  
@@ -178,6 +177,7 @@ static void print_bench(void)
                  continue;
              printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
  #if HAVE_MMX
+                    b->cpu&X264_CPU_AVX512 ? "avx512" :
                      b->cpu&X264_CPU_AVX2 ? "avx2" :
                      b->cpu&X264_CPU_BMI2 ? "bmi2" :
                      b->cpu&X264_CPU_BMI1 ? "bmi1" :
@@ -2602,6 +2602,11 @@ static int check_cabac( int cpu_ref, int cpu_new )
      x264_quant_init( &h, cpu_new, &h.quantf );
      h.quantf.coeff_last[DCT_CHROMA_DC] = h.quantf.coeff_last4;
  
+/* Reset cabac state to avoid buffer overruns in do_bench() with large BENCH_RUNS values. */
+#define GET_CB( i ) (\
+    x264_cabac_encode_init( &cb[i], bitstream[i], bitstream[i]+0xfff0 ),\
+    cb[i].f8_bits_encoded = 0, &cb[i] )
+
  #define CABAC_RESIDUAL(name, start, end, rd)\
  {\
      if( bs_a.name##_internal && (bs_a.name##_internal != bs_ref.name##_internal || (cpu_new&X264_CPU_SSE2_IS_SLOW)) )\
@@ -2637,13 +2642,9 @@ static int check_cabac( int cpu_ref, int cpu_new )
                      x264_cabac_t cb[2];\
                      x264_cabac_context_init( &h, &cb[0], SLICE_TYPE_P, 26, 0 );\
                      x264_cabac_context_init( &h, &cb[1], SLICE_TYPE_P, 26, 0 );\
-                    x264_cabac_encode_init( &cb[0], bitstream[0], bitstream[0]+0xfff0 );\
-                    x264_cabac_encode_init( &cb[1], bitstream[1], bitstream[1]+0xfff0 );\
-                    cb[0].f8_bits_encoded = 0;\
-                    cb[1].f8_bits_encoded = 0;\
                      if( !rd ) memcpy( bitstream[1], bitstream[0], 0x400 );\
-                    call_c1( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\
-                    call_a1( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\
+                    call_c1( x264_##name##_c, &h, GET_CB( 0 ), ctx_block_cat, dct[0]+ac );\
+                    call_a1( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, GET_CB( 1 ) );\
                      ok = cb[0].f8_bits_encoded == cb[1].f8_bits_encoded && !memcmp(cb[0].state, cb[1].state, 1024);\
                      if( !rd ) ok |= !memcmp( bitstream[1], bitstream[0], 0x400 ) && !memcmp( &cb[1], &cb[0], offsetof(x264_cabac_t, p_start) );\
                      if( !ok )\
@@ -2656,8 +2657,8 @@ static int check_cabac( int cpu_ref, int cpu_new )
                      }\
                      if( (j&15) == 0 )\
                      {\
-                        call_c2( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\
-                        call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\
+                        call_c2( x264_##name##_c, &h, GET_CB( 0 ), ctx_block_cat, dct[0]+ac );\
+                        call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, GET_CB( 1 ) );\
                      }\
                  }\
              }\
@@ -2794,8 +2795,6 @@ static int check_all_flags( void )
          ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_32, "MMX Cache32" );
          cpu1 &= ~X264_CPU_CACHELINE_32;
  #endif
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" );
-        cpu1 &= ~X264_CPU_SLOW_CTZ;
      }
      if( cpu_detect & X264_CPU_SSE )
          ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE, "SSE" );
@@ -2807,8 +2806,6 @@ static int check_all_flags( void )
          cpu1 &= ~X264_CPU_CACHELINE_64;
          ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSE2 SlowShuffle" );
          cpu1 &= ~X264_CPU_SLOW_SHUFFLE;
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
-        cpu1 &= ~X264_CPU_SLOW_CTZ;
      }
      if( cpu_detect & X264_CPU_LZCNT )
      {
@@ -2827,8 +2824,6 @@ static int check_all_flags( void )
          cpu1 &= ~X264_CPU_CACHELINE_64;
          ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSSE3 SlowShuffle" );
          cpu1 &= ~X264_CPU_SLOW_SHUFFLE;
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" );
-        cpu1 &= ~X264_CPU_SLOW_CTZ;
          ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSSE3 SlowAtom" );
          ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64 SlowAtom" );
          cpu1 &= ~X264_CPU_CACHELINE_64;
@@ -2860,6 +2855,8 @@ static int check_all_flags( void )
          ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI2, "BMI2" );
      if( cpu_detect & X264_CPU_AVX2 )
          ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" );
+    if( cpu_detect & X264_CPU_AVX512 )
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX512, "AVX512" );
  #elif ARCH_PPC
      if( cpu_detect & X264_CPU_ALTIVEC )
      {
@@ -2889,8 +2886,6 @@ static int check_all_flags( void )
  
  int main(int argc, char *argv[])
  {
-    int ret = 0;
-
  #ifdef _WIN32
      /* Disable the Windows Error Reporting dialog */
      SetErrorMode( SEM_NOGPFAULTERRORBOX );
@@ -2916,8 +2911,8 @@ int main(int argc, char *argv[])
      fprintf( stderr, "x264: using random seed %u\n", seed );
      srand( seed );
  
-    buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) + 32*BENCH_ALIGNS );
-    pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) + 32*BENCH_ALIGNS );
+    buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) );
+    pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) );
      if( !buf1 || !pbuf1 )
      {
          fprintf( stderr, "malloc failed, unable to initiate tests!\n" );
@@ -2938,21 +2933,7 @@ int main(int argc, char *argv[])
      }
      memset( buf1+0x1e00, 0, 0x2000*sizeof(pixel) );
  
-    /* 32-byte alignment is guaranteed whenever it's useful, but some functions also vary in speed depending on %64 */
-    if( do_bench )
-        for( int i = 0; i < BENCH_ALIGNS && !ret; i++ )
-        {
-            INIT_POINTER_OFFSETS;
-            ret |= x264_stack_pagealign( check_all_flags, i*32 );
-            buf1 += 32;
-            pbuf1 += 32;
-            quiet = 1;
-            fprintf( stderr, "%d/%d\r", i+1, BENCH_ALIGNS );
-        }
-    else
-        ret = x264_stack_pagealign( check_all_flags, 0 );
-
-    if( ret )
+    if( x264_stack_pagealign( check_all_flags, 0 ) )
      {
          fprintf( stderr, "x264: at least one test has failed. Go and fix that Right Now!\n" );
          return -1;
diff --git a/x264.h b/x264.h

index 748b3b0a596eaef931f4cd76dbf100c16005e6ab..277090d0837fb87ab0c2ee46414bcc5b6985844c 100644 (file)
--- a/x264.h
+++ b/x264.h
@@ -45,7 +45,7 @@ extern "C" {
  
  #include "x264_config.h"
  
-#define X264_BUILD 149
+#define X264_BUILD 150
  
  /* Application developers planning to link against a shared library version of
   * libx264 from a Microsoft Visual Studio or similar development environment
@@ -119,39 +119,38 @@ typedef struct x264_nal_t
  /* CPU flags */
  
  /* x86 */
-#define X264_CPU_CMOV            0x0000001
-#define X264_CPU_MMX             0x0000002
-#define X264_CPU_MMX2            0x0000004  /* MMX2 aka MMXEXT aka ISSE */
-#define X264_CPU_MMXEXT          X264_CPU_MMX2
-#define X264_CPU_SSE             0x0000008
-#define X264_CPU_SSE2            0x0000010
-#define X264_CPU_SSE3            0x0000020
-#define X264_CPU_SSSE3           0x0000040
-#define X264_CPU_SSE4            0x0000080  /* SSE4.1 */
-#define X264_CPU_SSE42           0x0000100  /* SSE4.2 */
-#define X264_CPU_LZCNT           0x0000200  /* Phenom support for "leading zero count" instruction. */
-#define X264_CPU_AVX             0x0000400  /* AVX support: requires OS support even if YMM registers aren't used. */
-#define X264_CPU_XOP             0x0000800  /* AMD XOP */
-#define X264_CPU_FMA4            0x0001000  /* AMD FMA4 */
-#define X264_CPU_FMA3            0x0002000  /* FMA3 */
-#define X264_CPU_AVX2            0x0004000  /* AVX2 */
-#define X264_CPU_BMI1            0x0008000  /* BMI1 */
-#define X264_CPU_BMI2            0x0010000  /* BMI2 */
+#define X264_CPU_MMX                (1<<0)
+#define X264_CPU_MMX2               (1<<1)  /* MMX2 aka MMXEXT aka ISSE */
+#define X264_CPU_MMXEXT             X264_CPU_MMX2
+#define X264_CPU_SSE                (1<<2)
+#define X264_CPU_SSE2               (1<<3)
+#define X264_CPU_LZCNT              (1<<4)
+#define X264_CPU_SSE3               (1<<5)
+#define X264_CPU_SSSE3              (1<<6)
+#define X264_CPU_SSE4               (1<<7)  /* SSE4.1 */
+#define X264_CPU_SSE42              (1<<8)  /* SSE4.2 */
+#define X264_CPU_AVX                (1<<9)  /* Requires OS support even if YMM registers aren't used */
+#define X264_CPU_XOP                (1<<10) /* AMD XOP */
+#define X264_CPU_FMA4               (1<<11) /* AMD FMA4 */
+#define X264_CPU_FMA3               (1<<12)
+#define X264_CPU_BMI1               (1<<13)
+#define X264_CPU_BMI2               (1<<14)
+#define X264_CPU_AVX2               (1<<15)
+#define X264_CPU_AVX512             (1<<16) /* AVX-512 {F, CD, BW, DQ, VL}, requires OS support */
  /* x86 modifiers */
-#define X264_CPU_CACHELINE_32    0x0020000  /* avoid memory loads that span the border between two cachelines */
-#define X264_CPU_CACHELINE_64    0x0040000  /* 32/64 is the size of a cacheline in bytes */
-#define X264_CPU_SSE2_IS_SLOW    0x0080000  /* avoid most SSE2 functions on Athlon64 */
-#define X264_CPU_SSE2_IS_FAST    0x0100000  /* a few functions are only faster on Core2 and Phenom */
-#define X264_CPU_SLOW_SHUFFLE    0x0200000  /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */
-#define X264_CPU_STACK_MOD4      0x0400000  /* if stack is only mod4 and not mod16 */
-#define X264_CPU_SLOW_CTZ        0x0800000  /* BSR/BSF x86 instructions are really slow on some CPUs */
-#define X264_CPU_SLOW_ATOM       0x1000000  /* The Atom is terrible: slow SSE unaligned loads, slow
+#define X264_CPU_CACHELINE_32       (1<<17) /* avoid memory loads that span the border between two cachelines */
+#define X264_CPU_CACHELINE_64       (1<<18) /* 32/64 is the size of a cacheline in bytes */
+#define X264_CPU_SSE2_IS_SLOW       (1<<19) /* avoid most SSE2 functions on Athlon64 */
+#define X264_CPU_SSE2_IS_FAST       (1<<20) /* a few functions are only faster on Core2 and Phenom */
+#define X264_CPU_SLOW_SHUFFLE       (1<<21) /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */
+#define X264_CPU_STACK_MOD4         (1<<22) /* if stack is only mod4 and not mod16 */
+#define X264_CPU_SLOW_ATOM          (1<<23) /* The Atom is terrible: slow SSE unaligned loads, slow
                                               * SIMD multiplies, slow SIMD variable shifts, slow pshufb,
                                               * cacheline split penalties -- gather everything here that
                                               * isn't shared by other CPUs to avoid making half a dozen
                                               * new SLOW flags. */
-#define X264_CPU_SLOW_PSHUFB     0x2000000  /* such as on the Intel Atom */
-#define X264_CPU_SLOW_PALIGNR    0x4000000  /* such as on the AMD Bobcat */
+#define X264_CPU_SLOW_PSHUFB        (1<<24) /* such as on the Intel Atom */
+#define X264_CPU_SLOW_PALIGNR       (1<<25) /* such as on the AMD Bobcat */
  
  /* PowerPC */
  #define X264_CPU_ALTIVEC         0x0000001
author	Henrik Gramner <henrik@gramner.com>
	Sat, 25 Mar 2017 09:16:09 +0000 (10:16 +0100)
committer	Henrik Gramner <henrik@gramner.com>
	Sun, 21 May 2017 20:42:15 +0000 (22:42 +0200)
common/cpu.c		patch \| blob \| history
common/cpu.h		patch \| blob \| history
common/osdep.h		patch \| blob \| history
common/x86/cpu-a.asm		patch \| blob \| history
common/x86/x86inc.asm		patch \| blob \| history
configure		patch \| blob \| history
encoder/encoder.c		patch \| blob \| history
tools/checkasm.c		patch \| blob \| history
x264.h		patch \| blob \| history