{
#if HAVE_MMX
// {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
-// {"CMOV", X264_CPU_CMOV}, // we require this unconditionally, so don't print it
-#define MMX2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_CMOV
+#define MMX2 X264_CPU_MMX|X264_CPU_MMX2
{"MMX2", MMX2},
{"MMXEXT", MMX2},
{"SSE", MMX2|X264_CPU_SSE},
{"BMI2", AVX|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2},
#define AVX2 AVX|X264_CPU_FMA3|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2|X264_CPU_AVX2
{"AVX2", AVX2},
+ {"AVX512", AVX2|X264_CPU_AVX512},
#undef AVX2
#undef AVX
#undef SSE2
#undef MMX2
{"Cache32", X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_64},
- {"SlowCTZ", X264_CPU_SLOW_CTZ},
{"SlowAtom", X264_CPU_SLOW_ATOM},
{"SlowPshufb", X264_CPU_SLOW_PSHUFB},
{"SlowPalignr", X264_CPU_SLOW_PALIGNR},
#if HAVE_MMX
int x264_cpu_cpuid_test( void );
void x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
-void x264_cpu_xgetbv( uint32_t op, uint32_t *eax, uint32_t *edx );
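+/* Read the extended control register selected by 'xcr' (0 = XCR0); returns edx:eax packed into one 64-bit value. */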
+uint64_t x264_cpu_xgetbv( int xcr );
uint32_t x264_cpu_detect( void )
{
uint32_t eax, ebx, ecx, edx;
uint32_t vendor[4] = {0};
uint32_t max_extended_cap, max_basic_cap;
- int cache;
+ uint64_t xcr0 = 0;
#if !ARCH_X86_64
if( !x264_cpu_cpuid_test() )
return 0;
#endif
- x264_cpu_cpuid( 0, &eax, vendor+0, vendor+2, vendor+1 );
- max_basic_cap = eax;
+ x264_cpu_cpuid( 0, &max_basic_cap, vendor+0, vendor+2, vendor+1 );
if( max_basic_cap == 0 )
return 0;
return cpu;
if( edx&0x02000000 )
cpu |= X264_CPU_MMX2|X264_CPU_SSE;
- if( edx&0x00008000 )
- cpu |= X264_CPU_CMOV;
- else
- return cpu;
if( edx&0x04000000 )
cpu |= X264_CPU_SSE2;
if( ecx&0x00000001 )
cpu |= X264_CPU_SSE3;
if( ecx&0x00000200 )
- cpu |= X264_CPU_SSSE3;
+ cpu |= X264_CPU_SSSE3|X264_CPU_SSE2_IS_FAST;
if( ecx&0x00080000 )
cpu |= X264_CPU_SSE4;
if( ecx&0x00100000 )
cpu |= X264_CPU_SSE42;
- /* Check OXSAVE and AVX bits */
- if( (ecx&0x18000000) == 0x18000000 )
+
+ if( ecx&0x08000000 ) /* XGETBV supported and XSAVE enabled by OS */
{
- /* Check for OS support */
- x264_cpu_xgetbv( 0, &eax, &edx );
- if( (eax&0x6) == 0x6 )
+ xcr0 = x264_cpu_xgetbv( 0 );
+ if( (xcr0&0x6) == 0x6 ) /* XMM/YMM state */
{
- cpu |= X264_CPU_AVX;
+ if( ecx&0x10000000 )
+ cpu |= X264_CPU_AVX;
if( ecx&0x00001000 )
cpu |= X264_CPU_FMA3;
}
if( max_basic_cap >= 7 )
{
x264_cpu_cpuid( 7, &eax, &ebx, &ecx, &edx );
- /* AVX2 requires OS support, but BMI1/2 don't. */
- if( (cpu&X264_CPU_AVX) && (ebx&0x00000020) )
- cpu |= X264_CPU_AVX2;
+
if( ebx&0x00000008 )
- {
cpu |= X264_CPU_BMI1;
- if( ebx&0x00000100 )
- cpu |= X264_CPU_BMI2;
+ if( ebx&0x00000100 )
+ cpu |= X264_CPU_BMI2;
+
+ if( (xcr0&0x6) == 0x6 ) /* XMM/YMM state */
+ {
+ if( ebx&0x00000020 )
+ cpu |= X264_CPU_AVX2;
+
+ if( (xcr0&0xE0) == 0xE0 ) /* OPMASK/ZMM state */
+ {
+ if( (ebx&0xD0030000) == 0xD0030000 )
+ cpu |= X264_CPU_AVX512;
+ }
}
}
- if( cpu & X264_CPU_SSSE3 )
- cpu |= X264_CPU_SSE2_IS_FAST;
-
x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
max_extended_cap = eax;
{
if( edx&0x00400000 )
cpu |= X264_CPU_MMX2;
- if( !(cpu&X264_CPU_LZCNT) )
- cpu |= X264_CPU_SLOW_CTZ;
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_FAST) )
cpu |= X264_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
}
else if( model == 28 )
{
cpu |= X264_CPU_SLOW_ATOM;
- cpu |= X264_CPU_SLOW_CTZ;
cpu |= X264_CPU_SLOW_PSHUFB;
}
/* Conroe has a slow shuffle unit. Check the model number to make sure not
{
/* cacheline size is specified in 3 places, any of which may be missing */
x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
- cache = (ebx&0xff00)>>5; // cflush size
+ int cache = (ebx&0xff00)>>5; // clflush size
if( !cache && max_extended_cap >= 0x80000006 )
{
x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx );
* alignment between functions (osdep.h handles manual alignment of arrays
* if it doesn't).
*/
-#if (ARCH_X86 || STACK_ALIGNMENT > 16) && HAVE_MMX
+#if HAVE_MMX && (STACK_ALIGNMENT > 16 || (ARCH_X86 && STACK_ALIGNMENT > 4))
intptr_t x264_stack_align( void (*func)(), ... );
#define x264_stack_align(func,...) x264_stack_align((void (*)())func, __VA_ARGS__)
#else
typedef struct
{
- const char name[16];
+ const char *name;
uint32_t flags;
} x264_cpu_name_t;
extern const x264_cpu_name_t x264_cpu_names[];
#define EXPAND(x) x
#if ARCH_X86 || ARCH_X86_64
-#define NATIVE_ALIGN 32
+#define NATIVE_ALIGN 64
#define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 )
+#define ALIGNED_64( var ) DECLARE_ALIGNED( var, 64 )
#if STACK_ALIGNMENT >= 32
#define ALIGNED_ARRAY_32( type, name, sub1, ... ) ALIGNED_32( type name sub1 __VA_ARGS__ )
#else
#define ALIGNED_ARRAY_32( ... ) EXPAND( ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ ) )
#endif
+#if STACK_ALIGNMENT >= 64
+#define ALIGNED_ARRAY_64( type, name, sub1, ... ) ALIGNED_64( type name sub1 __VA_ARGS__ )
+#else
#define ALIGNED_ARRAY_64( ... ) EXPAND( ALIGNED_ARRAY_EMU( 63, __VA_ARGS__ ) )
+#endif
#else
#define NATIVE_ALIGN 16
#define ALIGNED_32 ALIGNED_16
+#define ALIGNED_64 ALIGNED_16
#define ALIGNED_ARRAY_32 ALIGNED_ARRAY_16
#define ALIGNED_ARRAY_64 ALIGNED_ARRAY_16
#endif
RET
;-----------------------------------------------------------------------------
-; void cpu_xgetbv( int op, int *eax, int *edx )
+; uint64_t cpu_xgetbv( int xcr )
;-----------------------------------------------------------------------------
-cglobal cpu_xgetbv, 3,7
- push r2
- push r1
- mov ecx, r0d
+cglobal cpu_xgetbv
+ movifnidn ecx, r0m
xgetbv
- pop r4
- mov [r4], eax
- pop r4
- mov [r4], edx
- RET
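+; xgetbv returns its result in edx:eax; fold it into rax on x86_64. On x86_32,
+; edx:eax is already the 64-bit return convention, so nothing more is needed.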
+%if ARCH_X86_64
+ shl rdx, 32
+ or rax, rdx
+%endif
+ ret
%if ARCH_X86_64
%if WIN64
sub rsp, 32 ; shadow space
%endif
- and rsp, ~31
+ and rsp, ~(STACK_ALIGNMENT-1)
mov rax, r0
mov r0, r1
mov r1, r2
push ebp
mov ebp, esp
sub esp, 12
- and esp, ~31
+ and esp, ~(STACK_ALIGNMENT-1)
mov ecx, [ebp+8]
mov edx, [ebp+12]
mov [esp], edx
%endmacro
%define required_stack_alignment ((mmsize + 15) & ~15)
+%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
+%define high_mm_regs (16*cpuflag(avx512))
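+; With AVX-512 on x86_64, functions that stay within (y/z)mm16-31 never dirty the
+; upper halves of ymm0-15, so they can skip vzeroupper; xmm16-31 are also volatile
+; on WIN64 and therefore don't count towards the callee-saved spill thresholds.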
%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
%ifnum %1
%macro WIN64_PUSH_XMM 0
; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
- %if xmm_regs_used > 6
+ %if xmm_regs_used > 6 + high_mm_regs
movaps [rstk + stack_offset + 8], xmm6
%endif
- %if xmm_regs_used > 7
+ %if xmm_regs_used > 7 + high_mm_regs
movaps [rstk + stack_offset + 24], xmm7
%endif
- %if xmm_regs_used > 8
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
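+ ; callee-saved registers beyond xmm6/xmm7 that need dedicated stack slots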
+ %if %%xmm_regs_on_stack > 0
%assign %%i 8
- %rep xmm_regs_used-8
+ %rep %%xmm_regs_on_stack
movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
%assign %%i %%i+1
%endrep
%macro WIN64_SPILL_XMM 1
%assign xmm_regs_used %1
- ASSERT xmm_regs_used <= 16
- %if xmm_regs_used > 8
+ ASSERT xmm_regs_used <= 16 + high_mm_regs
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+ %if %%xmm_regs_on_stack > 0
; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
- %assign %%pad (xmm_regs_used-8)*16 + 32
+ %assign %%pad %%xmm_regs_on_stack*16 + 32
%assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
SUB rsp, stack_size_padded
%endif
%macro WIN64_RESTORE_XMM_INTERNAL 0
%assign %%pad_size 0
- %if xmm_regs_used > 8
- %assign %%i xmm_regs_used
- %rep xmm_regs_used-8
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+ %if %%xmm_regs_on_stack > 0
+ %assign %%i xmm_regs_used - high_mm_regs
+ %rep %%xmm_regs_on_stack
%assign %%i %%i-1
movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
%endrep
%assign %%pad_size stack_size_padded
%endif
%endif
- %if xmm_regs_used > 7
+ %if xmm_regs_used > 7 + high_mm_regs
movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
%endif
- %if xmm_regs_used > 6
+ %if xmm_regs_used > 6 + high_mm_regs
movaps xmm6, [rsp + stack_offset - %%pad_size + 8]
%endif
%endmacro
%assign xmm_regs_used 0
%endmacro
-%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs
%macro RET 0
WIN64_RESTORE_XMM_INTERNAL
POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
- %if mmsize == 32
+ %if vzeroupper_required
vzeroupper
%endif
AUTO_REP_RET
DECLARE_REG 13, R12, 64
DECLARE_REG 14, R13, 72
-%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1
%assign regs_used %2
+ %assign xmm_regs_used %3
ASSERT regs_used >= num_args
SETUP_STACK_POINTER %4
ASSERT regs_used <= 15
DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
-%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
%macro RET 0
%if stack_size_padded > 0
%endif
%endif
POP_IF_USED 14, 13, 12, 11, 10, 9
- %if mmsize == 32
+ %if vzeroupper_required
vzeroupper
%endif
AUTO_REP_RET
DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
-%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
%macro RET 0
%if stack_size_padded > 0
%endif
%endif
POP_IF_USED 6, 5, 4, 3
- %if mmsize == 32
+ %if vzeroupper_required
vzeroupper
%endif
AUTO_REP_RET
%assign stack_offset 0 ; stack pointer offset relative to the return address
%assign stack_size 0 ; amount of stack space that can be freely used inside a function
%assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
- %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
+ %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
%ifnidn %3, ""
PROLOGUE %3
%endif
%assign cpuflags_bmi1 (1<<16)| cpuflags_avx|cpuflags_lzcnt
%assign cpuflags_bmi2 (1<<17)| cpuflags_bmi1
%assign cpuflags_avx2 (1<<18)| cpuflags_fma3|cpuflags_bmi2
-%assign cpuflags_cache32 (1<<19)
-%assign cpuflags_cache64 (1<<20)
-%assign cpuflags_slowctz (1<<21)
+%assign cpuflags_avx512 (1<<19)| cpuflags_avx2 ; F, CD, BW, DQ, VL
+%assign cpuflags_cache32 (1<<20)
+%assign cpuflags_cache64 (1<<21)
%assign cpuflags_aligned (1<<22) ; not a cpu feature, but a function variant
%assign cpuflags_atom (1<<23)
%endif
%endmacro
-; Merge mmx and sse*
+; Merge mmx, sse*, and avx*
; m# is a simd register of the currently selected size
; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
-; (All 3 remain in sync through SWAP.)
+; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
+; (All 4 remain in sync through SWAP.)
%macro CAT_XDEFINE 3
%xdefine %1%2 %3
%undef %1%2
%endmacro
+; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
+%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
+ %if ARCH_X86_64 && cpuflag(avx512)
+ %assign %%i %1
+ %rep 16-%1
+ %assign %%i_high %%i+16
+ SWAP %%i, %%i_high
+ %assign %%i %%i+1
+ %endrep
+ %endif
+%endmacro
+
%macro INIT_MMX 0-1+
%assign avx_enabled 0
%define RESET_MM_PERMUTATION INIT_MMX %1
CAT_XDEFINE nnmm, %%i, %%i
%assign %%i %%i+1
%endrep
- %rep 8
+ %rep 24
CAT_UNDEF m, %%i
CAT_UNDEF nnmm, %%i
%assign %%i %%i+1
%define mmsize 16
%define num_mmregs 8
%if ARCH_X86_64
- %define num_mmregs 16
+ %define num_mmregs 32
%endif
%define mova movdqa
%define movu movdqu
%assign %%i %%i+1
%endrep
INIT_CPUFLAGS %1
+ %if WIN64
+ ; Swap callee-saved registers with volatile registers
+ AVX512_MM_PERMUTATION 6
+ %endif
%endmacro
%macro INIT_YMM 0-1+
%define mmsize 32
%define num_mmregs 8
%if ARCH_X86_64
- %define num_mmregs 16
+ %define num_mmregs 32
%endif
%define mova movdqa
%define movu movdqu
%assign %%i %%i+1
%endrep
INIT_CPUFLAGS %1
+ AVX512_MM_PERMUTATION
+%endmacro
+
+%macro INIT_ZMM 0-1+
+ %assign avx_enabled 1
+ %define RESET_MM_PERMUTATION INIT_ZMM %1
+ %define mmsize 64
+ %define num_mmregs 8
+ %if ARCH_X86_64
+ %define num_mmregs 32
+ %endif
+ %define mova movdqa
+ %define movu movdqu
+ %undef movh
+ %define movnta movntdq
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, zmm %+ %%i
+ CAT_XDEFINE nnzmm, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ INIT_CPUFLAGS %1
+ AVX512_MM_PERMUTATION
%endmacro
INIT_XMM
%define mmmm%1 mm%1
%define mmxmm%1 mm%1
%define mmymm%1 mm%1
+ %define mmzmm%1 mm%1
%define xmmmm%1 mm%1
%define xmmxmm%1 xmm%1
%define xmmymm%1 xmm%1
+ %define xmmzmm%1 xmm%1
%define ymmmm%1 mm%1
%define ymmxmm%1 xmm%1
%define ymmymm%1 ymm%1
+ %define ymmzmm%1 ymm%1
+ %define zmmmm%1 mm%1
+ %define zmmxmm%1 xmm%1
+ %define zmmymm%1 ymm%1
+ %define zmmzmm%1 zmm%1
%define xm%1 xmm %+ m%1
%define ym%1 ymm %+ m%1
+ %define zm%1 zmm %+ m%1
%endmacro
%assign i 0
-%rep 16
+%rep 32
DECLARE_MMCAST i
%assign i i+1
%endrep
;=============================================================================
%assign i 0
-%rep 16
+%rep 32
%if i < 8
CAT_XDEFINE sizeofmm, i, 8
+ CAT_XDEFINE regnumofmm, i, i
%endif
CAT_XDEFINE sizeofxmm, i, 16
CAT_XDEFINE sizeofymm, i, 32
+ CAT_XDEFINE sizeofzmm, i, 64
+ CAT_XDEFINE regnumofxmm, i, i
+ CAT_XDEFINE regnumofymm, i, i
+ CAT_XDEFINE regnumofzmm, i, i
%assign i i+1
%endrep
%undef i
%endmacro
%endmacro
-; Instructions with both VEX and non-VEX encodings
+; Instructions with both VEX/EVEX and legacy encodings
; Non-destructive instructions are written without parameters
AVX_INSTR addpd, sse2, 1, 0, 1
AVX_INSTR addps, sse, 1, 0, 1
FMA4_INSTR fmsubadd, pd, ps
FMA4_INSTR fnmadd, pd, ps, sd, ss
FMA4_INSTR fnmsub, pd, ps, sd, ss
+
+; Macros for converting VEX instructions to equivalent EVEX ones.
+%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
+ %macro %1 2-7 fnord, fnord, %1, %2, %3
+ %ifidn %3, fnord
+ %define %%args %1, %2
+ %elifidn %4, fnord
+ %define %%args %1, %2, %3
+ %else
+ %define %%args %1, %2, %3, %4
+ %endif
+ %assign %%evex_required cpuflag(avx512) & %7
+ %ifnum regnumof%1
+ %if regnumof%1 >= 16 || sizeof%1 > 32
+ %assign %%evex_required 1
+ %endif
+ %endif
+ %ifnum regnumof%2
+ %if regnumof%2 >= 16 || sizeof%2 > 32
+ %assign %%evex_required 1
+ %endif
+ %endif
+ %if %%evex_required
+ %6 %%args
+ %else
+ %5 %%args ; Prefer VEX over EVEX due to shorter instruction length
+ %endif
+ %endmacro
+%endmacro
+
+EVEX_INSTR vbroadcastf128, vbroadcastf32x4
+EVEX_INSTR vbroadcasti128, vbroadcasti32x4
+EVEX_INSTR vextractf128, vextractf32x4
+EVEX_INSTR vextracti128, vextracti32x4
+EVEX_INSTR vinsertf128, vinsertf32x4
+EVEX_INSTR vinserti128, vinserti32x4
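+; AVX-512F only provides dword/qword element sizes for the full-width moves and
+; bitwise logic ops, hence the 32/d-suffixed EVEX forms below.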
+EVEX_INSTR vmovdqa, vmovdqa32
+EVEX_INSTR vmovdqu, vmovdqu32
+EVEX_INSTR vpand, vpandd
+EVEX_INSTR vpandn, vpandnd
+EVEX_INSTR vpor, vpord
+EVEX_INSTR vpxor, vpxord
+EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision
+EVEX_INSTR vrcpss, vrcp14ss, 1
+EVEX_INSTR vrsqrtps, vrsqrt14ps, 1
+EVEX_INSTR vrsqrtss, vrsqrt14ss, 1
fi
if [ $compiler = GNU -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
- if cc_check '' -mpreferred-stack-boundary=5 ; then
+ if cc_check '' -mpreferred-stack-boundary=6 ; then
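+ # 2^6 = 64 bytes; with a 64-byte-aligned stack, AVX-512 functions need no manual stack realignment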
+ CFLAGS="$CFLAGS -mpreferred-stack-boundary=6"
+ stack_alignment=64
+ elif cc_check '' -mpreferred-stack-boundary=5 ; then
CFLAGS="$CFLAGS -mpreferred-stack-boundary=5"
stack_alignment=32
elif [ $stack_alignment -lt 16 ] && cc_check '' -mpreferred-stack-boundary=4 ; then
fail = 1;
}
#endif
- if( !fail && !(cpuflags & X264_CPU_CMOV) )
- {
- x264_log( h, X264_LOG_ERROR, "your cpu does not support CMOV, but x264 was compiled with asm\n");
- fail = 1;
- }
if( fail )
{
x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm (configure --disable-asm)\n");
if( !ok ) ret = -1; \
}
-#define BENCH_RUNS 100 // tradeoff between accuracy and speed
-#define BENCH_ALIGNS 16 // number of stack+heap data alignments (another accuracy vs speed tradeoff)
+#define BENCH_RUNS 2000 // tradeoff between accuracy and speed
#define MAX_FUNCS 1000 // just has to be big enough to hold all the existing functions
#define MAX_CPUS 30 // number of different combinations of cpu flags
continue;
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
#if HAVE_MMX
+ b->cpu&X264_CPU_AVX512 ? "avx512" :
b->cpu&X264_CPU_AVX2 ? "avx2" :
b->cpu&X264_CPU_BMI2 ? "bmi2" :
b->cpu&X264_CPU_BMI1 ? "bmi1" :
x264_quant_init( &h, cpu_new, &h.quantf );
h.quantf.coeff_last[DCT_CHROMA_DC] = h.quantf.coeff_last4;
+/* Reset cabac state to avoid buffer overruns in do_bench() with large BENCH_RUNS values. */
+#define GET_CB( i ) (\
+ x264_cabac_encode_init( &cb[i], bitstream[i], bitstream[i]+0xfff0 ),\
+ cb[i].f8_bits_encoded = 0, &cb[i] )
+
#define CABAC_RESIDUAL(name, start, end, rd)\
{\
if( bs_a.name##_internal && (bs_a.name##_internal != bs_ref.name##_internal || (cpu_new&X264_CPU_SSE2_IS_SLOW)) )\
x264_cabac_t cb[2];\
x264_cabac_context_init( &h, &cb[0], SLICE_TYPE_P, 26, 0 );\
x264_cabac_context_init( &h, &cb[1], SLICE_TYPE_P, 26, 0 );\
- x264_cabac_encode_init( &cb[0], bitstream[0], bitstream[0]+0xfff0 );\
- x264_cabac_encode_init( &cb[1], bitstream[1], bitstream[1]+0xfff0 );\
- cb[0].f8_bits_encoded = 0;\
- cb[1].f8_bits_encoded = 0;\
if( !rd ) memcpy( bitstream[1], bitstream[0], 0x400 );\
- call_c1( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\
- call_a1( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\
+ call_c1( x264_##name##_c, &h, GET_CB( 0 ), ctx_block_cat, dct[0]+ac );\
+ call_a1( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, GET_CB( 1 ) );\
ok = cb[0].f8_bits_encoded == cb[1].f8_bits_encoded && !memcmp(cb[0].state, cb[1].state, 1024);\
if( !rd ) ok |= !memcmp( bitstream[1], bitstream[0], 0x400 ) && !memcmp( &cb[1], &cb[0], offsetof(x264_cabac_t, p_start) );\
if( !ok )\
}\
if( (j&15) == 0 )\
{\
- call_c2( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\
- call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\
+ call_c2( x264_##name##_c, &h, GET_CB( 0 ), ctx_block_cat, dct[0]+ac );\
+ call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, GET_CB( 1 ) );\
}\
}\
}\
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_32, "MMX Cache32" );
cpu1 &= ~X264_CPU_CACHELINE_32;
#endif
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" );
- cpu1 &= ~X264_CPU_SLOW_CTZ;
}
if( cpu_detect & X264_CPU_SSE )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE, "SSE" );
cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSE2 SlowShuffle" );
cpu1 &= ~X264_CPU_SLOW_SHUFFLE;
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
- cpu1 &= ~X264_CPU_SLOW_CTZ;
}
if( cpu_detect & X264_CPU_LZCNT )
{
cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSSE3 SlowShuffle" );
cpu1 &= ~X264_CPU_SLOW_SHUFFLE;
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" );
- cpu1 &= ~X264_CPU_SLOW_CTZ;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSSE3 SlowAtom" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64 SlowAtom" );
cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI2, "BMI2" );
if( cpu_detect & X264_CPU_AVX2 )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" );
+ if( cpu_detect & X264_CPU_AVX512 )
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX512, "AVX512" );
#elif ARCH_PPC
if( cpu_detect & X264_CPU_ALTIVEC )
{
int main(int argc, char *argv[])
{
- int ret = 0;
-
#ifdef _WIN32
/* Disable the Windows Error Reporting dialog */
SetErrorMode( SEM_NOGPFAULTERRORBOX );
fprintf( stderr, "x264: using random seed %u\n", seed );
srand( seed );
- buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) + 32*BENCH_ALIGNS );
- pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) + 32*BENCH_ALIGNS );
+ buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) );
+ pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) );
if( !buf1 || !pbuf1 )
{
fprintf( stderr, "malloc failed, unable to initiate tests!\n" );
}
memset( buf1+0x1e00, 0, 0x2000*sizeof(pixel) );
- /* 32-byte alignment is guaranteed whenever it's useful, but some functions also vary in speed depending on %64 */
- if( do_bench )
- for( int i = 0; i < BENCH_ALIGNS && !ret; i++ )
- {
- INIT_POINTER_OFFSETS;
- ret |= x264_stack_pagealign( check_all_flags, i*32 );
- buf1 += 32;
- pbuf1 += 32;
- quiet = 1;
- fprintf( stderr, "%d/%d\r", i+1, BENCH_ALIGNS );
- }
- else
- ret = x264_stack_pagealign( check_all_flags, 0 );
-
- if( ret )
+ if( x264_stack_pagealign( check_all_flags, 0 ) )
{
fprintf( stderr, "x264: at least one test has failed. Go and fix that Right Now!\n" );
return -1;
#include "x264_config.h"
-#define X264_BUILD 149
+#define X264_BUILD 150
/* Application developers planning to link against a shared library version of
* libx264 from a Microsoft Visual Studio or similar development environment
/* CPU flags */
/* x86 */
-#define X264_CPU_CMOV 0x0000001
-#define X264_CPU_MMX 0x0000002
-#define X264_CPU_MMX2 0x0000004 /* MMX2 aka MMXEXT aka ISSE */
-#define X264_CPU_MMXEXT X264_CPU_MMX2
-#define X264_CPU_SSE 0x0000008
-#define X264_CPU_SSE2 0x0000010
-#define X264_CPU_SSE3 0x0000020
-#define X264_CPU_SSSE3 0x0000040
-#define X264_CPU_SSE4 0x0000080 /* SSE4.1 */
-#define X264_CPU_SSE42 0x0000100 /* SSE4.2 */
-#define X264_CPU_LZCNT 0x0000200 /* Phenom support for "leading zero count" instruction. */
-#define X264_CPU_AVX 0x0000400 /* AVX support: requires OS support even if YMM registers aren't used. */
-#define X264_CPU_XOP 0x0000800 /* AMD XOP */
-#define X264_CPU_FMA4 0x0001000 /* AMD FMA4 */
-#define X264_CPU_FMA3 0x0002000 /* FMA3 */
-#define X264_CPU_AVX2 0x0004000 /* AVX2 */
-#define X264_CPU_BMI1 0x0008000 /* BMI1 */
-#define X264_CPU_BMI2 0x0010000 /* BMI2 */
+#define X264_CPU_MMX (1<<0)
+#define X264_CPU_MMX2 (1<<1) /* MMX2 aka MMXEXT aka ISSE */
+#define X264_CPU_MMXEXT X264_CPU_MMX2
+#define X264_CPU_SSE (1<<2)
+#define X264_CPU_SSE2 (1<<3)
+#define X264_CPU_LZCNT (1<<4)
+#define X264_CPU_SSE3 (1<<5)
+#define X264_CPU_SSSE3 (1<<6)
+#define X264_CPU_SSE4 (1<<7) /* SSE4.1 */
+#define X264_CPU_SSE42 (1<<8) /* SSE4.2 */
+#define X264_CPU_AVX (1<<9) /* Requires OS support even if YMM registers aren't used */
+#define X264_CPU_XOP (1<<10) /* AMD XOP */
+#define X264_CPU_FMA4 (1<<11) /* AMD FMA4 */
+#define X264_CPU_FMA3 (1<<12)
+#define X264_CPU_BMI1 (1<<13)
+#define X264_CPU_BMI2 (1<<14)
+#define X264_CPU_AVX2 (1<<15)
+#define X264_CPU_AVX512 (1<<16) /* AVX-512 {F, CD, BW, DQ, VL}, requires OS support */
/* x86 modifiers */
-#define X264_CPU_CACHELINE_32 0x0020000 /* avoid memory loads that span the border between two cachelines */
-#define X264_CPU_CACHELINE_64 0x0040000 /* 32/64 is the size of a cacheline in bytes */
-#define X264_CPU_SSE2_IS_SLOW 0x0080000 /* avoid most SSE2 functions on Athlon64 */
-#define X264_CPU_SSE2_IS_FAST 0x0100000 /* a few functions are only faster on Core2 and Phenom */
-#define X264_CPU_SLOW_SHUFFLE 0x0200000 /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */
-#define X264_CPU_STACK_MOD4 0x0400000 /* if stack is only mod4 and not mod16 */
-#define X264_CPU_SLOW_CTZ 0x0800000 /* BSR/BSF x86 instructions are really slow on some CPUs */
-#define X264_CPU_SLOW_ATOM 0x1000000 /* The Atom is terrible: slow SSE unaligned loads, slow
+#define X264_CPU_CACHELINE_32 (1<<17) /* avoid memory loads that span the border between two cachelines */
+#define X264_CPU_CACHELINE_64 (1<<18) /* 32/64 is the size of a cacheline in bytes */
+#define X264_CPU_SSE2_IS_SLOW (1<<19) /* avoid most SSE2 functions on Athlon64 */
+#define X264_CPU_SSE2_IS_FAST (1<<20) /* a few functions are only faster on Core2 and Phenom */
+#define X264_CPU_SLOW_SHUFFLE (1<<21) /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */
+#define X264_CPU_STACK_MOD4 (1<<22) /* if stack is only mod4 and not mod16 */
+#define X264_CPU_SLOW_ATOM (1<<23) /* The Atom is terrible: slow SSE unaligned loads, slow
* SIMD multiplies, slow SIMD variable shifts, slow pshufb,
* cacheline split penalties -- gather everything here that
* isn't shared by other CPUs to avoid making half a dozen
* new SLOW flags. */
-#define X264_CPU_SLOW_PSHUFB 0x2000000 /* such as on the Intel Atom */
-#define X264_CPU_SLOW_PALIGNR 0x4000000 /* such as on the AMD Bobcat */
+#define X264_CPU_SLOW_PSHUFB (1<<24) /* such as on the Intel Atom */
+#define X264_CPU_SLOW_PALIGNR (1<<25) /* such as on the AMD Bobcat */
/* PowerPC */
#define X264_CPU_ALTIVEC 0x0000001