From: Henrik Gramner Date: Sat, 2 Aug 2014 16:26:18 +0000 (+0200) Subject: x86: Make AVX2 also imply FMA3 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=8ae4e1cfa3d16451ccf285228d309f6f4940a747;p=libx264 x86: Make AVX2 also imply FMA3 All CPUs with AVX2 support FMA3 (but not the other way around). --- diff --git a/common/cpu.c b/common/cpu.c index 4877a015..cad5f2c2 100644 --- a/common/cpu.c +++ b/common/cpu.c @@ -67,8 +67,8 @@ const x264_cpu_name_t x264_cpu_names[] = {"AVX", AVX}, {"XOP", AVX|X264_CPU_XOP}, {"FMA4", AVX|X264_CPU_FMA4}, - {"AVX2", AVX|X264_CPU_AVX2}, {"FMA3", AVX|X264_CPU_FMA3}, + {"AVX2", AVX|X264_CPU_FMA3|X264_CPU_AVX2}, #undef AVX #undef SSE2 #undef MMX2 diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index d56ad4f6..9e1746c3 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -2136,7 +2136,7 @@ cglobal mbtree_propagate_cost, 6,6,%1 INIT_YMM avx MBTREE_AVX 8 -INIT_YMM avx2,fma3 +INIT_YMM avx2 MBTREE_AVX 7 %macro MBTREE_PROPAGATE_LIST 0 diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index d231a8c9..9101997f 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -167,8 +167,8 @@ void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); void x264_mbtree_propagate_cost_fma4( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); -void x264_mbtree_propagate_cost_avx2_fma3( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, - uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); +void x264_mbtree_propagate_cost_avx2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, + uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); #define MC_CHROMA(cpu)\ void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,\ @@ -938,7 +938,5 @@ 
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) if( !(cpu&X264_CPU_AVX2) ) return; pf->get_ref = get_ref_avx2; - - if( cpu&X264_CPU_FMA3 ) - pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2_fma3; + pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2; } diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm index 0f0dcdfc..67dfff6a 100644 --- a/common/x86/x86inc.asm +++ b/common/x86/x86inc.asm @@ -738,8 +738,8 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits %assign cpuflags_avx (1<<11)| cpuflags_sse42 %assign cpuflags_xop (1<<12)| cpuflags_avx %assign cpuflags_fma4 (1<<13)| cpuflags_avx -%assign cpuflags_avx2 (1<<14)| cpuflags_avx -%assign cpuflags_fma3 (1<<15)| cpuflags_avx +%assign cpuflags_fma3 (1<<14)| cpuflags_avx +%assign cpuflags_avx2 (1<<15)| cpuflags_fma3 %assign cpuflags_cache32 (1<<16) %assign cpuflags_cache64 (1<<17) diff --git a/tools/checkasm.c b/tools/checkasm.c index 7163976a..a348aa18 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -167,12 +167,12 @@ static void print_bench(void) continue; printf( "%s_%s%s: %"PRId64"\n", benchs[i].name, #if HAVE_MMX - b->cpu&X264_CPU_AVX2 && b->cpu&X264_CPU_FMA3 ? "avx2_fma3" : b->cpu&X264_CPU_AVX2 ? "avx2" : b->cpu&X264_CPU_FMA3 ? "fma3" : b->cpu&X264_CPU_FMA4 ? "fma4" : b->cpu&X264_CPU_XOP ? "xop" : b->cpu&X264_CPU_AVX ? "avx" : + b->cpu&X264_CPU_SSE42 ? "sse42" : b->cpu&X264_CPU_SSE4 ? "sse4" : b->cpu&X264_CPU_SSSE3 ? "ssse3" : b->cpu&X264_CPU_SSE3 ? 
"sse3" : @@ -2651,7 +2651,7 @@ static int check_all_flags( void ) #endif if( cpu_detect & X264_CPU_LZCNT ) { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX_LZCNT" ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX LZCNT" ); cpu1 &= ~X264_CPU_LZCNT; } ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" ); @@ -2669,11 +2669,11 @@ static int check_all_flags( void ) cpu1 &= ~X264_CPU_SLOW_SHUFFLE; ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" ); cpu1 &= ~X264_CPU_SLOW_CTZ; - } - if( cpu_detect & X264_CPU_LZCNT ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE_LZCNT" ); - cpu1 &= ~X264_CPU_LZCNT; + if( cpu_detect & X264_CPU_LZCNT ) + { + ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE2 LZCNT" ); + cpu1 &= ~X264_CPU_LZCNT; + } } if( cpu_detect & X264_CPU_SSE3 ) { @@ -2693,9 +2693,16 @@ static int check_all_flags( void ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64 SlowAtom" ); cpu1 &= ~X264_CPU_CACHELINE_64; cpu1 &= ~X264_CPU_SLOW_ATOM; + if( cpu_detect & X264_CPU_LZCNT ) + { + ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSSE3 LZCNT" ); + cpu1 &= ~X264_CPU_LZCNT; + } } if( cpu_detect & X264_CPU_SSE4 ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" ); + if( cpu_detect & X264_CPU_SSE42 ) + ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE42, "SSE4.2" ); if( cpu_detect & X264_CPU_AVX ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, "AVX" ); if( cpu_detect & X264_CPU_XOP ) @@ -2705,30 +2712,30 @@ static int check_all_flags( void ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA4, "FMA4" ); cpu1 &= ~X264_CPU_FMA4; } - if( cpu_detect & X264_CPU_BMI1 ) + if( cpu_detect & X264_CPU_FMA3 ) { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" ); - cpu1 &= ~X264_CPU_BMI1; + ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" ); + cpu1 &= ~X264_CPU_FMA3; } if( cpu_detect & X264_CPU_AVX2 ) { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" ); + ret |= add_flags( &cpu0, &cpu1, 
X264_CPU_FMA3 | X264_CPU_AVX2, "AVX2" ); if( cpu_detect & X264_CPU_LZCNT ) { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "AVX2_LZCNT" ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "AVX2 LZCNT" ); cpu1 &= ~X264_CPU_LZCNT; } } + if( cpu_detect & X264_CPU_BMI1 ) + { + ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" ); + cpu1 &= ~X264_CPU_BMI1; + } if( cpu_detect & X264_CPU_BMI2 ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1|X264_CPU_BMI2, "BMI2" ); cpu1 &= ~(X264_CPU_BMI1|X264_CPU_BMI2); } - if( cpu_detect & X264_CPU_FMA3 ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" ); - cpu1 &= ~X264_CPU_FMA3; - } #elif ARCH_PPC if( cpu_detect & X264_CPU_ALTIVEC ) { diff --git a/x264.h b/x264.h index b896f913..e3b1d15b 100644 --- a/x264.h +++ b/x264.h @@ -41,7 +41,7 @@ #include "x264_config.h" -#define X264_BUILD 142 +#define X264_BUILD 143 /* Application developers planning to link against a shared library version of * libx264 from a Microsoft Visual Studio or similar development environment @@ -129,8 +129,8 @@ typedef struct #define X264_CPU_AVX 0x0000400 /* AVX support: requires OS support even if YMM registers aren't used. */ #define X264_CPU_XOP 0x0000800 /* AMD XOP */ #define X264_CPU_FMA4 0x0001000 /* AMD FMA4 */ -#define X264_CPU_AVX2 0x0002000 /* AVX2 */ -#define X264_CPU_FMA3 0x0004000 /* Intel FMA3 */ +#define X264_CPU_FMA3 0x0002000 /* FMA3 */ +#define X264_CPU_AVX2 0x0004000 /* AVX2 */ #define X264_CPU_BMI1 0x0008000 /* BMI1 */ #define X264_CPU_BMI2 0x0010000 /* BMI2 */ /* x86 modifiers */