From 8c2974255b01728d4eda2434cc1997c4a3ca5eff Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Sat, 6 May 2017 12:26:56 +0200 Subject: [PATCH] x86: Add some additional cpuflag relations Simplifies writing assembly code that depends on available instructions. LZCNT implies SSE2 BMI1 implies AVX+LZCNT AVX2 implies BMI2 Skip printing LZCNT under CPU capabilities when BMI1 or BMI2 is available, and don't print FMA4 when FMA3 is available. --- common/bitstream.c | 28 ++++++++--------- common/cpu.c | 10 +++--- common/quant.c | 69 +++++++++++++++++++----------------------- common/x86/cabac-a.asm | 34 ++++++++++----------- common/x86/quant-a.asm | 24 ++++++++------- common/x86/quant.h | 27 ++++++++--------- common/x86/x86inc.asm | 36 +++++++++++----------- encoder/encoder.c | 6 ++++ tools/checkasm.c | 45 ++++++++------------------- 9 files changed, 129 insertions(+), 150 deletions(-) diff --git a/common/bitstream.c b/common/bitstream.c index d6c1c2ca..6d3f9c6c 100644 --- a/common/bitstream.c +++ b/common/bitstream.c @@ -43,16 +43,16 @@ uint8_t *x264_nal_escape_mmx2( uint8_t *dst, uint8_t *src, uint8_t *end ); uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end ); uint8_t *x264_nal_escape_avx2( uint8_t *dst, uint8_t *src, uint8_t *end ); void x264_cabac_block_residual_rd_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); -void x264_cabac_block_residual_rd_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_rd_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); void x264_cabac_block_residual_rd_internal_ssse3 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); void x264_cabac_block_residual_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); void x264_cabac_block_residual_8x8_rd_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); -void x264_cabac_block_residual_8x8_rd_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_8x8_rd_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); void x264_cabac_block_residual_8x8_rd_internal_ssse3 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); void x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); -void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); -void x264_cabac_block_residual_internal_sse2_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); -void x264_cabac_block_residual_internal_avx2_bmi2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_internal_avx2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); uint8_t *x264_nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end ); @@ -126,18 +126,17 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf ) pf->nal_escape = x264_nal_escape_mmx2; if( cpu&X264_CPU_SSE2 ) { -#if ARCH_X86_64 - if( cpu&X264_CPU_LZCNT ) - { - pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2_lzcnt; - pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2_lzcnt; - pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2_lzcnt; - } -#endif if( cpu&X264_CPU_SSE2_IS_FAST ) pf->nal_escape = x264_nal_escape_sse2; } #if ARCH_X86_64 + if( cpu&X264_CPU_LZCNT ) + { + pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_lzcnt; + pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_lzcnt; + pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_lzcnt; + } + if( cpu&X264_CPU_SSSE3 ) { pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_ssse3; @@ -152,8 +151,7 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf ) if( cpu&X264_CPU_AVX2 ) { pf->nal_escape = x264_nal_escape_avx2; - if( cpu&X264_CPU_BMI2 ) - pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx2_bmi2; + pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx2; } #endif #endif diff --git a/common/cpu.c b/common/cpu.c index 636a40c5..172957ea 100644 --- a/common/cpu.c +++ b/common/cpu.c @@ -56,6 +56,7 @@ const x264_cpu_name_t x264_cpu_names[] = {"SSE2Slow", SSE2|X264_CPU_SSE2_IS_SLOW}, {"SSE2", SSE2}, {"SSE2Fast", SSE2|X264_CPU_SSE2_IS_FAST}, + {"LZCNT", SSE2|X264_CPU_LZCNT}, {"SSE3", SSE2|X264_CPU_SSE3}, {"SSSE3", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3}, {"SSE4.1", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4}, @@ -66,15 +67,16 @@ const x264_cpu_name_t x264_cpu_names[] = {"XOP", AVX|X264_CPU_XOP}, {"FMA4", AVX|X264_CPU_FMA4}, {"FMA3", AVX|X264_CPU_FMA3}, - {"AVX2", AVX|X264_CPU_FMA3|X264_CPU_AVX2}, + {"BMI1", AVX|X264_CPU_LZCNT|X264_CPU_BMI1}, + {"BMI2", AVX|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2}, +#define AVX2 AVX|X264_CPU_FMA3|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2|X264_CPU_AVX2 + {"AVX2", AVX2}, +#undef AVX2 #undef AVX #undef SSE2 #undef MMX2 {"Cache32", X264_CPU_CACHELINE_32}, {"Cache64", X264_CPU_CACHELINE_64}, - {"LZCNT", X264_CPU_LZCNT}, - {"BMI1", X264_CPU_BMI1}, - {"BMI2", X264_CPU_BMI1|X264_CPU_BMI2}, {"SlowCTZ", X264_CPU_SLOW_CTZ}, {"SlowAtom", X264_CPU_SLOW_ATOM}, {"SlowPshufb", X264_CPU_SLOW_PSHUFB}, diff --git a/common/quant.c b/common/quant.c index 7eef140b..d1414445 100644 --- a/common/quant.c +++ b/common/quant.c @@ -473,8 +473,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) #endif pf->coeff_last4 = x264_coeff_last4_mmx2; pf->coeff_level_run4 = x264_coeff_level_run4_mmx2; - if( cpu&X264_CPU_LZCNT ) - pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt; } if( cpu&X264_CPU_SSE2 ) { @@ -499,17 +497,18 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->coeff_level_run8 = x264_coeff_level_run8_sse2; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2; pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2; - if( cpu&X264_CPU_LZCNT ) - { - pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt; - pf->coeff_last8 = x264_coeff_last8_sse2_lzcnt; - pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt; - pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt; - pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt; - pf->coeff_level_run8 = x264_coeff_level_run8_sse2_lzcnt; - pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt; - pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt; - } + } + if( cpu&X264_CPU_LZCNT ) + { + pf->coeff_last4 = x264_coeff_last4_lzcnt; + pf->coeff_last8 = x264_coeff_last8_lzcnt; + pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lzcnt; + pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lzcnt; + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lzcnt; + pf->coeff_level_run4 = x264_coeff_level_run4_lzcnt; + pf->coeff_level_run8 = x264_coeff_level_run8_lzcnt; + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lzcnt; + pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lzcnt; } if( cpu&X264_CPU_SSSE3 ) { @@ -557,8 +556,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->dequant_8x8 = x264_dequant_8x8_avx2; pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2; pf->denoise_dct = x264_denoise_dct_avx2; - if( cpu&X264_CPU_LZCNT ) - pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2_lzcnt; + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2; } #endif // HAVE_MMX #else // !HIGH_BIT_DEPTH @@ -599,13 +597,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->coeff_last8 = x264_coeff_last8_mmx2; pf->coeff_level_run4 = x264_coeff_level_run4_mmx2; pf->coeff_level_run8 = x264_coeff_level_run8_mmx2; - if( cpu&X264_CPU_LZCNT ) - { - pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt; - pf->coeff_last8 = x264_coeff_last8_mmx2_lzcnt; - pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt; - pf->coeff_level_run8 = x264_coeff_level_run8_mmx2_lzcnt; - } } if( cpu&X264_CPU_SSE2 ) @@ -634,14 +625,19 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2; pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2; - if( cpu&X264_CPU_LZCNT ) - { - pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt; - pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt; - pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt; - pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt; - pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt; - } + } + + if( cpu&X264_CPU_LZCNT ) + { + pf->coeff_last4 = x264_coeff_last4_lzcnt; + pf->coeff_last8 = x264_coeff_last8_lzcnt; + pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lzcnt; + pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lzcnt; + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lzcnt; + pf->coeff_level_run4 = x264_coeff_level_run4_lzcnt; + pf->coeff_level_run8 = x264_coeff_level_run8_lzcnt; + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lzcnt; + pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lzcnt; } if( cpu&X264_CPU_SSSE3 ) @@ -663,8 +659,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3; if( cpu&X264_CPU_LZCNT ) { - pf->coeff_level_run4 = x264_coeff_level_run4_ssse3; - pf->coeff_level_run8 = x264_coeff_level_run8_ssse3; + pf->coeff_level_run4 = x264_coeff_level_run4_ssse3_lzcnt; + pf->coeff_level_run8 = x264_coeff_level_run8_ssse3_lzcnt; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3_lzcnt; pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3_lzcnt; } @@ -717,12 +713,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) } pf->decimate_score64 = x264_decimate_score64_avx2; pf->denoise_dct = x264_denoise_dct_avx2; - if( cpu&X264_CPU_LZCNT ) - { - pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2_lzcnt; - pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2_lzcnt; - pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2_lzcnt; - } + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2; + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2; + pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2; } #endif // HAVE_MMX diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm index d7870a3a..c550d8e1 100644 --- a/common/x86/cabac-a.asm +++ b/common/x86/cabac-a.asm @@ -53,21 +53,21 @@ coeff_abs_level_transition: db 1, 2, 3, 3, 4, 5, 6, 7 %endmacro cextern coeff_last4_mmx2 -cextern coeff_last4_mmx2_lzcnt +cextern coeff_last4_lzcnt cextern coeff_last15_sse2 -cextern coeff_last15_sse2_lzcnt +cextern coeff_last15_lzcnt cextern coeff_last16_sse2 -cextern coeff_last16_sse2_lzcnt +cextern coeff_last16_lzcnt cextern coeff_last64_sse2 -cextern coeff_last64_sse2_lzcnt -cextern coeff_last64_avx2_lzcnt +cextern coeff_last64_lzcnt +cextern coeff_last64_avx2 %ifdef PIC SECTION .data %endif -coeff_last_sse2: COEFF_LAST_TABLE mmx2, sse2, sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 -coeff_last_sse2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, sse2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 -coeff_last_avx2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, avx2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +coeff_last_sse2: COEFF_LAST_TABLE mmx2, sse2, sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +coeff_last_lzcnt: COEFF_LAST_TABLE lzcnt, lzcnt, lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +coeff_last_avx2: COEFF_LAST_TABLE lzcnt, avx2, lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 %endif SECTION .text @@ -529,15 +529,15 @@ CABAC bmi2 INIT_XMM sse2 CABAC_RESIDUAL_RD 0, coeff_last_sse2 CABAC_RESIDUAL_RD 1, coeff_last_sse2 -INIT_XMM sse2,lzcnt -CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt -CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt +INIT_XMM lzcnt +CABAC_RESIDUAL_RD 0, coeff_last_lzcnt +CABAC_RESIDUAL_RD 1, coeff_last_lzcnt INIT_XMM ssse3 CABAC_RESIDUAL_RD 0, coeff_last_sse2 CABAC_RESIDUAL_RD 1, coeff_last_sse2 INIT_XMM ssse3,lzcnt -CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt -CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt +CABAC_RESIDUAL_RD 0, coeff_last_lzcnt +CABAC_RESIDUAL_RD 1, coeff_last_lzcnt %endif ;----------------------------------------------------------------------------- @@ -749,8 +749,8 @@ cglobal cabac_block_residual_internal, 4,15 %if ARCH_X86_64 INIT_XMM sse2 CABAC_RESIDUAL coeff_last_sse2 -INIT_XMM sse2,lzcnt -CABAC_RESIDUAL coeff_last_sse2_lzcnt -INIT_XMM avx2,bmi2 -CABAC_RESIDUAL coeff_last_avx2_lzcnt +INIT_XMM lzcnt +CABAC_RESIDUAL coeff_last_lzcnt +INIT_XMM avx2 +CABAC_RESIDUAL coeff_last_avx2 %endif diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm index 2391b57a..a20d1cd9 100644 --- a/common/x86/quant-a.asm +++ b/common/x86/quant-a.asm @@ -1556,7 +1556,7 @@ cglobal coeff_last4, 1,3 INIT_MMX mmx2 COEFF_LAST4 -INIT_MMX mmx2, lzcnt +INIT_MMX lzcnt COEFF_LAST4 %macro COEFF_LAST8 0 @@ -1579,7 +1579,7 @@ COEFF_LAST8 %endif INIT_XMM sse2 COEFF_LAST8 -INIT_XMM sse2, lzcnt +INIT_XMM lzcnt COEFF_LAST8 %else ; !HIGH_BIT_DEPTH @@ -1642,7 +1642,7 @@ cglobal coeff_last8, 1,3 INIT_MMX mmx2 COEFF_LAST48 -INIT_MMX mmx2, lzcnt +INIT_MMX lzcnt COEFF_LAST48 %endif ; HIGH_BIT_DEPTH @@ -1707,7 +1707,7 @@ COEFF_LAST %endif INIT_XMM sse2 COEFF_LAST -INIT_XMM sse2, lzcnt +INIT_XMM lzcnt COEFF_LAST %macro LAST_MASK_AVX2 2 @@ -1729,7 +1729,7 @@ COEFF_LAST %endmacro %if ARCH_X86_64 == 0 -INIT_YMM avx2,lzcnt +INIT_YMM avx2 cglobal coeff_last64, 1,2 pxor m2, m2 LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF*32 @@ -1744,7 +1744,7 @@ cglobal coeff_last64, 1,2 add eax, 32 RET %else -INIT_YMM avx2,lzcnt +INIT_YMM avx2 cglobal coeff_last64, 1,3 pxor m2, m2 LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF* 0 @@ -1833,15 +1833,17 @@ COEFF_LEVELRUN 8 %endif COEFF_LEVELRUN 15 COEFF_LEVELRUN 16 -INIT_XMM sse2, lzcnt +INIT_MMX lzcnt +COEFF_LEVELRUN 4 +%if HIGH_BIT_DEPTH == 0 +COEFF_LEVELRUN 8 +%endif +INIT_XMM lzcnt %if HIGH_BIT_DEPTH COEFF_LEVELRUN 8 %endif COEFF_LEVELRUN 15 COEFF_LEVELRUN 16 -INIT_MMX mmx2, lzcnt -COEFF_LEVELRUN 4 -COEFF_LEVELRUN 8 ; Similar to the one above, but saves the DCT ; coefficients in m0/m1 so we don't have to load @@ -1968,7 +1970,7 @@ INIT_XMM ssse3, lzcnt COEFF_LEVELRUN_LUT 8 COEFF_LEVELRUN_LUT 15 COEFF_LEVELRUN_LUT 16 -INIT_XMM avx2, lzcnt +INIT_XMM avx2 COEFF_LEVELRUN_LUT 15 COEFF_LEVELRUN_LUT 16 %endif diff --git a/common/x86/quant.h b/common/x86/quant.h index 9596a58c..e0ce0f23 100644 --- a/common/x86/quant.h +++ b/common/x86/quant.h @@ -104,33 +104,32 @@ int x264_coeff_last8_sse2( dctcoef *dct ); int x264_coeff_last15_sse2( dctcoef *dct ); int x264_coeff_last16_sse2( dctcoef *dct ); int x264_coeff_last64_sse2( dctcoef *dct ); -int x264_coeff_last4_mmx2_lzcnt( dctcoef *dct ); -int x264_coeff_last8_mmx2_lzcnt( dctcoef *dct ); -int x264_coeff_last8_sse2_lzcnt( dctcoef *dct ); -int x264_coeff_last15_sse2_lzcnt( dctcoef *dct ); -int x264_coeff_last16_sse2_lzcnt( dctcoef *dct ); -int x264_coeff_last64_sse2_lzcnt( dctcoef *dct ); -int x264_coeff_last64_avx2_lzcnt( dctcoef *dct ); +int x264_coeff_last4_lzcnt( dctcoef *dct ); +int x264_coeff_last8_lzcnt( dctcoef *dct ); +int x264_coeff_last15_lzcnt( dctcoef *dct ); +int x264_coeff_last16_lzcnt( dctcoef *dct ); +int x264_coeff_last64_lzcnt( dctcoef *dct ); +int x264_coeff_last64_avx2 ( dctcoef *dct ); int x264_coeff_level_run16_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run16_sse2( dctcoef *dct, x264_run_level_t *runlevel ); -int x264_coeff_level_run16_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run16_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run16_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run16_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); -int x264_coeff_level_run16_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run16_avx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run15_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run15_sse2( dctcoef *dct, x264_run_level_t *runlevel ); -int x264_coeff_level_run15_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run15_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run15_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run15_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); -int x264_coeff_level_run15_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run15_avx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run4_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); -int x264_coeff_level_run4_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run4_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run4_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run4_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); -int x264_coeff_level_run8_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run8_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_sse2( dctcoef *dct, x264_run_level_t *runlevel ); -int x264_coeff_level_run8_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run8_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_trellis_cabac_4x4_sse2 ( TRELLIS_PARAMS, int b_ac ); diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm index ff8b5002..e7168b18 100644 --- a/common/x86/x86inc.asm +++ b/common/x86/x86inc.asm @@ -776,24 +776,24 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %assign cpuflags_sse (1<<4) | cpuflags_mmx2 %assign cpuflags_sse2 (1<<5) | cpuflags_sse %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 -%assign cpuflags_sse3 (1<<7) | cpuflags_sse2 -%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 -%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 -%assign cpuflags_sse42 (1<<10)| cpuflags_sse4 -%assign cpuflags_avx (1<<11)| cpuflags_sse42 -%assign cpuflags_xop (1<<12)| cpuflags_avx -%assign cpuflags_fma4 (1<<13)| cpuflags_avx -%assign cpuflags_fma3 (1<<14)| cpuflags_avx -%assign cpuflags_avx2 (1<<15)| cpuflags_fma3 - -%assign cpuflags_cache32 (1<<16) -%assign cpuflags_cache64 (1<<17) -%assign cpuflags_slowctz (1<<18) -%assign cpuflags_lzcnt (1<<19) -%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant -%assign cpuflags_atom (1<<21) -%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt -%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1 +%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2 +%assign cpuflags_sse3 (1<<8) | cpuflags_sse2 +%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3 +%assign cpuflags_sse4 (1<<10)| cpuflags_ssse3 +%assign cpuflags_sse42 (1<<11)| cpuflags_sse4 +%assign cpuflags_avx (1<<12)| cpuflags_sse42 +%assign cpuflags_xop (1<<13)| cpuflags_avx +%assign cpuflags_fma4 (1<<14)| cpuflags_avx +%assign cpuflags_fma3 (1<<15)| cpuflags_avx +%assign cpuflags_bmi1 (1<<16)| cpuflags_avx|cpuflags_lzcnt +%assign cpuflags_bmi2 (1<<17)| cpuflags_bmi1 +%assign cpuflags_avx2 (1<<18)| cpuflags_fma3|cpuflags_bmi2 + +%assign cpuflags_cache32 (1<<19) +%assign cpuflags_cache64 (1<<20) +%assign cpuflags_slowctz (1<<21) +%assign cpuflags_aligned (1<<22) ; not a cpu feature, but a function variant +%assign cpuflags_atom (1<<23) ; Returns a boolean value expressing whether or not the specified cpuflag is enabled. %define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1) diff --git a/encoder/encoder.c b/encoder/encoder.c index a27067c4..a4771c98 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -1567,9 +1567,15 @@ x264_t *x264_encoder_open( x264_param_t *param ) if( !strcmp(x264_cpu_names[i].name, "SSE4.1") && (h->param.cpu & X264_CPU_SSE42) ) continue; + if( !strcmp(x264_cpu_names[i].name, "LZCNT") + && (h->param.cpu & X264_CPU_BMI1) ) + continue; if( !strcmp(x264_cpu_names[i].name, "BMI1") && (h->param.cpu & X264_CPU_BMI2) ) continue; + if( !strcmp(x264_cpu_names[i].name, "FMA4") + && (h->param.cpu & X264_CPU_FMA3) ) + continue; if( (h->param.cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags && (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) ) p += sprintf( p, " %s", x264_cpu_names[i].name ); diff --git a/tools/checkasm.c b/tools/checkasm.c index 193c2398..a72768e5 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -179,6 +179,8 @@ static void print_bench(void) printf( "%s_%s%s: %"PRId64"\n", benchs[i].name, #if HAVE_MMX b->cpu&X264_CPU_AVX2 ? "avx2" : + b->cpu&X264_CPU_BMI2 ? "bmi2" : + b->cpu&X264_CPU_BMI1 ? "bmi1" : b->cpu&X264_CPU_FMA3 ? "fma3" : b->cpu&X264_CPU_FMA4 ? "fma4" : b->cpu&X264_CPU_XOP ? "xop" : @@ -187,6 +189,7 @@ static void print_bench(void) b->cpu&X264_CPU_SSE4 ? "sse4" : b->cpu&X264_CPU_SSSE3 ? "ssse3" : b->cpu&X264_CPU_SSE3 ? "sse3" : + b->cpu&X264_CPU_LZCNT ? "lzcnt" : /* print sse2slow only if there's also a sse2fast version of the same func */ b->cpu&X264_CPU_SSE2_IS_SLOW && jcpu&X264_CPU_SSE2 ? "sse2" : @@ -209,10 +212,7 @@ static void print_bench(void) b->cpu&X264_CPU_SLOW_ATOM && b->cpu&X264_CPU_CACHELINE_64 ? "_c64_atom" : b->cpu&X264_CPU_CACHELINE_64 ? "_c64" : b->cpu&X264_CPU_SLOW_SHUFFLE ? "_slowshuffle" : - b->cpu&X264_CPU_LZCNT ? "_lzcnt" : - b->cpu&X264_CPU_BMI2 ? "_bmi2" : - b->cpu&X264_CPU_BMI1 ? "_bmi1" : - b->cpu&X264_CPU_SLOW_CTZ ? "_slow_ctz" : + b->cpu&X264_CPU_LZCNT && b->cpu&X264_CPU_SSE3 && !(b->cpu&X264_CPU_BMI1) ? "_lzcnt" : b->cpu&X264_CPU_SLOW_ATOM ? "_atom" : #elif ARCH_ARM b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" : @@ -2794,11 +2794,6 @@ static int check_all_flags( void ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_32, "MMX Cache32" ); cpu1 &= ~X264_CPU_CACHELINE_32; #endif - if( cpu_detect & X264_CPU_LZCNT ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX LZCNT" ); - cpu1 &= ~X264_CPU_LZCNT; - } ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" ); cpu1 &= ~X264_CPU_SLOW_CTZ; } @@ -2814,11 +2809,11 @@ static int check_all_flags( void ) cpu1 &= ~X264_CPU_SLOW_SHUFFLE; ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" ); cpu1 &= ~X264_CPU_SLOW_CTZ; - if( cpu_detect & X264_CPU_LZCNT ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE2 LZCNT" ); - cpu1 &= ~X264_CPU_LZCNT; - } + } + if( cpu_detect & X264_CPU_LZCNT ) + { + ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "LZCNT" ); + cpu1 &= ~X264_CPU_LZCNT; } if( cpu_detect & X264_CPU_SSE3 ) { @@ -2858,29 +2853,13 @@ static int check_all_flags( void ) cpu1 &= ~X264_CPU_FMA4; } if( cpu_detect & X264_CPU_FMA3 ) - { ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" ); - cpu1 &= ~X264_CPU_FMA3; - } - if( cpu_detect & X264_CPU_AVX2 ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3 | X264_CPU_AVX2, "AVX2" ); - if( cpu_detect & X264_CPU_LZCNT ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "AVX2 LZCNT" ); - cpu1 &= ~X264_CPU_LZCNT; - } - } if( cpu_detect & X264_CPU_BMI1 ) - { ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" ); - cpu1 &= ~X264_CPU_BMI1; - } if( cpu_detect & X264_CPU_BMI2 ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1|X264_CPU_BMI2, "BMI2" ); - cpu1 &= ~(X264_CPU_BMI1|X264_CPU_BMI2); - } + ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI2, "BMI2" ); + if( cpu_detect & X264_CPU_AVX2 ) + ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" ); #elif ARCH_PPC if( cpu_detect & X264_CPU_ALTIVEC ) { -- 2.40.0