From c17d12f83381913650d84004815c20a1f7092144 Mon Sep 17 00:00:00 2001 From: Fiona Glaser Date: Mon, 25 Mar 2013 14:03:37 -0700 Subject: [PATCH] x86-64: BMI2 cabac_residual functions --- common/bitstream.c | 3 ++ common/x86/cabac-a.asm | 85 ++++++++++++++++++++++++++++++------------ common/x86/x86inc.asm | 3 +- encoder/cabac.c | 10 ++++- encoder/encoder.c | 3 ++ tools/checkasm.c | 54 +++++++++++---------------- 6 files changed, 98 insertions(+), 60 deletions(-) diff --git a/common/bitstream.c b/common/bitstream.c index cecd5f7f..2c2ca37f 100644 --- a/common/bitstream.c +++ b/common/bitstream.c @@ -52,6 +52,7 @@ void x264_cabac_block_residual_8x8_rd_internal_ssse3 ( dctcoef *l, int b_in void x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); void x264_cabac_block_residual_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_internal_avx2_bmi2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); /**************************************************************************** * x264_nal_encode: @@ -136,6 +137,8 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf ) if( cpu&X264_CPU_AVX2 ) { pf->nal_escape = x264_nal_escape_avx2; + if( cpu&X264_CPU_BMI2 ) + pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx2_bmi2; } #endif #endif diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm index 7fcd8449..1d0c0a02 100644 --- a/common/x86/cabac-a.asm +++ b/common/x86/cabac-a.asm @@ -36,14 +36,17 @@ coeff_abs_level_transition: db 1, 2, 3, 3, 4, 5, 6, 7 db 4, 4, 4, 4, 5, 6, 7, 7 %if ARCH_X86_64 -%macro COEFF_LAST_TABLE 16 +%macro COEFF_LAST_TABLE 17 %define funccpu1 %1 %define funccpu2 %2 + %define funccpu3 %3 %rep 14 - %ifidn %3, 4 - dq mangle(x264_coeff_last%3_ %+ funccpu1) + %ifidn %4, 4 + dq mangle(x264_coeff_last%4_ %+ funccpu1) + %elifidn %4, 64 + dq mangle(x264_coeff_last%4_ %+ funccpu2) %else - dq mangle(x264_coeff_last%3_ %+ funccpu2) + dq mangle(x264_coeff_last%4_ %+ funccpu3) %endif %rotate 1 %endrep @@ -57,9 +60,11 @@ cextern coeff_last16_sse2 cextern coeff_last16_sse2_lzcnt cextern coeff_last64_sse2 cextern coeff_last64_sse2_lzcnt +cextern coeff_last64_avx2_lzcnt -coeff_last_sse2: COEFF_LAST_TABLE mmx2, sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 -coeff_last_sse2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +coeff_last_sse2: COEFF_LAST_TABLE mmx2, sse2, sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +coeff_last_sse2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, sse2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +coeff_last_avx2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, avx2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 %endif SECTION .text @@ -78,15 +83,9 @@ cextern coeff_abs_level_m1_offset cextern count_cat_m1 cextern cabac_encode_ue_bypass -; t3 must be ecx, since it's used for shift. -%if WIN64 - DECLARE_REG_TMP 3,1,2,0,5,6,4,4 - %define pointer resq -%elif ARCH_X86_64 - DECLARE_REG_TMP 0,1,2,3,4,5,6,6 +%if ARCH_X86_64 %define pointer resq %else - DECLARE_REG_TMP 0,4,2,1,3,5,6,2 %define pointer resd %endif @@ -116,7 +115,17 @@ endstruc %endif %endmacro -cglobal cabac_encode_decision_asm, 1,7 +%macro CABAC 1 +; t3 must be ecx, since it's used for shift. +%if WIN64 + DECLARE_REG_TMP 3,1,2,0,5,6,4,4 +%elif ARCH_X86_64 + DECLARE_REG_TMP 0,1,2,3,4,5,6,6 +%else + DECLARE_REG_TMP 0,4,2,1,3,5,6,2 +%endif + +cglobal cabac_encode_decision_%1, 1,7 movifnidn t1d, r1m mov t5d, [r0+cb.range] movzx t6d, byte [r0+cb.state+t1] @@ -144,22 +153,29 @@ cglobal cabac_encode_decision_asm, 1,7 mov [t0+cb.state+t1], t4b ;cabac_encode_renorm mov t4d, t3d +%ifidn %1, bmi2 + lzcnt t3d, t3d + sub t3d, 23 + shlx t4d, t4d, t3d + shlx t6d, t6d, t3d +%else shr t3d, 3 LOAD_GLOBAL t3d, cabac_renorm_shift, t3 + shl t4d, t3b + shl t6d, t3b +%endif %if WIN64 POP r7 %endif - shl t4d, t3b - shl t6d, t3b mov [t0+cb.range], t4d add t3d, [t0+cb.queue] - jge cabac_putbyte + jge cabac_putbyte_%1 .update_queue_low: mov [t0+cb.low], t6d mov [t0+cb.queue], t3d RET -cglobal cabac_encode_bypass_asm, 2,3 +cglobal cabac_encode_bypass_%1, 2,3 mov t7d, [r0+cb.low] and r1d, [r0+cb.range] lea t7d, [t7*2+r1] @@ -167,7 +183,7 @@ cglobal cabac_encode_bypass_asm, 2,3 mov t3d, [r0+cb.queue] inc t3d %if ARCH_X86_64 ; .putbyte compiles to nothing but a jmp - jge cabac_putbyte + jge cabac_putbyte_%1 %else jge .putbyte %endif @@ -178,10 +194,11 @@ cglobal cabac_encode_bypass_asm, 2,3 .putbyte: PROLOGUE 0,7 movifnidn t6d, t7d - jmp cabac_putbyte + jmp cabac_putbyte_%1 %endif -cglobal cabac_encode_terminal_asm, 1,3 +%ifnidn %1,bmi2 +cglobal cabac_encode_terminal_%1, 1,3 sub dword [r0+cb.range], 2 ; shortcut: the renormalization shift in terminal ; can only be 0 or 1 and is zero over 99% of the time. @@ -199,12 +216,19 @@ cglobal cabac_encode_terminal_asm, 1,3 movifnidn t0, r0 ; WIN64 mov t3d, [r0+cb.queue] mov t6d, [t0+cb.low] +%endif -cabac_putbyte: +cabac_putbyte_%1: ; alive: t0=cb t3=queue t6=low %if WIN64 DECLARE_REG_TMP 3,6,1,0,2,5,4 %endif +%ifidn %1, bmi2 + add t3d, 10 + shrx t2d, t6d, t3d + bzhi t6d, t6d, t3d + sub t3d, 18 +%else mov t1d, -1 add t3d, 10 mov t2d, t6d @@ -213,6 +237,7 @@ cabac_putbyte: not t1d sub t3d, 18 and t6d, t1d +%endif mov t5d, [t0+cb.bytes_outstanding] cmp t2b, 0xff ; FIXME is a 32bit op faster? jz .postpone @@ -229,7 +254,11 @@ cabac_putbyte: .postpone: inc t5d mov [t0+cb.bytes_outstanding], t5d - jmp mangle(x264_cabac_encode_decision_asm.update_queue_low) + jmp mangle(x264_cabac_encode_decision_%1.update_queue_low) +%endmacro + +CABAC asm +CABAC bmi2 ; %1 = label name ; %2 = node_ctx init? @@ -514,7 +543,11 @@ CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt ;----------------------------------------------------------------------------- %macro CALL_CABAC 0 +%if cpuflag(bmi2) + call cabac_encode_decision_bmi2 +%else call cabac_encode_decision_asm +%endif %if WIN64 ; move cabac back mov r0, r3 %endif @@ -696,7 +729,11 @@ cglobal cabac_block_residual_internal, 4,15 movzx nodectxd, byte [coeff_abs_level_transition+8+nodectxq GLOBAL] .level_sign: mov r1d, r11d +%if cpuflag(bmi2) + call cabac_encode_bypass_bmi2 +%else call cabac_encode_bypass_asm +%endif %if WIN64 mov r0, r3 %endif @@ -711,4 +748,6 @@ INIT_XMM sse2 CABAC_RESIDUAL coeff_last_sse2 INIT_XMM sse2,lzcnt CABAC_RESIDUAL coeff_last_sse2_lzcnt +INIT_XMM avx2,bmi2 +CABAC_RESIDUAL coeff_last_avx2_lzcnt %endif diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm index c7c4c90f..f0e816d7 100644 --- a/common/x86/x86inc.asm +++ b/common/x86/x86inc.asm @@ -742,9 +742,8 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits %assign cpuflags_misalign (1<<20) %assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant %assign cpuflags_atom (1<<22) -%assign cpuflags_bmi1 (1<<23) +%assign cpuflags_bmi1 (1<<23)|cpuflags_lzcnt %assign cpuflags_bmi2 (1<<24)|cpuflags_bmi1 -%assign cpuflags_tbm (1<<25)|cpuflags_bmi1 %define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) %define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) diff --git a/encoder/cabac.c b/encoder/cabac.c index 1a9a734e..0a14ecdf 100644 --- a/encoder/cabac.c +++ b/encoder/cabac.c @@ -793,12 +793,18 @@ static ALWAYS_INLINE void x264_cabac_block_residual_internal( x264_t *h, x264_ca x264_cabac_encode_bypass( cb, coeff_sign ); } while( --coeff_idx >= 0 ); } -static void x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) + +void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +{ + x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0 ); +} + +static void ALWAYS_INLINE x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { #if ARCH_X86_64 && HAVE_MMX h->bsf.cabac_block_residual_internal( l, MB_INTERLACED, ctx_block_cat, cb ); #else - x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0 ); + x264_cabac_block_residual_c( h, cb, ctx_block_cat, l ); #endif } static void x264_cabac_block_residual_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) diff --git a/encoder/encoder.c b/encoder/encoder.c index e239de0e..7a0c506a 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -1334,6 +1334,9 @@ x264_t *x264_encoder_open( x264_param_t *param ) if( !strcmp(x264_cpu_names[i].name, "SSE4.1") && (h->param.cpu & X264_CPU_SSE42) ) continue; + if( !strcmp(x264_cpu_names[i].name, "BMI1") + && (h->param.cpu & X264_CPU_BMI2) ) + continue; if( (h->param.cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags && (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) ) p += sprintf( p, " %s", x264_cpu_names[i].name ); diff --git a/tools/checkasm.c b/tools/checkasm.c index 1b02255b..7a2f6d4d 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -28,7 +28,6 @@ #include #include "common/common.h" #include "common/cpu.h" -#include "encoder/cabac.c" // GCC doesn't align stack variables on ARM, so use .bss #if ARCH_ARM @@ -2318,21 +2317,8 @@ DECL_CABAC(asm) #define run_cabac_terminal_asm run_cabac_terminal_c #endif -static void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) -{ - x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0 ); - cb->p = cb->p_start; -} - -/* Wrapper to roll back the pointer to avoid running out of memory bounds during - * benchmark repetitions. Introduces slight bias into the test, but not too much. */ -static void x264_cabac_block_residual_asm( void (*c)( dctcoef *, int, intptr_t, x264_cabac_t * ), - dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ) -{ - c( l, b_interlaced, ctx_block_cat, cb ); - cb->p = cb->p_start; -} - +extern const uint8_t x264_count_cat_m1[14]; +void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ); void x264_cabac_block_residual_8x8_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ); void x264_cabac_block_residual_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ); @@ -2351,10 +2337,8 @@ static int check_cabac( int cpu_ref, int cpu_new ) #define CABAC_RESIDUAL(name, start, end, rd)\ {\ - static int cabac_checked = 0;\ - if( bs_a.name##_internal && (bs_a.name##_internal != bs_ref.name##_internal || ((cpu_new&X264_CPU_SSE2) && !cabac_checked)) )\ + if( bs_a.name##_internal && (bs_a.name##_internal != bs_ref.name##_internal || (cpu_new&X264_CPU_SSE2_IS_SLOW)) )\ {\ - cabac_checked = 1;\ used_asm = 1;\ set_func_name( #name );\ for( int i = 0; i < 2; i++ )\ @@ -2363,7 +2347,8 @@ static int check_cabac( int cpu_ref, int cpu_new ) {\ for( int j = 0; j < 256; j++ )\ {\ - ALIGNED_ARRAY_16( dctcoef, dct, [2],[64] );\ + ALIGNED_ARRAY_N( dctcoef, dct, [2],[64] );\ + uint8_t bitstream[2][1<<16];\ static const uint8_t ctx_ac[14] = {0,1,0,0,1,0,0,1,0,0,0,1,0,0};\ int ac = ctx_ac[ctx_block_cat];\ int nz = 0;\ @@ -2385,15 +2370,15 @@ static int check_cabac( int cpu_ref, int cpu_new ) x264_cabac_t cb[2];\ x264_cabac_context_init( &h, &cb[0], SLICE_TYPE_P, 26, 0 );\ x264_cabac_context_init( &h, &cb[1], SLICE_TYPE_P, 26, 0 );\ - x264_cabac_encode_init( &cb[0], buf3, buf3+0x3f0 );\ - x264_cabac_encode_init( &cb[1], buf4, buf4+0x3f0 );\ + x264_cabac_encode_init( &cb[0], bitstream[0], bitstream[0]+0xfff0 );\ + x264_cabac_encode_init( &cb[1], bitstream[1], bitstream[1]+0xfff0 );\ cb[0].f8_bits_encoded = 0;\ cb[1].f8_bits_encoded = 0;\ - if( !rd ) memcpy( buf4, buf3, 0x400 );\ + if( !rd ) memcpy( bitstream[1], bitstream[0], 0x400 );\ call_c1( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\ call_a1( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\ ok = cb[0].f8_bits_encoded == cb[1].f8_bits_encoded && !memcmp(cb[0].state, cb[1].state, 1024);\ - if( !rd ) ok |= !memcmp( buf3, buf4, 0x400 ) && !memcmp( &cb[1], &cb[0], offsetof(x264_cabac_t, p_start) );\ + if( !rd ) ok |= !memcmp( bitstream[1], bitstream[0], 0x400 ) && !memcmp( &cb[1], &cb[0], offsetof(x264_cabac_t, p_start) );\ if( !ok )\ {\ fprintf( stderr, #name " : [FAILED] ctx_block_cat %d", (int)ctx_block_cat );\ @@ -2402,9 +2387,11 @@ static int check_cabac( int cpu_ref, int cpu_new ) fprintf( stderr, "\n");\ goto name##fail;\ }\ - call_c2( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\ - if( rd ) call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\ - else call_a2( x264_cabac_block_residual_asm, bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\ + if( (j&15) == 0 )\ + {\ + call_c2( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\ + call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\ + }\ }\ }\ }\ @@ -2415,13 +2402,14 @@ name##fail: CABAC_RESIDUAL( cabac_block_residual, 0, DCT_LUMA_8x8, 0 ) report( "cabac residual:" ); + ok = 1; used_asm = 0; CABAC_RESIDUAL( cabac_block_residual_rd, 0, DCT_LUMA_8x8-1, 1 ) CABAC_RESIDUAL( cabac_block_residual_8x8_rd, DCT_LUMA_8x8, DCT_LUMA_8x8, 1 ) report( "cabac residual rd:" ); if( cpu_ref || run_cabac_decision_c == run_cabac_decision_asm ) return ret; - used_asm = 1; + ok = 1; used_asm = 0; x264_cabac_init( &h ); set_func_name( "cabac_encode_decision" ); @@ -2602,15 +2590,15 @@ static int check_all_flags( void ) if( x264_cpu_detect() & X264_CPU_BMI1 ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" ); - if( x264_cpu_detect() & X264_CPU_BMI2 ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI2, "BMI2" ); - cpu1 &= ~X264_CPU_BMI2; - } cpu1 &= ~X264_CPU_BMI1; } if( x264_cpu_detect() & X264_CPU_AVX2 ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" ); + if( x264_cpu_detect() & X264_CPU_BMI2 ) + { + ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1|X264_CPU_BMI2, "BMI2" ); + cpu1 &= ~(X264_CPU_BMI1|X264_CPU_BMI2); + } if( x264_cpu_detect() & X264_CPU_FMA3 ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" ); -- 2.50.1