void x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_internal_avx2_bmi2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
/****************************************************************************
* x264_nal_encode:
if( cpu&X264_CPU_AVX2 )
{
pf->nal_escape = x264_nal_escape_avx2;
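+ /* the _avx2_bmi2 residual kernel requires BMI2 in addition to AVX2, hence the nested check */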
+ if( cpu&X264_CPU_BMI2 )
+ pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx2_bmi2;
}
#endif
#endif
db 4, 4, 4, 4, 5, 6, 7, 7
%if ARCH_X86_64
-%macro COEFF_LAST_TABLE 16
+%macro COEFF_LAST_TABLE 17
%define funccpu1 %1
%define funccpu2 %2
+ %define funccpu3 %3
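+ ; funccpu1 handles the 4-coeff case, funccpu2 the 64-coeff cases, funccpu3 everything else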
%rep 14
- %ifidn %3, 4
- dq mangle(x264_coeff_last%3_ %+ funccpu1)
+ %ifidn %4, 4
+ dq mangle(x264_coeff_last%4_ %+ funccpu1)
+ %elifidn %4, 64
+ dq mangle(x264_coeff_last%4_ %+ funccpu2)
%else
- dq mangle(x264_coeff_last%3_ %+ funccpu2)
+ dq mangle(x264_coeff_last%4_ %+ funccpu3)
%endif
%rotate 1
%endrep
cextern coeff_last16_sse2_lzcnt
cextern coeff_last64_sse2
cextern coeff_last64_sse2_lzcnt
+cextern coeff_last64_avx2_lzcnt
-coeff_last_sse2: COEFF_LAST_TABLE mmx2, sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
-coeff_last_sse2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
+coeff_last_sse2: COEFF_LAST_TABLE mmx2, sse2, sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
+coeff_last_sse2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, sse2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
+coeff_last_avx2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, avx2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
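+; only coeff_last64 has an AVX2 implementation, so the other entries in the
+; avx2_lzcnt table still point at the existing mmx2/sse2 lzcnt functions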
%endif
SECTION .text
cextern count_cat_m1
cextern cabac_encode_ue_bypass
-; t3 must be ecx, since it's used for shift.
-%if WIN64
- DECLARE_REG_TMP 3,1,2,0,5,6,4,4
- %define pointer resq
-%elif ARCH_X86_64
- DECLARE_REG_TMP 0,1,2,3,4,5,6,6
+%if ARCH_X86_64
%define pointer resq
%else
- DECLARE_REG_TMP 0,4,2,1,3,5,6,2
%define pointer resd
%endif
%endif
%endmacro
-cglobal cabac_encode_decision_asm, 1,7
+%macro CABAC 1
+; t3 must be ecx, since it's used for shift.
+%if WIN64
+ DECLARE_REG_TMP 3,1,2,0,5,6,4,4
+%elif ARCH_X86_64
+ DECLARE_REG_TMP 0,1,2,3,4,5,6,6
+%else
+ DECLARE_REG_TMP 0,4,2,1,3,5,6,2
+%endif
+
+cglobal cabac_encode_decision_%1, 1,7
movifnidn t1d, r1m
mov t5d, [r0+cb.range]
movzx t6d, byte [r0+cb.state+t1]
mov [t0+cb.state+t1], t4b
;cabac_encode_renorm
mov t4d, t3d
+%ifidn %1, bmi2
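+ ; renorm shift = lzcnt(range)-23 (i.e. 8-floor(log2(range))), computed directly
+ ; instead of via the cabac_renorm_shift table; shlx takes its count from any
+ ; register, so the shift doesn't have to live in cl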
+ lzcnt t3d, t3d
+ sub t3d, 23
+ shlx t4d, t4d, t3d
+ shlx t6d, t6d, t3d
+%else
shr t3d, 3
LOAD_GLOBAL t3d, cabac_renorm_shift, t3
+ shl t4d, t3b
+ shl t6d, t3b
+%endif
%if WIN64
POP r7
%endif
- shl t4d, t3b
- shl t6d, t3b
mov [t0+cb.range], t4d
add t3d, [t0+cb.queue]
- jge cabac_putbyte
+ jge cabac_putbyte_%1
.update_queue_low:
mov [t0+cb.low], t6d
mov [t0+cb.queue], t3d
RET
-cglobal cabac_encode_bypass_asm, 2,3
+cglobal cabac_encode_bypass_%1, 2,3
mov t7d, [r0+cb.low]
and r1d, [r0+cb.range]
lea t7d, [t7*2+r1]
mov t3d, [r0+cb.queue]
inc t3d
%if ARCH_X86_64 ; .putbyte compiles to nothing but a jmp
- jge cabac_putbyte
+ jge cabac_putbyte_%1
%else
jge .putbyte
%endif
.putbyte:
PROLOGUE 0,7
movifnidn t6d, t7d
- jmp cabac_putbyte
+ jmp cabac_putbyte_%1
%endif
-cglobal cabac_encode_terminal_asm, 1,3
+%ifnidn %1, bmi2
+cglobal cabac_encode_terminal_%1, 1,3
sub dword [r0+cb.range], 2
; shortcut: the renormalization shift in terminal
; can only be 0 or 1 and is zero over 99% of the time.
movifnidn t0, r0 ; WIN64
mov t3d, [r0+cb.queue]
mov t6d, [t0+cb.low]
+%endif
-cabac_putbyte:
+cabac_putbyte_%1:
; alive: t0=cb t3=queue t6=low
%if WIN64
DECLARE_REG_TMP 3,6,1,0,2,5,4
%endif
+%ifidn %1, bmi2
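+ ; shrx extracts the pending byte(s) from the top of low and bzhi clears those
+ ; bits in place, replacing the mask build (-1/not/and) of the generic path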
+ add t3d, 10
+ shrx t2d, t6d, t3d
+ bzhi t6d, t6d, t3d
+ sub t3d, 18
+%else
mov t1d, -1
add t3d, 10
mov t2d, t6d
not t1d
sub t3d, 18
and t6d, t1d
+%endif
mov t5d, [t0+cb.bytes_outstanding]
cmp t2b, 0xff ; FIXME is a 32bit op faster?
jz .postpone
.postpone:
inc t5d
mov [t0+cb.bytes_outstanding], t5d
- jmp mangle(x264_cabac_encode_decision_asm.update_queue_low)
+ jmp mangle(x264_cabac_encode_decision_%1.update_queue_low)
+%endmacro
+
+CABAC asm
+CABAC bmi2
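+; the generic variant keeps the _asm suffix so existing callers are unchanged;
+; asm call sites pick the bmi2 variant via cpuflag(bmi2) in CALL_CABAC below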
; %1 = label name
; %2 = node_ctx init?
;-----------------------------------------------------------------------------
%macro CALL_CABAC 0
+%if cpuflag(bmi2)
+ call cabac_encode_decision_bmi2
+%else
call cabac_encode_decision_asm
+%endif
%if WIN64 ; move cabac back
mov r0, r3
%endif
movzx nodectxd, byte [coeff_abs_level_transition+8+nodectxq GLOBAL]
.level_sign:
mov r1d, r11d
+%if cpuflag(bmi2)
+ call cabac_encode_bypass_bmi2
+%else
call cabac_encode_bypass_asm
+%endif
%if WIN64
mov r0, r3
%endif
CABAC_RESIDUAL coeff_last_sse2
INIT_XMM sse2,lzcnt
CABAC_RESIDUAL coeff_last_sse2_lzcnt
+INIT_XMM avx2,bmi2
+CABAC_RESIDUAL coeff_last_avx2_lzcnt
%endif
%assign cpuflags_misalign (1<<20)
%assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant
%assign cpuflags_atom (1<<22)
-%assign cpuflags_bmi1 (1<<23)
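+; every CPU with BMI1 also supports LZCNT, so bmi1 now implies lzcnt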
+%assign cpuflags_bmi1 (1<<23)|cpuflags_lzcnt
%assign cpuflags_bmi2 (1<<24)|cpuflags_bmi1
-%assign cpuflags_tbm (1<<25)|cpuflags_bmi1
%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
x264_cabac_encode_bypass( cb, coeff_sign );
} while( --coeff_idx >= 0 );
}
-static void x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
+
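+/* non-static so that checkasm can call the plain C implementation directly
+ * instead of #including encoder/cabac.c */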
+void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
+{
+ x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0 );
+}
+
+static ALWAYS_INLINE void x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
{
#if ARCH_X86_64 && HAVE_MMX
h->bsf.cabac_block_residual_internal( l, MB_INTERLACED, ctx_block_cat, cb );
#else
- x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0 );
+ x264_cabac_block_residual_c( h, cb, ctx_block_cat, l );
#endif
}
static void x264_cabac_block_residual_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
if( !strcmp(x264_cpu_names[i].name, "SSE4.1")
&& (h->param.cpu & X264_CPU_SSE42) )
continue;
+ if( !strcmp(x264_cpu_names[i].name, "BMI1")
+ && (h->param.cpu & X264_CPU_BMI2) )
+ continue;
if( (h->param.cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags
&& (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) )
p += sprintf( p, " %s", x264_cpu_names[i].name );
#include <ctype.h>
#include "common/common.h"
#include "common/cpu.h"
-#include "encoder/cabac.c"
// GCC doesn't align stack variables on ARM, so use .bss
#if ARCH_ARM
#define run_cabac_terminal_asm run_cabac_terminal_c
#endif
-static void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
-{
- x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0 );
- cb->p = cb->p_start;
-}
-
-/* Wrapper to roll back the pointer to avoid running out of memory bounds during
- * benchmark repetitions. Introduces slight bias into the test, but not too much. */
-static void x264_cabac_block_residual_asm( void (*c)( dctcoef *, int, intptr_t, x264_cabac_t * ),
- dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb )
-{
- c( l, b_interlaced, ctx_block_cat, cb );
- cb->p = cb->p_start;
-}
-
+extern const uint8_t x264_count_cat_m1[14];
+void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l );
void x264_cabac_block_residual_8x8_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l );
void x264_cabac_block_residual_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l );
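+/* Each iteration writes into its own 64K bitstream buffer and only every 16th
+ * input is benchmarked, so the old wrapper that rolled cb->p back between
+ * benchmark repetitions is no longer needed. */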
#define CABAC_RESIDUAL(name, start, end, rd)\
{\
- static int cabac_checked = 0;\
- if( bs_a.name##_internal && (bs_a.name##_internal != bs_ref.name##_internal || ((cpu_new&X264_CPU_SSE2) && !cabac_checked)) )\
+ if( bs_a.name##_internal && (bs_a.name##_internal != bs_ref.name##_internal || (cpu_new&X264_CPU_SSE2_IS_SLOW)) )\
{\
- cabac_checked = 1;\
used_asm = 1;\
set_func_name( #name );\
for( int i = 0; i < 2; i++ )\
{\
for( int j = 0; j < 256; j++ )\
{\
- ALIGNED_ARRAY_16( dctcoef, dct, [2],[64] );\
+ ALIGNED_ARRAY_N( dctcoef, dct, [2],[64] );\
+ uint8_t bitstream[2][1<<16];\
static const uint8_t ctx_ac[14] = {0,1,0,0,1,0,0,1,0,0,0,1,0,0};\
int ac = ctx_ac[ctx_block_cat];\
int nz = 0;\
x264_cabac_t cb[2];\
x264_cabac_context_init( &h, &cb[0], SLICE_TYPE_P, 26, 0 );\
x264_cabac_context_init( &h, &cb[1], SLICE_TYPE_P, 26, 0 );\
- x264_cabac_encode_init( &cb[0], buf3, buf3+0x3f0 );\
- x264_cabac_encode_init( &cb[1], buf4, buf4+0x3f0 );\
+ x264_cabac_encode_init( &cb[0], bitstream[0], bitstream[0]+0xfff0 );\
+ x264_cabac_encode_init( &cb[1], bitstream[1], bitstream[1]+0xfff0 );\
cb[0].f8_bits_encoded = 0;\
cb[1].f8_bits_encoded = 0;\
- if( !rd ) memcpy( buf4, buf3, 0x400 );\
+ if( !rd ) memcpy( bitstream[1], bitstream[0], 0x400 );\
call_c1( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\
call_a1( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\
ok = cb[0].f8_bits_encoded == cb[1].f8_bits_encoded && !memcmp(cb[0].state, cb[1].state, 1024);\
- if( !rd ) ok |= !memcmp( buf3, buf4, 0x400 ) && !memcmp( &cb[1], &cb[0], offsetof(x264_cabac_t, p_start) );\
+ if( !rd ) ok |= !memcmp( bitstream[1], bitstream[0], 0x400 ) && !memcmp( &cb[1], &cb[0], offsetof(x264_cabac_t, p_start) );\
if( !ok )\
{\
fprintf( stderr, #name " : [FAILED] ctx_block_cat %d", (int)ctx_block_cat );\
fprintf( stderr, "\n");\
goto name##fail;\
}\
- call_c2( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\
- if( rd ) call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\
- else call_a2( x264_cabac_block_residual_asm, bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\
+ if( (j&15) == 0 )\
+ {\
+ call_c2( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\
+ call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\
+ }\
}\
}\
}\
CABAC_RESIDUAL( cabac_block_residual, 0, DCT_LUMA_8x8, 0 )
report( "cabac residual:" );
+ ok = 1; used_asm = 0;
CABAC_RESIDUAL( cabac_block_residual_rd, 0, DCT_LUMA_8x8-1, 1 )
CABAC_RESIDUAL( cabac_block_residual_8x8_rd, DCT_LUMA_8x8, DCT_LUMA_8x8, 1 )
report( "cabac residual rd:" );
if( cpu_ref || run_cabac_decision_c == run_cabac_decision_asm )
return ret;
- used_asm = 1;
+ ok = 1; used_asm = 0;
x264_cabac_init( &h );
set_func_name( "cabac_encode_decision" );
if( x264_cpu_detect() & X264_CPU_BMI1 )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" );
- if( x264_cpu_detect() & X264_CPU_BMI2 )
- {
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI2, "BMI2" );
- cpu1 &= ~X264_CPU_BMI2;
- }
cpu1 &= ~X264_CPU_BMI1;
}
if( x264_cpu_detect() & X264_CPU_AVX2 )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" );
+ if( x264_cpu_detect() & X264_CPU_BMI2 )
+ {
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1|X264_CPU_BMI2, "BMI2" );
+ cpu1 &= ~(X264_CPU_BMI1|X264_CPU_BMI2);
+ }
if( x264_cpu_detect() & X264_CPU_FMA3 )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" );