/* Prototypes for the assembly-implemented CABAC residual coders.
 * All variants share one C-visible signature:
 *   (coeff array, MB_INTERLACED flag, ctx_block_cat, cabac state).
 * Lines prefixed with '+' appear to be unified-diff additions introducing
 * the AVX-512 variants -- NOTE(review): this file looks like a patch
 * fragment, not plain source; confirm before editing further. */
void x264_cabac_block_residual_rd_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_rd_internal_ssse3 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_rd_internal_avx512 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_8x8_rd_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_8x8_rd_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_8x8_rd_internal_ssse3 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_8x8_rd_internal_avx512 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_internal_avx2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_internal_avx512( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
uint8_t *x264_nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end );
pf->nal_escape = x264_nal_escape_avx2;
pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx2;
}
+
+ if( cpu&X264_CPU_AVX512 )
+ {
+ pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx512;
+ pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_avx512;
+ pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_avx512;
+ }
#endif
#endif
#if HAVE_ARMV6
; External coeff_last_* symbols, referenced by the coeff_last_* dispatch
; tables below (implementations live in another file -- presumably
; quant-a.asm; not visible in this chunk).
; '+'-prefixed lines appear to be unified-diff additions for the AVX-512
; variants -- NOTE(review): this file looks like a patch fragment.
cextern coeff_last4_mmx2
cextern coeff_last4_lzcnt
+cextern coeff_last4_avx512
cextern coeff_last15_sse2
cextern coeff_last15_lzcnt
+cextern coeff_last15_avx512
cextern coeff_last16_sse2
cextern coeff_last16_lzcnt
+cextern coeff_last16_avx512
cextern coeff_last64_sse2
cextern coeff_last64_lzcnt
cextern coeff_last64_avx2
+cextern coeff_last64_avx512
%ifdef PIC
SECTION .data
coeff_last_sse2: COEFF_LAST_TABLE mmx2, sse2, sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
coeff_last_lzcnt: COEFF_LAST_TABLE lzcnt, lzcnt, lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
coeff_last_avx2: COEFF_LAST_TABLE lzcnt, avx2, lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
+%if HIGH_BIT_DEPTH
+coeff_last_avx512: COEFF_LAST_TABLE avx512, avx512, avx512, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
+%else
+coeff_last_avx512: COEFF_LAST_TABLE lzcnt, avx512, avx512, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
+%endif
%endif
SECTION .text
%endmacro
; ABS_DCTCOEFS src, count
; Store the absolute values of `count` dct coefficients from [%1] into the
; stack buffer at [rsp].  ABSD is used when HIGH_BIT_DEPTH (wider coeffs),
; ABSW otherwise.
; Diff intent visible in this hunk: the '-' lines hard-coded 64/32-byte
; strides (XMM-era), while the '+' replacement sizes everything by mmsize so
; the same macro serves XMM/YMM/ZMM: the whole block in one register, in two
; registers, or a four-registers-per-iteration %rep loop.
; NOTE(review): SIZEOF_DCTCOEF and the ABSD/ABSW macros are defined elsewhere
; (presumably x86util.asm) -- not visible in this chunk; confirm that
; %2*SIZEOF_DCTCOEF is always a multiple of 4*mmsize on the %rep path.
%macro ABS_DCTCOEFS 2
-%assign i 0
-%rep %2/16
%if HIGH_BIT_DEPTH
- ABSD m0, [%1+ 0+i*64], m4
- ABSD m1, [%1+16+i*64], m5
- ABSD m2, [%1+32+i*64], m4
- ABSD m3, [%1+48+i*64], m5
- mova [rsp+ 0+i*64], m0
- mova [rsp+16+i*64], m1
- mova [rsp+32+i*64], m2
- mova [rsp+48+i*64], m3
+ %define %%abs ABSD
%else
- ABSW m0, [%1+ 0+i*32], m2
- ABSW m1, [%1+16+i*32], m3
- mova [rsp+ 0+i*32], m0
- mova [rsp+16+i*32], m1
-%endif
+ %define %%abs ABSW
+%endif
; Case 1: one full vector register covers the whole coefficient block.
+%if mmsize == %2*SIZEOF_DCTCOEF
+ %%abs m0, [%1], m1
+ mova [rsp], m0
; Case 2: two registers cover it.
+%elif mmsize == %2*SIZEOF_DCTCOEF/2
+ %%abs m0, [%1+0*mmsize], m2
+ %%abs m1, [%1+1*mmsize], m3
+ mova [rsp+0*mmsize], m0
+ mova [rsp+1*mmsize], m1
; Case 3: general loop, four registers per iteration.
+%else
+%assign i 0
+%rep %2*SIZEOF_DCTCOEF/(4*mmsize)
+ %%abs m0, [%1+(4*i+0)*mmsize], m4
+ %%abs m1, [%1+(4*i+1)*mmsize], m5
+ %%abs m2, [%1+(4*i+2)*mmsize], m4
+ %%abs m3, [%1+(4*i+3)*mmsize], m5
+ mova [rsp+(4*i+0)*mmsize], m0
+ mova [rsp+(4*i+1)*mmsize], m1
+ mova [rsp+(4*i+2)*mmsize], m2
+ mova [rsp+(4*i+3)*mmsize], m3
%assign i i+1
%endrep
+%endif
%endmacro
%macro SIG_OFFSET 1
%endif
%ifdef PIC
- cglobal func, 4,13
+ cglobal func, 4,13,6,-maxcoeffs*SIZEOF_DCTCOEF
lea r12, [$$]
%define GLOBAL +r12-$$
%else
- cglobal func, 4,12
+ cglobal func, 4,12,6,-maxcoeffs*SIZEOF_DCTCOEF
%define GLOBAL
%endif
-%assign pad gprsize+SIZEOF_DCTCOEF*maxcoeffs-(stack_offset&15)
- SUB rsp, pad
shl r1d, 4 ; MB_INTERLACED*16
%if %1
lea r4, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL] ; r12 = sig offset 8x8
ABS_DCTCOEFS r0, 64
%else
mov r4, r0 ; r4 = dct
- mov r6, ~SIZEOF_DCTCOEF
- and r6, r4 ; handle AC coefficient case
- ABS_DCTCOEFS r6, 16
- sub r4, r6 ; calculate our new dct pointer
+ and r4, ~SIZEOF_DCTCOEF ; handle AC coefficient case
+ ABS_DCTCOEFS r4, 16
+ xor r4, r0 ; calculate our new dct pointer
add r4, rsp ; restore AC coefficient offset
%endif
- mov r1, [%2+gprsize*r2 GLOBAL]
; for improved OOE performance, run coeff_last on the original coefficients.
- call r1 ; coeff_last[ctx_block_cat]( dct )
+ call [%2+gprsize*r2 GLOBAL] ; coeff_last[ctx_block_cat]( dct )
; we know on 64-bit that the SSE2 versions of this function only
; overwrite r0, r1, and rax (r6). last64 overwrites r2 too, but we
; don't need r2 in 8x8 mode.
jge .coeff_loop
.end:
mov [r3+cb.bits_encoded-cb.state], r0d
- ADD rsp, pad
RET
%endmacro
INIT_XMM ssse3,lzcnt
CABAC_RESIDUAL_RD 0, coeff_last_lzcnt
CABAC_RESIDUAL_RD 1, coeff_last_lzcnt
+%if HIGH_BIT_DEPTH
+INIT_ZMM avx512
+%else
+INIT_YMM avx512
+%endif
+CABAC_RESIDUAL_RD 0, coeff_last_avx512
+INIT_ZMM avx512
+CABAC_RESIDUAL_RD 1, coeff_last_avx512
%endif
;-----------------------------------------------------------------------------
%endmacro
; CABAC_RESIDUAL coeff_last_table
; Emits cabac_block_residual_internal for the given coeff_last dispatch table.
; Diff intent visible in this hunk:
;  * the '+' cglobal form lets the framework allocate the 4*64-byte aligned
;    stack buffer, replacing the manual SUB/ADD rsp pad ('-' lines), so the
;    explicit `pad` bookkeeping and the epilogue ADD both disappear;
;  * the two-step "mov r1, [table] / call r1" becomes a single memory-indirect
;    `call [table + gprsize*r2]`.
; NOTE(review): context lines appear to be elided between several lines of
; this hunk (e.g. around the %define block) -- do not assume the body shown
; here is contiguous source; consult the full file before further edits.
%macro CABAC_RESIDUAL 1
-cglobal cabac_block_residual_internal, 4,15
+cglobal cabac_block_residual_internal, 4,15,0,-4*64
%ifdef PIC
; if we use the same r7 as in cabac_encode_decision, we can cheat and save a register.
lea r7, [$$]
%define lastm r7d
%define GLOBAL
%endif
-%assign pad gprsize+4*2+4*64-(stack_offset&15)
- SUB rsp, pad
shl r1d, 4
%define sigoffq r8
mov dct, r0
mov leveloffm, leveloffd
; Indirect dispatch: coeff_last[ctx_block_cat]( dct ), result in eax.
- mov r1, [%1+gprsize*r2 GLOBAL]
- call r1
+ call [%1+gprsize*r2 GLOBAL]
mov lastm, eax
; put cabac in r0; needed for cabac_encode_decision
mov r0, r3
%endif
dec coeffidxd
jge .level_loop
- ADD rsp, pad
RET
%endmacro
CABAC_RESIDUAL coeff_last_lzcnt
INIT_XMM avx2
CABAC_RESIDUAL coeff_last_avx2
+INIT_XMM avx512
+CABAC_RESIDUAL coeff_last_avx512
%endif