/* Current MB DCT coeffs */
struct
{
- ALIGNED_32( dctcoef luma16x16_dc[3][16] );
+ ALIGNED_64( dctcoef luma16x16_dc[3][16] ); /* widened to 64-byte alignment: the AVX-512 zigzag kernel uses aligned ZMM loads/stores (mova) on 16-coeff blocks */
ALIGNED_16( dctcoef chroma_dc[2][8] );
// FIXME share memory?
ALIGNED_32( dctcoef luma8x8[12][64] );
- ALIGNED_32( dctcoef luma4x4[16*3][16] );
+ ALIGNED_64( dctcoef luma4x4[16*3][16] ); /* same reason: each 16-entry row is 64 bytes at high bit depth */
} dct;
/* MB table and cache for current frame/mb */
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx;
}
#endif // ARCH_X86_64
+ if( cpu&X264_CPU_AVX512 )
+ {
+ pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512;
+ }
#endif // HAVE_MMX
#else
#if HAVE_MMX
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_xop;
pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_xop;
}
+ if( cpu&X264_CPU_AVX512 )
+ {
+ pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512;
+ }
#endif // HAVE_MMX
#if HAVE_ALTIVEC
if( cpu&X264_CPU_ALTIVEC )
%include "x86inc.asm"
%include "x86util.asm"
-SECTION_RODATA 32
+SECTION_RODATA 64
+%if HIGH_BIT_DEPTH
+scan_frame_avx512: dd 0, 4, 1, 2, 5, 8,12, 9, 6, 3, 7,10,13,14,11,15 ; dword indices for vpermd: 16 dwords = 64 bytes (one ZMM); section bumped to 64 so the aligned load is legal
+%else
+scan_frame_avx512: dw 0, 4, 1, 2, 5, 8,12, 9, 6, 3, 7,10,13,14,11,15 ; word indices for vpermw: 16 words = 32 bytes (one YMM)
+%endif
+
pw_ppmmmmpp: dw 1,1,-1,-1,-1,-1,1,1
pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
mov [r2+8], r0w
RET
%endif ; !HIGH_BIT_DEPTH
+
;-----------------------------------------------------------------------------
; void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] )
; Whole 4x4 block permuted in a single register: the scan table holds, for each
; output position, the index of the source coefficient.
;-----------------------------------------------------------------------------
+%if HIGH_BIT_DEPTH
+INIT_ZMM avx512
+cglobal zigzag_scan_4x4_frame, 2,2
+ mova m0, [scan_frame_avx512] ; 16 dword indices
+ vpermd m0, m0, [r1] ; gather dct[] coeffs into zigzag order (32-bit lanes)
+ mova [r0], m0 ; store level[16] — requires 64-byte-aligned r0
+ RET
+%else ; !HIGH_BIT_DEPTH
+INIT_YMM avx512
+cglobal zigzag_scan_4x4_frame, 2,2
+ mova m0, [scan_frame_avx512] ; 16 word indices
+ vpermw m0, m0, [r1] ; gather dct[] coeffs into zigzag order (16-bit lanes)
+ mova [r0], m0 ; store level[16] — requires 32-byte-aligned r0
+ RET
+%endif ; !HIGH_BIT_DEPTH
void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] );
void x264_zigzag_scan_8x8_frame_mmx2 ( int16_t level[64], int16_t dct[64] );
-void x264_zigzag_scan_4x4_frame_xop ( dctcoef level[16], dctcoef dct[16] );
-void x264_zigzag_scan_4x4_frame_avx ( dctcoef level[16], dctcoef dct[16] );
-void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[16] );
-void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
-void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] );
/* 4x4 frame scans, reordered oldest -> newest ISA; dctcoef-typed entries work
 * at either bit depth, int16_t/int32_t entries are depth-specific. */
+void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
+void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_4x4_frame_avx ( dctcoef level[16], dctcoef dct[16] );
+void x264_zigzag_scan_4x4_frame_xop ( dctcoef level[16], dctcoef dct[16] );
+void x264_zigzag_scan_4x4_frame_avx512( dctcoef level[16], dctcoef dct[16] ); /* new: requires 64-byte-aligned buffers at high bit depth */
void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] );
void x264_zigzag_scan_4x4_field_sse ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_8x8_field_xop ( int16_t level[64], int16_t dct[64] );
pixel *p_src = h->mb.pic.p_fenc[p];
pixel *p_dst = h->mb.pic.p_fdec[p];
- ALIGNED_ARRAY_32( dctcoef, dct4x4,[16],[16] );
- ALIGNED_ARRAY_32( dctcoef, dct_dc4x4,[16] );
+ ALIGNED_ARRAY_64( dctcoef, dct4x4,[16],[16] );
+ ALIGNED_ARRAY_64( dctcoef, dct_dc4x4,[16] );
int nz, block_cbp = 0;
int decimate_score = h->mb.b_dct_decimate ? 0 : 9;
int i_decimate_score = b_decimate ? 0 : 7;
int nz_ac = 0;
- ALIGNED_ARRAY_32( dctcoef, dct4x4,[8],[16] );
+ ALIGNED_ARRAY_64( dctcoef, dct4x4,[8],[16] );
if( h->mb.b_lossless )
{
}
else
{
- ALIGNED_ARRAY_32( dctcoef, dct4x4,[16],[16] );
+ ALIGNED_ARRAY_64( dctcoef, dct4x4,[16],[16] );
for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
{
int quant_cat = p ? CQM_4PC : CQM_4PY;
*****************************************************************************/
static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma )
{
- ALIGNED_ARRAY_32( dctcoef, dct4x4,[8],[16] );
- ALIGNED_ARRAY_16( dctcoef, dctscan,[16] );
+ ALIGNED_ARRAY_64( dctcoef, dct4x4,[8],[16] );
+ ALIGNED_ARRAY_64( dctcoef, dctscan,[16] );
ALIGNED_4( int16_t mvp[2] );
int i_qp = h->mb.i_qp;
pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
int i_decimate_8x8 = b_decimate ? 0 : 4;
- ALIGNED_ARRAY_32( dctcoef, dct4x4,[4],[16] );
+ ALIGNED_ARRAY_64( dctcoef, dct4x4,[4],[16] );
int nnz8x8 = 0;
h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
i_qp = h->mb.i_chroma_qp;
for( int ch = 0; ch < 2; ch++ )
{
- ALIGNED_ARRAY_32( dctcoef, dct4x4,[2],[16] );
+ ALIGNED_ARRAY_64( dctcoef, dct4x4,[2],[16] );
pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;
}
else
{
- ALIGNED_ARRAY_32( dctcoef, dct4x4,[16] );
+ ALIGNED_ARRAY_64( dctcoef, dct4x4,[16] );
h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i4 );
h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz;
int nz;
pixel *p_src = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[idx]];
pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[idx]];
- ALIGNED_ARRAY_32( dctcoef, dct4x4,[16] );
+ ALIGNED_ARRAY_64( dctcoef, dct4x4,[16] );
if( b_predict )
{
int b_chroma, int dc, int num_coefs, int idx )
{
ALIGNED_ARRAY_32( dctcoef, orig_coefs, [64] );
- ALIGNED_ARRAY_32( dctcoef, quant_coefs, [64] );
+ ALIGNED_ARRAY_64( dctcoef, quant_coefs, [64] );
const uint32_t *coef_weight1 = num_coefs == 64 ? x264_dct8_weight_tab : x264_dct4_weight_tab;
const uint32_t *coef_weight2 = num_coefs == 64 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
const int b_interlaced = MB_INTERLACED;
x264_dct_function_t dct_asm;
x264_quant_function_t qf;
int ret = 0, ok, used_asm, interlace = 0;
- ALIGNED_ARRAY_32( dctcoef, dct1, [16],[16] );
+ ALIGNED_ARRAY_64( dctcoef, dct1, [16],[16] );
ALIGNED_ARRAY_32( dctcoef, dct2, [16],[16] );
- ALIGNED_ARRAY_32( dctcoef, dct4, [16],[16] );
+ ALIGNED_ARRAY_64( dctcoef, dct4, [16],[16] );
ALIGNED_ARRAY_32( dctcoef, dct8, [4],[64] );
ALIGNED_16( dctcoef dctdc[2][8] );
x264_t h_buf;
x264_zigzag_function_t zigzag_ref[2];
x264_zigzag_function_t zigzag_asm[2];
- ALIGNED_ARRAY_16( dctcoef, level1,[64] );
- ALIGNED_ARRAY_16( dctcoef, level2,[64] );
+ ALIGNED_ARRAY_64( dctcoef, level1,[64] );
+ ALIGNED_ARRAY_64( dctcoef, level2,[64] );
#define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \
if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \