From 2b2f039512bde7c097280255c6376cf9a901e08e Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Sat, 25 Mar 2017 19:14:22 +0100 Subject: [PATCH] x86: AVX-512 zigzag_scan_4x4_frame --- common/common.h | 4 ++-- common/dct.c | 8 ++++++++ common/x86/dct-a.asm | 24 +++++++++++++++++++++++- common/x86/dct.h | 11 ++++++----- encoder/macroblock.c | 18 +++++++++--------- encoder/macroblock.h | 2 +- encoder/rdo.c | 2 +- tools/checkasm.c | 8 ++++---- 8 files changed, 54 insertions(+), 23 deletions(-) diff --git a/common/common.h b/common/common.h index 8cc1dc1e..e14dec7d 100644 --- a/common/common.h +++ b/common/common.h @@ -635,11 +635,11 @@ struct x264_t /* Current MB DCT coeffs */ struct { - ALIGNED_32( dctcoef luma16x16_dc[3][16] ); + ALIGNED_64( dctcoef luma16x16_dc[3][16] ); ALIGNED_16( dctcoef chroma_dc[2][8] ); // FIXME share memory? ALIGNED_32( dctcoef luma8x8[12][64] ); - ALIGNED_32( dctcoef luma4x4[16*3][16] ); + ALIGNED_64( dctcoef luma4x4[16*3][16] ); } dct; /* MB table and cache for current frame/mb */ diff --git a/common/dct.c b/common/dct.c index a270c4cf..8ebb9ba5 100644 --- a/common/dct.c +++ b/common/dct.c @@ -986,6 +986,10 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx; } #endif // ARCH_X86_64 + if( cpu&X264_CPU_AVX512 ) + { + pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512; + } #endif // HAVE_MMX #else #if HAVE_MMX @@ -1026,6 +1030,10 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_xop; pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_xop; } + if( cpu&X264_CPU_AVX512 ) + { + pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512; + } #endif // HAVE_MMX #if HAVE_ALTIVEC if( cpu&X264_CPU_ALTIVEC ) diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm index c2f8973b..ad457237 100644 --- a/common/x86/dct-a.asm +++ b/common/x86/dct-a.asm @@ -30,7 +30,13 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA 32 +SECTION_RODATA 64 +%if HIGH_BIT_DEPTH +scan_frame_avx512: dd 0, 4, 1, 2, 5, 8,12, 9, 6, 3, 7,10,13,14,11,15 +%else +scan_frame_avx512: dw 0, 4, 1, 2, 5, 8,12, 9, 6, 3, 7,10,13,14,11,15 +%endif + pw_ppmmmmpp: dw 1,1,-1,-1,-1,-1,1,1 pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15 pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15 @@ -1883,3 +1889,19 @@ cglobal zigzag_interleave_8x8_cavlc, 3,3,6 mov [r2+8], r0w RET %endif ; !HIGH_BIT_DEPTH + +%if HIGH_BIT_DEPTH +INIT_ZMM avx512 +cglobal zigzag_scan_4x4_frame, 2,2 + mova m0, [scan_frame_avx512] + vpermd m0, m0, [r1] + mova [r0], m0 + RET +%else ; !HIGH_BIT_DEPTH +INIT_YMM avx512 +cglobal zigzag_scan_4x4_frame, 2,2 + mova m0, [scan_frame_avx512] + vpermw m0, m0, [r1] + mova [r0], m0 + RET +%endif ; !HIGH_BIT_DEPTH diff --git a/common/x86/dct.h b/common/x86/dct.h index 67221c39..ce88c7e4 100644 --- a/common/x86/dct.h +++ b/common/x86/dct.h @@ -106,11 +106,12 @@ void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] ); void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[64] ); void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] ); void x264_zigzag_scan_8x8_frame_mmx2 ( int16_t level[64], int16_t dct[64] ); -void x264_zigzag_scan_4x4_frame_xop ( dctcoef level[16], dctcoef dct[16] ); -void x264_zigzag_scan_4x4_frame_avx ( dctcoef level[16], dctcoef dct[16] ); -void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[16] ); -void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] ); -void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] ); +void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] ); +void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] ); +void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[16] ); +void x264_zigzag_scan_4x4_frame_avx ( dctcoef level[16], dctcoef dct[16] ); +void x264_zigzag_scan_4x4_frame_xop ( dctcoef level[16], dctcoef dct[16] ); +void x264_zigzag_scan_4x4_frame_avx512( dctcoef level[16], dctcoef dct[16] ); void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] ); void x264_zigzag_scan_4x4_field_sse ( int16_t level[16], int16_t dct[16] ); void x264_zigzag_scan_8x8_field_xop ( int16_t level[64], int16_t dct[64] ); diff --git a/encoder/macroblock.c b/encoder/macroblock.c index 87b076f5..3684e257 100644 --- a/encoder/macroblock.c +++ b/encoder/macroblock.c @@ -128,8 +128,8 @@ static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp ) pixel *p_src = h->mb.pic.p_fenc[p]; pixel *p_dst = h->mb.pic.p_fdec[p]; - ALIGNED_ARRAY_32( dctcoef, dct4x4,[16],[16] ); - ALIGNED_ARRAY_32( dctcoef, dct_dc4x4,[16] ); + ALIGNED_ARRAY_64( dctcoef, dct4x4,[16],[16] ); + ALIGNED_ARRAY_64( dctcoef, dct_dc4x4,[16] ); int nz, block_cbp = 0; int decimate_score = h->mb.b_dct_decimate ? 0 : 9; @@ -350,7 +350,7 @@ static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter int i_decimate_score = b_decimate ? 0 : 7; int nz_ac = 0; - ALIGNED_ARRAY_32( dctcoef, dct4x4,[8],[16] ); + ALIGNED_ARRAY_64( dctcoef, dct4x4,[8],[16] ); if( h->mb.b_lossless ) { @@ -824,7 +824,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_ } else { - ALIGNED_ARRAY_32( dctcoef, dct4x4,[16],[16] ); + ALIGNED_ARRAY_64( dctcoef, dct4x4,[16],[16] ); for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) { int quant_cat = p ? CQM_4PC : CQM_4PY; @@ -965,8 +965,8 @@ void x264_macroblock_encode( x264_t *h ) *****************************************************************************/ static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma ) { - ALIGNED_ARRAY_32( dctcoef, dct4x4,[8],[16] ); - ALIGNED_ARRAY_16( dctcoef, dctscan,[16] ); + ALIGNED_ARRAY_64( dctcoef, dct4x4,[8],[16] ); + ALIGNED_ARRAY_64( dctcoef, dctscan,[16] ); ALIGNED_4( int16_t mvp[2] ); int i_qp = h->mb.i_qp; @@ -1252,7 +1252,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE; pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE; int i_decimate_8x8 = b_decimate ? 0 : 4; - ALIGNED_ARRAY_32( dctcoef, dct4x4,[4],[16] ); + ALIGNED_ARRAY_64( dctcoef, dct4x4,[4],[16] ); int nnz8x8 = 0; h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec ); @@ -1311,7 +1311,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i i_qp = h->mb.i_chroma_qp; for( int ch = 0; ch < 2; ch++ ) { - ALIGNED_ARRAY_32( dctcoef, dct4x4,[2],[16] ); + ALIGNED_ARRAY_64( dctcoef, dct4x4,[2],[16] ); pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE; pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE; @@ -1376,7 +1376,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p4x4_internal( x264_t *h, int i } else { - ALIGNED_ARRAY_32( dctcoef, dct4x4,[16] ); + ALIGNED_ARRAY_64( dctcoef, dct4x4,[16] ); h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec ); nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i4 ); h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz; diff --git a/encoder/macroblock.h b/encoder/macroblock.h index 9ab47006..db85539f 100644 --- a/encoder/macroblock.h +++ b/encoder/macroblock.h @@ -116,7 +116,7 @@ static ALWAYS_INLINE void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_ int nz; pixel *p_src = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[idx]]; pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[idx]]; - ALIGNED_ARRAY_32( dctcoef, dct4x4,[16] ); + ALIGNED_ARRAY_64( dctcoef, dct4x4,[16] ); if( b_predict ) { diff --git a/encoder/rdo.c b/encoder/rdo.c index bd2eafb5..12cf3276 100644 --- a/encoder/rdo.c +++ b/encoder/rdo.c @@ -635,7 +635,7 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct, int b_chroma, int dc, int num_coefs, int idx ) { ALIGNED_ARRAY_32( dctcoef, orig_coefs, [64] ); - ALIGNED_ARRAY_32( dctcoef, quant_coefs, [64] ); + ALIGNED_ARRAY_64( dctcoef, quant_coefs, [64] ); const uint32_t *coef_weight1 = num_coefs == 64 ? x264_dct8_weight_tab : x264_dct4_weight_tab; const uint32_t *coef_weight2 = num_coefs == 64 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab; const int b_interlaced = MB_INTERLACED; diff --git a/tools/checkasm.c b/tools/checkasm.c index 75899dfe..02c84989 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -839,9 +839,9 @@ static int check_dct( int cpu_ref, int cpu_new ) x264_dct_function_t dct_asm; x264_quant_function_t qf; int ret = 0, ok, used_asm, interlace = 0; - ALIGNED_ARRAY_32( dctcoef, dct1, [16],[16] ); + ALIGNED_ARRAY_64( dctcoef, dct1, [16],[16] ); ALIGNED_ARRAY_32( dctcoef, dct2, [16],[16] ); - ALIGNED_ARRAY_32( dctcoef, dct4, [16],[16] ); + ALIGNED_ARRAY_64( dctcoef, dct4, [16],[16] ); ALIGNED_ARRAY_32( dctcoef, dct8, [4],[64] ); ALIGNED_16( dctcoef dctdc[2][8] ); x264_t h_buf; @@ -1044,8 +1044,8 @@ static int check_dct( int cpu_ref, int cpu_new ) x264_zigzag_function_t zigzag_ref[2]; x264_zigzag_function_t zigzag_asm[2]; - ALIGNED_ARRAY_16( dctcoef, level1,[64] ); - ALIGNED_ARRAY_16( dctcoef, level2,[64] ); + ALIGNED_ARRAY_64( dctcoef, level1,[64] ); + ALIGNED_ARRAY_64( dctcoef, level2,[64] ); #define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \ if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \ -- 2.40.0