From: Henrik Gramner
Date: Tue, 4 Apr 2017 18:54:12 +0000 (+0200)
Subject: x86: AVX-512 dequant_8x8
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=40aca29a164d5e5e6589d507bdcae6717d72f6bf;p=libx264

x86: AVX-512 dequant_8x8
---

diff --git a/common/quant.c b/common/quant.c
index 5710356a..37ae7b92 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -561,6 +561,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
     if( cpu&X264_CPU_AVX512 )
     {
         pf->dequant_4x4 = x264_dequant_4x4_avx512;
+        pf->dequant_8x8 = x264_dequant_8x8_avx512;
         pf->coeff_last4 = x264_coeff_last4_avx512;
         pf->coeff_last8 = x264_coeff_last8_avx512;
         pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512;
@@ -729,7 +730,10 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
     if( cpu&X264_CPU_AVX512 )
     {
         if( h->param.i_cqm_preset != X264_CQM_FLAT )
+        {
             pf->dequant_4x4 = x264_dequant_4x4_avx512;
+            pf->dequant_8x8 = x264_dequant_8x8_avx512;
+        }
         pf->coeff_last8 = x264_coeff_last8_avx512;
         pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512;
         pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_avx512;
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index b797e418..0803b6aa 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -30,7 +30,12 @@
 %include "x86inc.asm"
 %include "x86util.asm"
 
-SECTION_RODATA 32
+SECTION_RODATA 64
+
+%if HIGH_BIT_DEPTH == 0
+dequant_shuf_avx512: dw  0, 2, 4, 6, 8,10,12,14,16,18,20,22,24,26,28,30 ; vpermt2w selector: every even word index (0..30 here, 32..62 below)
+                     dw 32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62 ; indices >= 32 pick from the second table operand
+%endif
 
 %macro DQM4 3
     dw %1, %2, %1, %2, %2, %3, %2, %3
@@ -800,6 +805,78 @@ cglobal dequant_4x4, 0,3
 %endif
     RET
 
+cglobal dequant_8x8, 0,3 ; AVX-512 dequant of an 8x8 block; C prototype: ( dctcoef dct[64], int dequant_mf[6][64], int i_qp )
+    DEQUANT_START_AVX512 8 ; macro defined outside this patch; presumably sets up r0, dmf and t0d - confirm
+    mova          m0, [dmf+0*64] ; load four 64-byte vectors (64 dwords total) from dmf
+    mova          m1, [dmf+1*64]
+    mova          m2, [dmf+2*64]
+    mova          m3, [dmf+3*64]
+%if HIGH_BIT_DEPTH
+    pmaddwd       m0, [r0+0*64] ; word-pair multiply-add of the dmf vectors with the data at r0, dword results
+    pmaddwd       m1, [r0+1*64]
+    pmaddwd       m2, [r0+2*64]
+    pmaddwd       m3, [r0+3*64]
+%else
+    mova          m6, [dequant_shuf_avx512] ; even-word selector, used to narrow dwords to words
+%endif
+    sub          t0d, 6 ; t0d -= 6; per the comment further down the result is i_qbits
+    jl .rshift ; negative result -> rounded right-shift path
+%if HIGH_BIT_DEPTH
+    vpbroadcastd  m4, t0d ; left-shift path: broadcast the shift count to every dword
+    vpsllvd       m0, m4 ; shift each dword product left by t0d
+    vpsllvd       m1, m4
+    vpsllvd       m2, m4
+    vpsllvd       m3, m4
+    jmp .end ; skip the right-shift code and store
+.rshift:
+%else
+    vpbroadcastw  m4, t0d ; left-shift path: broadcast the shift count to every word
+    vpermt2w      m0, m6, m1 ; keep the even words of m0:m1, i.e. the low 16 bits of each dword
+    vpermt2w      m2, m6, m3 ; same for m2:m3
+    pmullw        m0, [r0] ; 16-bit multiply with the first 32 words at r0
+    pmullw        m2, [r0+64] ; 16-bit multiply with the next 32 words at r0
+    vpsllvw       m0, m4 ; shift each word left by t0d
+    vpsllvw       m2, m4
+    mova        [r0], m0 ; store the results in place
+    mova     [r0+64], m2
+    RET
+.rshift:
+    pmovzxwd      m4, [r0+0*32] ; zero-extend 16 words at r0 to dwords
+    pmovzxwd      m5, [r0+1*32]
+    pmaddwd       m0, m4 ; word-pair multiply-add; the zero upper words of m4/m5 contribute nothing
+    pmaddwd       m1, m5
+    pmovzxwd      m4, [r0+2*32]
+    pmovzxwd      m5, [r0+3*32]
+    pmaddwd       m2, m4
+    pmaddwd       m3, m5
+%endif
+    mov          r1d, 1<<31
+    shrx         r1d, r1d, t0d ; 1 << (-i_qbits-1)
+    neg          t0d ; make the shift count positive
+    vpbroadcastd  m4, r1d ; m4 = rounding term in every dword
+    vpbroadcastd  m5, t0d ; m5 = right-shift count in every dword
+    paddd         m0, m4 ; add the rounding term before shifting
+    paddd         m1, m4
+    vpsravd       m0, m5 ; arithmetic right shift of each dword
+    vpsravd       m1, m5
+    paddd         m2, m4
+    paddd         m3, m4
+    vpsravd       m2, m5
+    vpsravd       m3, m5
+%if HIGH_BIT_DEPTH
+.end:
+    mova [r0+0*64], m0 ; store all four dword vectors back to r0
+    mova [r0+1*64], m1
+    mova [r0+2*64], m2
+    mova [r0+3*64], m3
+%else
+    vpermt2w      m0, m6, m1 ; narrow the dword results to words (even words of m0:m1)
+    vpermt2w      m2, m6, m3
+    mova        [r0], m0 ; store 2 x 32 words back to r0
+    mova     [r0+64], m2
+%endif
+    RET
+
 %undef dmf
 
 %macro DEQUANT_DC 2
diff --git a/common/x86/quant.h b/common/x86/quant.h
index 67cc1e6b..7d28670a 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -67,6 +67,7 @@ void x264_dequant_4x4_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
 void x264_dequant_4x4dc_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
 void x264_dequant_8x8_avx2( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
 void x264_dequant_4x4_avx512( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_avx512( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
 void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
 void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
 void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );