From: Henrik Gramner Date: Tue, 28 Mar 2017 20:59:56 +0000 (+0200) Subject: x86: AVX-512 mbtree_propagate_cost X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=3451ba3af49e58a720277615df3d8e4a4171986f;p=libx264 x86: AVX-512 mbtree_propagate_cost Also make the AVX and AVX2 implementations slightly faster. --- diff --git a/common/frame.c b/common/frame.c index 2cbcf1e5..a81e9b10 100644 --- a/common/frame.c +++ b/common/frame.c @@ -223,11 +223,13 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ) PREALLOC( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) ); PREALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) ); } - PREALLOC( frame->i_propagate_cost, (i_mb_count+7) * sizeof(uint16_t) ); + PREALLOC( frame->i_propagate_cost, i_mb_count * sizeof(uint16_t) ); for( int j = 0; j <= h->param.i_bframe+1; j++ ) for( int i = 0; i <= h->param.i_bframe+1; i++ ) - PREALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) ); + PREALLOC( frame->lowres_costs[j][i], i_mb_count * sizeof(uint16_t) ); + /* mbtree asm can overread the input buffers, make sure we don't read outside of allocated memory. */ + prealloc_size += NATIVE_ALIGN; } if( h->param.rc.i_aq_mode ) { diff --git a/common/macroblock.c b/common/macroblock.c index e5097a6d..0aa09de4 100644 --- a/common/macroblock.c +++ b/common/macroblock.c @@ -388,7 +388,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead ) ((me_range*2+24) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t)); scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa ); } - int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int16_t); + int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+15)&~15) * sizeof(int16_t); scratch_size = X264_MAX( scratch_size, buf_mbtree ); if( scratch_size ) CHECKED_MALLOC( h->scratch_buffer, scratch_size ); diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index b3cb4634..e0b4a0cd 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -2147,13 +2147,13 @@ MBTREE cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2) vbroadcastss m5, [r5] mov r5d, r6m - lea r0, [r0+r5*2] + lea r2, [r2+r5*2] add r5d, r5d - add r1, r5 - add r2, r5 - add r3, r5 add r4, r5 neg r5 + sub r1, r5 + sub r3, r5 + sub r0, r5 mova xm4, [pw_3fff] %if notcpuflag(avx2) pxor xm7, xm7 @@ -2165,9 +2165,8 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2) pmovzxwd m2, [r1+r5] ; prop pand xm3, xm4, [r3+r5] ; inter pmovzxwd m3, xm3 - pminsd m3, m0 pmaddwd m1, m0 - psubd m3, m0, m3 + psubusw m3, m0, m3 cvtdq2ps m0, m0 cvtdq2ps m1, m1 cvtdq2ps m2, m2 @@ -2184,7 +2183,7 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2) movu xm1, [r4+r5] movu xm2, [r1+r5] pand xm3, xm4, [r3+r5] - pminsw xm3, xm0 + psubusw xm3, xm0, xm3 INT16_UNPACK 0 INT16_UNPACK 1 INT16_UNPACK 2 @@ -2194,7 +2193,6 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2) cvtdq2ps m2, m2 cvtdq2ps m3, m3 mulps m1, m0 - subps m3, m0, m3 mulps m1, m5 ; intra*invq*fps_factor>>8 addps m1, m2 ; prop + (intra*invq*fps_factor>>8) rcpps m2, m0 ; 1 / intra 1st approximation @@ -2205,7 +2203,7 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2) subps m2, m0 ; 2nd approximation for 1/intra mulps m1, m2 ; / intra %endif - vcvtps2dq m1, m1 + cvtps2dq m1, m1 vextractf128 xm2, m1, 1 packssdw xm1, xm2 mova [r0+r5], xm1 @@ -2219,6 +2217,39 @@ MBTREE_AVX INIT_YMM avx2 MBTREE_AVX +INIT_ZMM avx512 +cglobal mbtree_propagate_cost, 6,6 + vbroadcastss m5, [r5] + mov r5d, 0x3fff3fff + vpbroadcastd ym4, r5d + mov r5d, r6m + lea r2, [r2+r5*2] + add r5d, r5d + add r1, r5 + neg r5 + sub r4, r5 + sub r3, r5 + sub r0, r5 +.loop: + pmovzxwd m0, [r2+r5] ; intra + pmovzxwd m1, [r1+r5] ; prop + pmovzxwd m2, [r4+r5] ; invq + pand ym3, ym4, [r3+r5] ; inter + pmovzxwd m3, ym3 + psubusw m3, m0, m3 + cvtdq2ps m0, m0 + cvtdq2ps m1, m1 + cvtdq2ps m2, m2 + cvtdq2ps m3, m3 + vdivps m1, m0, {rn-sae} + fmaddps m1, m2, m5, m1 + mulps m1, m3 + cvtps2dq m1, m1 + vpmovsdw [r0+r5], m1 + add r5, 32 + jl .loop + RET + %macro MBTREE_PROPAGATE_LIST 0 ;----------------------------------------------------------------------------- ; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int16_t *propagate_amount, uint16_t *lowres_costs, diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index 90a6cc19..f6e349a9 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -160,14 +160,16 @@ void x264_integral_init4v_avx2( uint16_t *sum8, uint16_t *sum4, intptr_t stride void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride ); void x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride ); void x264_integral_init8v_avx2( uint16_t *sum8, intptr_t stride ); -void x264_mbtree_propagate_cost_sse2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, - uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); -void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, - uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); -void x264_mbtree_propagate_cost_fma4( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, - uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); -void x264_mbtree_propagate_cost_avx2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, - uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); +void x264_mbtree_propagate_cost_sse2 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, + uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); +void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, + uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); +void x264_mbtree_propagate_cost_fma4 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, + uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); +void x264_mbtree_propagate_cost_avx2 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, + uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); +void x264_mbtree_propagate_cost_avx512( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, + uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); void x264_mbtree_fix8_pack_ssse3( uint16_t *dst, float *src, int count ); void x264_mbtree_fix8_pack_avx2 ( uint16_t *dst, float *src, int count ); void x264_mbtree_fix8_unpack_ssse3( float *dst, uint16_t *src, int count ); @@ -864,4 +866,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx2; pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_avx2; pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_avx2; + + if( !(cpu&X264_CPU_AVX512) ) + return; + pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx512; } diff --git a/tools/checkasm.c b/tools/checkasm.c index 7575d3f4..7ada2791 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -1743,7 +1743,7 @@ static int check_mc( int cpu_ref, int cpu_new ) { ok &= abs( dstc[j]-dsta[j] ) <= 1 || fabs( (double)dstc[j]/dsta[j]-1 ) < 1e-4; if( !ok ) - fprintf( stderr, "mbtree_propagate_cost FAILED: %f !~= %f\n", (double)dstc[j], (double)dsta[j] ); + fprintf( stderr, "mbtree_propagate_cost FAILED: %d !~= %d\n", dstc[j], dsta[j] ); } } }