From d2e8686121a0418f466a0d79ef6a5367e944f940 Mon Sep 17 00:00:00 2001 From: Fiona Glaser Date: Tue, 17 May 2011 14:50:51 -0700 Subject: [PATCH] AVX mbtree_propagate Up to ~20-30% faster than SSE2 on Sandy Bridge. --- common/frame.c | 2 +- common/macroblock.c | 2 +- common/x86/mc-a2.asm | 50 ++++++++++++++++++++++++++++++++++++++++++-- common/x86/mc-c.c | 6 ++++++ tools/checkasm.c | 4 ++-- 5 files changed, 58 insertions(+), 6 deletions(-) diff --git a/common/frame.c b/common/frame.c index 40b90bb4..759e2411 100644 --- a/common/frame.c +++ b/common/frame.c @@ -179,7 +179,7 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ) CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) ); CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) ); } - CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) ); + CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+7) * sizeof(uint16_t) ); for( int j = 0; j <= h->param.i_bframe+1; j++ ) for( int i = 0; i <= h->param.i_bframe+1; i++ ) CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) ); diff --git a/common/macroblock.c b/common/macroblock.c index 891cf999..8ca8eca1 100644 --- a/common/macroblock.c +++ b/common/macroblock.c @@ -342,7 +342,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead ) ((me_range*2+24) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t)); scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa ); } - int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+3)&~3) * sizeof(int); + int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int); scratch_size = X264_MAX( scratch_size, buf_mbtree ); if( scratch_size ) CHECKED_MALLOC( h->scratch_buffer, scratch_size ); diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index eb757858..a448ebae 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -40,7 +40,7 @@ deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15 pd_16: times 4 dd 16 pd_0f: times 4 dd 0xffff -pf_inv256: times 4 dd 0.00390625 +pf_inv256: times 8 dd 0.00390625 pad10: times 8 dw 10*PIXEL_MAX pad20: times 8 dw 20*PIXEL_MAX @@ -1630,7 +1630,7 @@ FRAME_INIT_LOWRES ssse3 ; uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ) ;----------------------------------------------------------------------------- cglobal mbtree_propagate_cost_sse2, 7,7,7 - shl r6d, 1 + add r6d, r6d lea r0, [r0+r6*2] add r1, r6 add r2, r6 @@ -1673,3 +1673,49 @@ cglobal mbtree_propagate_cost_sse2, 7,7,7 jl .loop REP_RET +%macro INT16_TO_FLOAT 1 + vpunpckhwd xmm4, xmm%1, xmm7 + vpunpcklwd xmm%1, xmm7 + vinsertf128 ymm%1, ymm%1, xmm4, 1 + vcvtdq2ps ymm%1, ymm%1 +%endmacro + +; FIXME: align loads/stores to 16 bytes +cglobal mbtree_propagate_cost_avx, 7,7,8 + add r6d, r6d + lea r0, [r0+r6*2] + add r1, r6 + add r2, r6 + add r3, r6 + add r4, r6 + neg r6 + vmovdqa xmm5, [pw_3fff] + vbroadcastss ymm6, [r5] + vmulps ymm6, ymm6, [pf_inv256] + vpxor xmm7, xmm7 +.loop: + vmovdqu xmm0, [r2+r6] ; intra + vmovdqu xmm1, [r4+r6] ; invq + vmovdqu xmm2, [r1+r6] ; prop + vpand xmm3, xmm5, [r3+r6] ; inter + INT16_TO_FLOAT 0 + INT16_TO_FLOAT 1 + INT16_TO_FLOAT 2 + INT16_TO_FLOAT 3 + vmulps ymm1, ymm1, ymm0 + vsubps ymm4, ymm0, ymm3 + vmulps ymm1, ymm1, ymm6 ; intra*invq*fps_factor>>8 + vaddps ymm1, ymm1, ymm2 ; prop + (intra*invq*fps_factor>>8) + vrcpps ymm3, ymm0 ; 1 / intra 1st approximation + vmulps ymm2, ymm0, ymm3 ; intra * (1/intra 1st approx) + vmulps ymm2, ymm2, ymm3 ; intra * (1/intra 1st approx)^2 + vmulps ymm1, ymm1, ymm4 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) + vaddps ymm3, ymm3, ymm3 ; 2 * (1/intra 1st approx) + vsubps ymm3, ymm3, ymm2 ; 2nd approximation for 1/intra + vmulps ymm1, ymm1, ymm3 ; / intra + vcvtps2dq ymm1, ymm1 + vmovdqu [r0+r6*2], ymm1 + add r6, 16 + jl .loop + vzeroupper + RET diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index 09e90c78..0f88be38 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -140,6 +140,8 @@ void x264_integral_init8v_sse2( uint16_t *sum8, int stride ); void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride ); void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); +void x264_mbtree_propagate_cost_avx( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, + uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); #define MC_CHROMA(cpu)\ void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, int i_dst,\ @@ -728,4 +730,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) if( !(cpu&X264_CPU_STACK_MOD4) ) pf->mc_chroma = x264_mc_chroma_avx; #endif // HIGH_BIT_DEPTH + + if( !(cpu&X264_CPU_AVX) ) + return; + pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx; } diff --git a/tools/checkasm.c b/tools/checkasm.c index dc224117..88ae68f7 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -1255,8 +1255,8 @@ static int check_mc( int cpu_ref, int cpu_new ) int *dstc = dsta+400; uint16_t *prop = (uint16_t*)buf1; uint16_t *intra = (uint16_t*)buf4; - uint16_t *inter = intra+100; - uint16_t *qscale = inter+100; + uint16_t *inter = intra+128; + uint16_t *qscale = inter+128; uint16_t *rnd = (uint16_t*)buf2; x264_emms(); for( int j = 0; j < 100; j++ ) -- 2.40.0