From: Loren Merritt Date: Sun, 9 Aug 2009 04:00:36 +0000 (+0000) Subject: simd part of x264_macroblock_tree_propagate. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e9ff8c4b1f647135f7b920fad69c616ccb08459a;p=libx264 simd part of x264_macroblock_tree_propagate. 1.6x faster on conroe. --- diff --git a/common/macroblock.c b/common/macroblock.c index 7ed79be1..cd66c717 100644 --- a/common/macroblock.c +++ b/common/macroblock.c @@ -743,7 +743,8 @@ int x264_macroblock_cache_init( x264_t *h ) int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range); int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) * ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t)); - CHECKED_MALLOC( h->scratch_buffer, X264_MAX3( buf_hpel, buf_ssim, buf_tesa ) ); + int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int); + CHECKED_MALLOC( h->scratch_buffer, X264_MAX4( buf_hpel, buf_ssim, buf_tesa, buf_mbtree ) ); return 0; fail: return -1; diff --git a/common/mc.c b/common/mc.c index e5d6cc83..ee769a01 100644 --- a/common/mc.c +++ b/common/mc.c @@ -356,6 +356,33 @@ static void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, } } +#if defined(__GNUC__) && (defined(ARCH_X86) || defined(ARCH_X86_64)) +// gcc isn't smart enough to use the "idiv" instruction +static ALWAYS_INLINE int32_t div_64_32(int64_t x, int32_t y) { + int32_t quotient, remainder; + asm("idiv %4" + :"=a"(quotient), "=d"(remainder) + :"a"((uint32_t)x), "d"((int32_t)(x>>32)), "r"(y) + ); + return quotient; +} +#else +#define div_64_32(x,y) ((x)/(y)) +#endif + +/* Estimate the total amount of influence on future quality that could be had if we + * were to improve the reference samples used to inter predict any given macroblock. 
*/ +static void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, + uint16_t *inter_costs, uint16_t *inv_qscales, int len ) +{ + int i; + for( i=0; i<len; i++ ) + { + int propagate_amount = propagate_in[i] + ((intra_costs[i] * inv_qscales[i] + 128)>>8); + dst[i] = div_64_32((int64_t)propagate_amount * (intra_costs[i] - inter_costs[i]), intra_costs[i]); + } +} + void x264_mc_init( int cpu, x264_mc_functions_t *pf ) { pf->mc_luma = mc_luma; @@ -392,6 +419,8 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf ) pf->integral_init4v = integral_init4v; pf->integral_init8v = integral_init8v; + pf->mbtree_propagate_cost = mbtree_propagate_cost; + #ifdef HAVE_MMX x264_mc_init_mmx( cpu, pf ); #endif diff --git a/common/mc.h b/common/mc.h index 594940f8..556ae838 100644 --- a/common/mc.h +++ b/common/mc.h @@ -74,6 +74,9 @@ typedef struct void (*frame_init_lowres_core)( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, int src_stride, int dst_stride, int width, int height ); + + void (*mbtree_propagate_cost)( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, + uint16_t *inter_costs, uint16_t *inv_qscales, int len ); } x264_mc_functions_t; void x264_mc_init( int cpu, x264_mc_functions_t *pf ); diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index 9ed06fa4..ced38173 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -34,6 +34,7 @@ filt_mul51: times 8 db 1, -5 pw_1: times 8 dw 1 pw_16: times 8 dw 16 pw_32: times 8 dw 32 +pd_128: times 4 dd 128 SECTION .text @@ -1081,3 +1082,43 @@ INIT_XMM FRAME_INIT_LOWRES sse2, 12 %define PALIGNR PALIGNR_SSSE3 FRAME_INIT_LOWRES ssse3, 12 + +;----------------------------------------------------------------------------- +; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, +; uint16_t *inter_costs, uint16_t *inv_qscales, int len ) +;----------------------------------------------------------------------------- +cglobal x264_mbtree_propagate_cost_sse2, 6,6 + shl r5d, 1 + lea r0, [r0+r5*2] + lea r1, [r1+r5] + lea r2, [r2+r5] + 
lea r3, [r3+r5] + lea r4, [r4+r5] + neg r5 + pxor xmm5, xmm5 + movdqa xmm4, [pd_128 GLOBAL] +.loop: + movq xmm2, [r2+r5] ; intra + movq xmm0, [r4+r5] ; invq + punpcklwd xmm2, xmm5 + punpcklwd xmm0, xmm5 + pmaddwd xmm0, xmm2 + paddd xmm0, xmm4 + psrld xmm0, 8 ; intra*invq>>8 + movq xmm1, [r1+r5] ; prop + movq xmm3, [r3+r5] ; inter + punpcklwd xmm1, xmm5 + punpcklwd xmm3, xmm5 + paddd xmm0, xmm1 ; prop + (intra*invq>>8) + cvtdq2ps xmm1, xmm2 ; intra + psubd xmm2, xmm3 ; intra - inter + cvtdq2ps xmm0, xmm0 + cvtdq2ps xmm2, xmm2 + mulps xmm0, xmm2 ; (prop + (intra*invq>>8)) * (intra - inter) + divps xmm0, xmm1 ; / intra + cvttps2dq xmm0, xmm0 ; truncation isn't really desired, but matches the integer implementation + movdqa [r0+r5*2], xmm0 + add r5, 8 + jl .loop + REP_RET + diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index 78bc9638..da28249e 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -74,6 +74,8 @@ extern void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int strid extern void x264_integral_init8v_mmx( uint16_t *sum8, int stride ); extern void x264_integral_init8v_sse2( uint16_t *sum8, int stride ); extern void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride ); +extern void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, + uint16_t *inter_costs, uint16_t *inv_qscales, int len ); #define LOWRES(cpu) \ extern void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\ int src_stride, int dst_stride, int width, int height ); @@ -303,6 +305,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->integral_init4v = x264_integral_init4v_sse2; pf->integral_init8v = x264_integral_init8v_sse2; pf->hpel_filter = x264_hpel_filter_sse2_amd; + pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2; if( cpu&X264_CPU_SSE2_IS_SLOW ) return; diff --git a/encoder/encoder.c b/encoder/encoder.c index 76e8b472..bf5ad598 
100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -648,6 +648,7 @@ static int x264_validate_parameters( x264_t *h ) BOOLIFY( analyse.b_fast_pskip ); BOOLIFY( rc.b_stat_write ); BOOLIFY( rc.b_stat_read ); + BOOLIFY( rc.b_mb_tree ); #undef BOOLIFY return 0; diff --git a/encoder/slicetype.c b/encoder/slicetype.c index 7b7a4d07..da5ae454 100644 --- a/encoder/slicetype.c +++ b/encoder/slicetype.c @@ -406,22 +406,21 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, in int dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0); int i_bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor>>2) : 32; int16_t (*mvs[2])[2] = { frames[b]->lowres_mvs[0][b-p0-1], frames[b]->lowres_mvs[1][p1-b-1] }; + int *buf = h->scratch_buffer; for( h->mb.i_mb_y = 0; h->mb.i_mb_y < h->sps->i_mb_height; h->mb.i_mb_y++ ) { int mb_index = h->mb.i_mb_y*h->mb.i_mb_stride; + h->mc.mbtree_propagate_cost( buf, frames[b]->i_propagate_cost+mb_index, + frames[b]->i_intra_cost+mb_index, frames[b]->lowres_costs[b-p0][p1-b]+mb_index, + frames[b]->i_inv_qscale_factor+mb_index, h->sps->i_mb_width ); for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->sps->i_mb_width; h->mb.i_mb_x++, mb_index++ ) { - int inter_cost = frames[b]->lowres_costs[b-p0][p1-b][mb_index]; - int intra_cost = frames[b]->i_intra_cost[mb_index]; - + int propagate_amount = buf[h->mb.i_mb_x]; /* Don't propagate for an intra block. */ - if( inter_cost < intra_cost ) + if( propagate_amount > 0 ) { int lists_used = frames[b]->lowres_inter_types[b-p0][p1-b][mb_index]; - /* The approximate amount of data that this block contains. */ - int propagate_amount = frames[b]->i_propagate_cost[mb_index] + ((intra_cost * frames[b]->i_inv_qscale_factor[mb_index] + 128)>>8); - propagate_amount = ((uint64_t)propagate_amount*(intra_cost-inter_cost)) / intra_cost; int list; /* Follow the MVs to the previous frame(s). 
*/ for( list = 0; list < 2; list++ ) diff --git a/tools/checkasm.c b/tools/checkasm.c index f599f00c..324d0269 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -960,6 +960,32 @@ static int check_mc( int cpu_ref, int cpu_new ) INTEGRAL_INIT( integral_init8v, 9, sum, stride ); report( "integral init :" ); + if( mc_a.mbtree_propagate_cost != mc_ref.mbtree_propagate_cost ) + { + ok = 1; used_asm = 1; + set_func_name( "mbtree_propagate" ); + int *dsta = (int*)buf3; + int *dstc = dsta+400; + uint16_t *prop = (uint16_t*)buf1; + uint16_t *intra = (uint16_t*)buf4; + uint16_t *inter = intra+400; + uint16_t *qscale = inter+400; + uint16_t *rand = (uint16_t*)buf2; + for( i=0; i<400; i++ ) + { + intra[i] = *rand++ & 0x7fff; + intra[i] += !intra[i]; + inter[i] = *rand++ & 0x7fff; + qscale[i] = *rand++ & 0x7fff; + } + call_c( mc_c.mbtree_propagate_cost, dstc, prop, intra, inter, qscale, 400 ); + call_a( mc_a.mbtree_propagate_cost, dsta, prop, intra, inter, qscale, 400 ); + // I don't care about exact rounding, this is just how close the floating-point implementation happens to be + for( i=0; i<400; i++ ) + ok &= abs(dstc[i]-dsta[i]) <= (abs(dstc[i])>512) || fabs((double)dstc[i]/dsta[i]-1) < 1e-6; + report( "mbtree propagate :" ); + } + return ret; }