From: Fiona Glaser
Date: Sun, 23 Feb 2014 18:36:55 +0000 (-0800)
Subject: Macroblock tree overhaul/optimization
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=b3fb718404d6cce9c82987ea2909cda5072d040c;p=libx264

Macroblock tree overhaul/optimization

Move the second core part of macroblock tree into an assembly function;
SIMD-optimize roughly half of it (for x86).
Roughly 25-65% faster mbtree, depending on content.

Slightly change how mbtree handles the tradeoff between range and precision
for propagation. Overall a slight (but mostly negligible) effect on SSIM
and ~2% faster.
---

diff --git a/common/macroblock.c b/common/macroblock.c
index 8437b729..8494bfe1 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -389,7 +389,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
                        ((me_range*2+24) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
         scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa );
     }
-    int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int);
+    int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int16_t);
     scratch_size = X264_MAX( scratch_size, buf_mbtree );
     if( scratch_size )
         CHECKED_MALLOC( h->scratch_buffer, scratch_size );
@@ -397,7 +397,9 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
         h->scratch_buffer = NULL;
 
     int buf_lookahead_threads = (h->mb.i_mb_height + (4 + 32) * h->param.i_lookahead_threads) * sizeof(int) * 2;
-    CHECKED_MALLOC( h->scratch_buffer2, buf_lookahead_threads );
+    int buf_mbtree2 = buf_mbtree * 12; /* size of the internal propagate_list asm buffer */
+    scratch_size = X264_MAX( buf_lookahead_threads, buf_mbtree2 );
+    CHECKED_MALLOC( h->scratch_buffer2, scratch_size );
 
     return 0;
 fail:
diff --git a/common/mc.c b/common/mc.c
index 71474965..6797f0ac 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -483,20 +483,97 @@ static void frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth, pixel
 
 /* Estimate the total amount of influence on future quality that could be had if we
  * were to improve the reference samples used to inter predict any given macroblock.
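+ * (The per-row result is now stored as int16_t, saturated to 32767, halving the mbtree scratch buffer.)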
  */
-static void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+static void mbtree_propagate_cost( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                    uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
 {
     float fps = *fps_factor;
     for( int i = 0; i < len; i++ )
     {
-        float intra_cost = intra_costs[i] * inv_qscales[i];
-        float propagate_amount = propagate_in[i] + intra_cost*fps;
-        float propagate_num = intra_costs[i] - (inter_costs[i] & LOWRES_COST_MASK);
-        float propagate_denom = intra_costs[i];
-        dst[i] = (int)(propagate_amount * propagate_num / propagate_denom + 0.5f);
+        int intra_cost = intra_costs[i];
+        int inter_cost = X264_MIN(intra_costs[i], inter_costs[i] & LOWRES_COST_MASK);
+        float propagate_intra = intra_cost * inv_qscales[i];
+        float propagate_amount = propagate_in[i] + propagate_intra*fps;
+        float propagate_num = intra_cost - inter_cost;
+        float propagate_denom = intra_cost;
+        dst[i] = X264_MIN((int)(propagate_amount * propagate_num / propagate_denom + 0.5f), 32767);
     }
 }
 
+static void mbtree_propagate_list( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
+                                   int16_t *propagate_amount, uint16_t *lowres_costs,
+                                   int bipred_weight, int mb_y, int len, int list )
+{
+    unsigned stride = h->mb.i_mb_stride;
+    unsigned width = h->mb.i_mb_width;
+    unsigned height = h->mb.i_mb_height;
+
+    for( unsigned i = 0; i < len; i++ )
+    {
+#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
+        int lists_used = lowres_costs[i]>>LOWRES_COST_SHIFT;
+
+        if( !(lists_used & (1 << list)) )
+            continue;
+
+        int listamount = propagate_amount[i];
+        /* Apply bipred weighting. */
+        if( lists_used == 3 )
+            listamount = (listamount * bipred_weight + 32) >> 6;
+
+        /* Early termination for simple case of mv0. */
+        if( !M32( mvs[i] ) )
+        {
+            CLIP_ADD( ref_costs[mb_y*stride + i], listamount );
+            continue;
+        }
+
+        int x = mvs[i][0];
+        int y = mvs[i][1];
+        unsigned mbx = (x>>5)+i;
+        unsigned mby = (y>>5)+mb_y;
+        unsigned idx0 = mbx + mby * stride;
+        unsigned idx2 = idx0 + stride;
+        x &= 31;
+        y &= 31;
+        int idx0weight = (32-y)*(32-x);
+        int idx1weight = (32-y)*x;
+        int idx2weight = y*(32-x);
+        int idx3weight = y*x;
+        idx0weight = (idx0weight * listamount + 512) >> 10;
+        idx1weight = (idx1weight * listamount + 512) >> 10;
+        idx2weight = (idx2weight * listamount + 512) >> 10;
+        idx3weight = (idx3weight * listamount + 512) >> 10;
+
+        if( mbx < width-1 && mby < height-1 )
+        {
+            CLIP_ADD( ref_costs[idx0+0], idx0weight );
+            CLIP_ADD( ref_costs[idx0+1], idx1weight );
+            CLIP_ADD( ref_costs[idx2+0], idx2weight );
+            CLIP_ADD( ref_costs[idx2+1], idx3weight );
+        }
+        else
+        {
+            /* Note: this takes advantage of unsigned representation to
+             * catch negative mbx/mby.
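+             * (A negative value wraps to a huge unsigned number, so the
+             * mbx < width and mby < height tests below reject it for free.)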
+             */
+            if( mby < height )
+            {
+                if( mbx < width )
+                    CLIP_ADD( ref_costs[idx0+0], idx0weight );
+                if( mbx+1 < width )
+                    CLIP_ADD( ref_costs[idx0+1], idx1weight );
+            }
+            if( mby+1 < height )
+            {
+                if( mbx < width )
+                    CLIP_ADD( ref_costs[idx2+0], idx2weight );
+                if( mbx+1 < width )
+                    CLIP_ADD( ref_costs[idx2+1], idx3weight );
+            }
+        }
+    }
+#undef CLIP_ADD
+}
+
 void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
 {
     pf->mc_luma = mc_luma;
@@ -552,6 +629,7 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
     pf->integral_init8v = integral_init8v;
 
     pf->mbtree_propagate_cost = mbtree_propagate_cost;
+    pf->mbtree_propagate_list = mbtree_propagate_list;
 
 #if HAVE_MMX
     x264_mc_init_mmx( cpu, pf );
@@ -565,7 +643,10 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
 #endif
 
     if( cpu_independent )
+    {
         pf->mbtree_propagate_cost = mbtree_propagate_cost;
+        pf->mbtree_propagate_list = mbtree_propagate_list;
+    }
 }
 
 void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
diff --git a/common/mc.h b/common/mc.h
index 054ba60e..1e97499a 100644
--- a/common/mc.h
+++ b/common/mc.h
@@ -122,8 +122,12 @@ typedef struct
     weight_fn_t *offsetsub;
     void (*weight_cache)( x264_t *, x264_weight_t * );
 
-    void (*mbtree_propagate_cost)( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+    void (*mbtree_propagate_cost)( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                    uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+
+    void (*mbtree_propagate_list)( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
+                                   int16_t *propagate_amount, uint16_t *lowres_costs,
+                                   int bipred_weight, int mb_y, int len, int list );
 } x264_mc_functions_t;
 
 void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent );
diff --git a/common/x86/const-a.asm b/common/x86/const-a.asm
index fafe567d..19c55b23 100644
--- a/common/x86/const-a.asm
+++ b/common/x86/const-a.asm
@@ -36,6 +36,7 @@ const pw_32, times 16 dw 32
 const pw_512, times 16 dw 512
 const pw_00ff, times 16 dw 0x00ff
 const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1)
+const pw_0to15, dw 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
 const pd_1, times 8 dd 1
 const deinterleave_shufd, dd 0,4,1,5,2,6,3,7
 const pb_unpackbd1, times 2 db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index cc70ed12..d56ad4f6 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -32,6 +32,7 @@
 
 SECTION_RODATA 32
 
+pw_1024: times 16 dw 1024
 filt_mul20: times 32 db 20
 filt_mul15: times 16 db 1, -5
 filt_mul51: times 16 db -5, 1
@@ -56,8 +57,6 @@ deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
 deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
 %endif ; !HIGH_BIT_DEPTH
 
-pw_1024: times 16 dw 1024
-
 pd_16: times 4 dd 16
 pd_0f: times 4 dd 0xffff
@@ -70,16 +69,22 @@ tap1: times 4 dw 1, -5
 tap2: times 4 dw 20, 20
 tap3: times 4 dw -5, 1
 
+pw_0xc000: times 8 dw 0xc000
+pw_31: times 8 dw 31
+pd_4: times 4 dd 4
+
 SECTION .text
 
 cextern pb_0
 cextern pw_1
+cextern pw_8
 cextern pw_16
 cextern pw_32
 cextern pw_512
 cextern pw_00ff
 cextern pw_3fff
 cextern pw_pixel_max
+cextern pw_0to15
 cextern pd_ffff
 
 %macro LOAD_ADD 4
@@ -1986,7 +1991,7 @@ FRAME_INIT_LOWRES
 cglobal mbtree_propagate_cost, 6,6,7
     movss m6, [r5]
     mov r5d, r6m
-    lea r0, [r0+r5*4]
+    lea r0, [r0+r5*2]
     add r5d, r5d
     add r1, r5
     add r2, r5
@@ -2001,10 +2006,11 @@ cglobal mbtree_propagate_cost, 6,6,7
     movq m0, [r4+r5] ; invq
     movq m3, [r3+r5] ; inter
     movq m1, [r1+r5] ; prop
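+    ; clamp inter cost to intra cost, matching X264_MIN() in the C version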
+    pand m3, m5
+    pminsw m3, m2
     punpcklwd m2, m4
     punpcklwd m0, m4
     pmaddwd m0, m2
-    pand m3, m5
     punpcklwd m1, m4
     punpcklwd m3, m4
 %if cpuflag(fma4)
@@ -2037,7 +2043,8 @@ cglobal mbtree_propagate_cost, 6,6,7
     mulps m0, m3 ; / intra
 %endif
     cvtps2dq m0, m0
-    mova [r0+r5*2], m0
+    packssdw m0, m0
+    movh [r0+r5], m0
     add r5, 8
     jl .loop
     RET
@@ -2060,7 +2067,7 @@ MBTREE
 cglobal mbtree_propagate_cost, 6,6,%1
     vbroadcastss m6, [r5]
     mov r5d, r6m
-    lea r0, [r0+r5*4]
+    lea r0, [r0+r5*2]
     add r5d, r5d
     add r1, r5
     add r2, r5
@@ -2078,6 +2085,7 @@ cglobal mbtree_propagate_cost, 6,6,%1
     pmovzxwd m2, [r1+r5] ; prop
     pand xm3, xm5, [r3+r5] ; inter
     pmovzxwd m3, xm3
+    pminsd m3, m0
    pmaddwd m1, m0
     psubd m4, m0, m3
     cvtdq2ps m0, m0
@@ -2096,6 +2104,7 @@ cglobal mbtree_propagate_cost, 6,6,%1
     movu xm1, [r4+r5]
     movu xm2, [r1+r5]
     pand xm3, xm5, [r3+r5]
+    pminsw xm3, xm0
     INT16_UNPACK 0
     INT16_UNPACK 1
     INT16_UNPACK 2
@@ -2117,7 +2126,9 @@ cglobal mbtree_propagate_cost, 6,6,%1
     mulps m1, m3 ; / intra
 %endif
     vcvtps2dq m1, m1
-    mova [r0+r5*2], m1
+    vextractf128 xm2, m1, 1
+    packssdw xm1, xm2
+    mova [r0+r5], xm1
     add r5, 16
     jl .loop
     RET
@@ -2127,3 +2138,95 @@ INIT_YMM avx
 MBTREE_AVX 8
 INIT_YMM avx2,fma3
 MBTREE_AVX 7
+
+%macro MBTREE_PROPAGATE_LIST 0
+;-----------------------------------------------------------------------------
+; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int16_t *propagate_amount, uint16_t *lowres_costs,
+;                                      int16_t *output, int bipred_weight, int mb_y, int len )
+;-----------------------------------------------------------------------------
+cglobal mbtree_propagate_list_internal, 4,6,8
+    movh m6, [pw_0to15] ; mb_x
+    movd m7, r5m
+    pshuflw m7, m7, 0
+    punpcklwd m6, m7 ; 0 y 1 y 2 y 3 y
+    movd m7, r4m
+    SPLATW m7, m7 ; bipred_weight
+    psllw m7, 9 ; bipred_weight << 9
+
+    mov r5d, r6m
+    xor r4d, r4d
+.loop:
+    mova m3, [r1+r4*2]
+    movu m4, [r2+r4*2]
+    mova m5, [pw_0xc000]
+    pand m4, m5
+    pcmpeqw m4, m5
+    pmulhrsw m5, m3, m7 ; propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
+%if cpuflag(avx)
+    pblendvb m5, m3, m5, m4
+%else
+    pand m5, m4
+    pandn m4, m3
+    por m5, m4 ; if( lists_used == 3 )
+               ;     propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
+%endif
+
+    movu m0, [r0+r4*4] ; x,y
+    movu m1, [r0+r4*4+mmsize]
+
+    psraw m2, m0, 5
+    psraw m3, m1, 5
+    mova m4, [pd_4]
+    paddw m2, m6 ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y}
+    paddw m6, m4 ; {mbx, mby} += {4, 0}
+    paddw m3, m6 ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y}
+    paddw m6, m4 ; {mbx, mby} += {4, 0}
+
+    mova [r3+mmsize*0], m2
+    mova [r3+mmsize*1], m3
+
+    mova m3, [pw_31]
+    pand m0, m3 ; x &= 31
+    pand m1, m3 ; y &= 31
+    packuswb m0, m1
+    psrlw m1, m0, 3
+    pand m0, m3 ; x
+    SWAP 1, 3
+    pandn m1, m3 ; y premultiplied by (1<<5) for later use of pmulhrsw
+
+    mova m3, [pw_32]
+    psubw m3, m0 ; 32 - x
+    mova m4, [pw_1024]
+    psubw m4, m1 ; (32 - y) << 5
+
+    pmullw m2, m3, m4 ; idx0weight = (32-y)*(32-x) << 5
+    pmullw m4, m0 ; idx1weight = (32-y)*x << 5
+    pmullw m0, m1 ; idx3weight = y*x << 5
+    pmullw m1, m3 ; idx2weight = y*(32-x) << 5
+
+    ; avoid overflow in the input to pmulhrsw
+    psrlw m3, m2, 15
+    psubw m2, m3 ; idx0weight -= (idx0weight == 32768)
+
+    pmulhrsw m2, m5 ; idx0weight * propagate_amount + 512 >> 10
+    pmulhrsw m4, m5 ; idx1weight * propagate_amount + 512 >> 10
+    pmulhrsw m1, m5 ; idx2weight * propagate_amount + 512 >> 10
+    pmulhrsw m0, m5 ; idx3weight * propagate_amount + 512 >> 10
+
+    SBUTTERFLY wd, 2, 4, 3
+    SBUTTERFLY wd, 1, 0, 3
+    mova [r3+mmsize*2], m2
+    mova [r3+mmsize*3], m4
+    mova [r3+mmsize*4], m1
+    mova [r3+mmsize*5], m0
+    add r4d, mmsize/2
+    add r3, mmsize*6
+    cmp r4d, r5d
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_XMM ssse3
+MBTREE_PROPAGATE_LIST
+INIT_XMM avx
+MBTREE_PROPAGATE_LIST
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index d38fcc16..9bd990ce 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -161,13 +161,13 @@ void x264_integral_init4v_avx2( uint16_t *sum8, uint16_t *sum4, intptr_t stride
 void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride );
 void x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride );
 void x264_integral_init8v_avx2( uint16_t *sum8, intptr_t stride );
-void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+void x264_mbtree_propagate_cost_sse2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                       uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-void x264_mbtree_propagate_cost_avx ( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                       uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-void x264_mbtree_propagate_cost_fma4( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+void x264_mbtree_propagate_cost_fma4( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                       uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-void x264_mbtree_propagate_cost_avx2_fma3( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+void x264_mbtree_propagate_cost_avx2_fma3( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                            uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
 
 #define MC_CHROMA(cpu)\
@@ -533,6 +533,113 @@ PLANE_INTERLEAVE(sse2)
 PLANE_INTERLEAVE(avx)
 #endif
 
+#if HAVE_X86_INLINE_ASM
+#define CLIP_ADD(s,x)\
+do\
+{\
+    int temp;\
+    asm("movd %0, %%xmm0 \n"\
+        "movd %2, %%xmm1 \n"\
+        "paddsw %%xmm1, %%xmm0 \n"\
+        "movd %%xmm0, %1 \n"\
+        :"+m"(s), "=&r"(temp)\
+        :"m"(x)\
+    );\
+    s = temp;\
+} while(0)
+
+#define CLIP_ADD2(s,x)\
+do\
+{\
+    asm("movd %0, %%xmm0 \n"\
+        "movd %1, %%xmm1 \n"\
+        "paddsw %%xmm1, %%xmm0 \n"\
+        "movd %%xmm0, %0 \n"\
+        :"+m"(M32(s))\
+        :"m"(M32(x))\
+    );\
+} while(0)
+#else
+#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
+#define CLIP_ADD2(s,x)\
+do\
+{\
+    CLIP_ADD((s)[0], (x)[0]);\
+    CLIP_ADD((s)[1], (x)[1]);\
+} while(0)
+#endif
+
+#define PROPAGATE_LIST(cpu)\
+void x264_mbtree_propagate_list_internal_##cpu( int16_t (*mvs)[2], int16_t *propagate_amount,\
+                                                uint16_t *lowres_costs, int16_t *output,\
+                                                int bipred_weight, int mb_y, int len );\
+\
+static void x264_mbtree_propagate_list_##cpu( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],\
+                                              int16_t *propagate_amount, uint16_t *lowres_costs,\
+                                              int bipred_weight, int mb_y, int len, int list )\
+{\
+    int16_t *current = h->scratch_buffer2;\
+\
+    x264_mbtree_propagate_list_internal_##cpu( mvs, propagate_amount, lowres_costs,\
+                                               current, bipred_weight, mb_y, len );\
+\
+    unsigned stride = h->mb.i_mb_stride;\
+    unsigned width = h->mb.i_mb_width;\
+    unsigned height = h->mb.i_mb_height;\
+\
+    for( unsigned i = 0; i < len; current += 32 )\
+    {\
+        int end = X264_MIN( i+8, len );\
+        for( ; i < end; i++, current += 2 )\
+        {\
+            if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) )\
+                continue;\
+\
+            unsigned mbx = current[0];\
+            unsigned mby = current[1];\
+            unsigned idx0 = mbx + mby * stride;\
+            unsigned idx2 = idx0 + stride;\
+\
+            /* Shortcut for the simple/common case of zero MV */\
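+            /* (M32 reads both int16_t MV components as a single 32-bit word.) */\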
+            if( !M32( mvs[i] ) )\
+            {\
+                CLIP_ADD( ref_costs[idx0], current[16] );\
+                continue;\
+            }\
+\
+            if( mbx < width-1 && mby < height-1 )\
+            {\
+                CLIP_ADD2( ref_costs+idx0, current+16 );\
+                CLIP_ADD2( ref_costs+idx2, current+32 );\
+            }\
+            else\
+            {\
+                /* Note: this takes advantage of unsigned representation to\
+                 * catch negative mbx/mby. */\
+                if( mby < height )\
+                {\
+                    if( mbx < width )\
+                        CLIP_ADD( ref_costs[idx0+0], current[16] );\
+                    if( mbx+1 < width )\
+                        CLIP_ADD( ref_costs[idx0+1], current[17] );\
+                }\
+                if( mby+1 < height )\
+                {\
+                    if( mbx < width )\
+                        CLIP_ADD( ref_costs[idx2+0], current[32] );\
+                    if( mbx+1 < width )\
+                        CLIP_ADD( ref_costs[idx2+1], current[33] );\
+                }\
+            }\
+        }\
+    }\
+}
+
+PROPAGATE_LIST(ssse3)
+PROPAGATE_LIST(avx)
+#undef CLIP_ADD
+#undef CLIP_ADD2
+
 void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
 {
     if( !(cpu&X264_CPU_MMX) )
@@ -645,6 +752,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
 
         pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_ssse3;
+        pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3;
 
         if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) )
             pf->integral_init4v = x264_integral_init4v_ssse3;
@@ -748,6 +856,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_ssse3;
         pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_ssse3;
         pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_ssse3;
+        pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3;
 
         if( !(cpu&X264_CPU_SLOW_PSHUFB) )
         {
@@ -824,6 +933,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         return;
     pf->memzero_aligned = x264_memzero_aligned_avx;
     pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
+    pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx;
 
     if( cpu&X264_CPU_FMA4 )
         pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_fma4;
diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
index c0fabfa8..98768442 100644
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -31,7 +31,6 @@
 
 SECTION_RODATA 32
 
-pw_0to15: dw 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 pw_43210123: times 2 dw -3, -2, -1, 0, 1, 2, 3, 4
 pw_m3: times 16 dw -3
 pw_m7: times 16 dw -7
@@ -56,6 +55,7 @@ cextern pw_8
 cextern pw_16
 cextern pw_00ff
 cextern pw_pixel_max
+cextern pw_0to15
 
 %macro STORE8 1
     mova [r0+0*FDEC_STRIDEB], %1
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index b3da7420..0f4d831a 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -1022,9 +1022,12 @@ static int x264_slicetype_frame_cost_recalculate( x264_t *h, x264_frame_t **fram
     return i_score;
 }
 
+/* Trade off precision in mbtree for increased range */
+#define MBTREE_PRECISION 0.5f
+
 static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame, float average_duration, int ref0_distance )
 {
-    int fps_factor = round( CLIP_DURATION(average_duration) / CLIP_DURATION(frame->f_duration) * 256 );
+    int fps_factor = round( CLIP_DURATION(average_duration) / CLIP_DURATION(frame->f_duration) * 256 / MBTREE_PRECISION );
     float weightdelta = 0.0;
     if( ref0_distance && frame->f_weighted_cost_delta[ref0_distance-1] > 0 )
         weightdelta = (1.0 - frame->f_weighted_cost_delta[ref0_distance-1]);
@@ -1051,11 +1054,12 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, fl
     int i_bipred_weight = h->param.analyse.b_weighted_bipred ?
                          64 - (dist_scale_factor>>2) : 32;
     int16_t (*mvs[2])[2] = { frames[b]->lowres_mvs[0][b-p0-1], frames[b]->lowres_mvs[1][p1-b-1] };
     int bipred_weights[2] = {i_bipred_weight, 64 - i_bipred_weight};
-    int *buf = h->scratch_buffer;
+    int16_t *buf = h->scratch_buffer;
     uint16_t *propagate_cost = frames[b]->i_propagate_cost;
+    uint16_t *lowres_costs = frames[b]->lowres_costs[b-p0][p1-b];
 
     x264_emms();
-    float fps_factor = CLIP_DURATION(frames[b]->f_duration) / (CLIP_DURATION(average_duration) * 256.0f);
+    float fps_factor = CLIP_DURATION(frames[b]->f_duration) / (CLIP_DURATION(average_duration) * 256.0f) * MBTREE_PRECISION;
 
     /* For non-reffed frames the source costs are always zero, so just memset one row and re-use it. */
     if( !referenced )
@@ -1065,72 +1069,17 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, fl
     {
         int mb_index = h->mb.i_mb_y*h->mb.i_mb_stride;
         h->mc.mbtree_propagate_cost( buf, propagate_cost,
-            frames[b]->i_intra_cost+mb_index, frames[b]->lowres_costs[b-p0][p1-b]+mb_index,
+            frames[b]->i_intra_cost+mb_index, lowres_costs+mb_index,
             frames[b]->i_inv_qscale_factor+mb_index, &fps_factor, h->mb.i_mb_width );
         if( referenced )
             propagate_cost += h->mb.i_mb_width;
-        for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->mb.i_mb_width; h->mb.i_mb_x++, mb_index++ )
+
+        h->mc.mbtree_propagate_list( h, ref_costs[0], &mvs[0][mb_index], buf, &lowres_costs[mb_index],
+                                     bipred_weights[0], h->mb.i_mb_y, h->mb.i_mb_width, 0 );
+        if( b != p1 )
         {
-            int propagate_amount = buf[h->mb.i_mb_x];
-            /* Don't propagate for an intra block. */
-            if( propagate_amount > 0 )
-            {
-                /* Access width-2 bitfield. */
-                int lists_used = frames[b]->lowres_costs[b-p0][p1-b][mb_index] >> LOWRES_COST_SHIFT;
-                /* Follow the MVs to the previous frame(s). */
-                for( int list = 0; list < 2; list++ )
-                    if( (lists_used >> list)&1 )
-                    {
-#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<16)-1)
-                        int listamount = propagate_amount;
-                        /* Apply bipred weighting. */
-                        if( lists_used == 3 )
-                            listamount = (listamount * bipred_weights[list] + 32) >> 6;
-
-                        /* Early termination for simple case of mv0. */
-                        if( !M32( mvs[list][mb_index] ) )
-                        {
-                            CLIP_ADD( ref_costs[list][mb_index], listamount );
-                            continue;
-                        }
-
-                        int x = mvs[list][mb_index][0];
-                        int y = mvs[list][mb_index][1];
-                        int mbx = (x>>5)+h->mb.i_mb_x;
-                        int mby = (y>>5)+h->mb.i_mb_y;
-                        int idx0 = mbx + mby * h->mb.i_mb_stride;
-                        int idx1 = idx0 + 1;
-                        int idx2 = idx0 + h->mb.i_mb_stride;
-                        int idx3 = idx0 + h->mb.i_mb_stride + 1;
-                        x &= 31;
-                        y &= 31;
-                        int idx0weight = (32-y)*(32-x);
-                        int idx1weight = (32-y)*x;
-                        int idx2weight = y*(32-x);
-                        int idx3weight = y*x;
-
-                        /* We could just clip the MVs, but pixels that lie outside the frame probably shouldn't
-                         * be counted.
-                         */
-                        if( mbx < h->mb.i_mb_width-1 && mby < h->mb.i_mb_height-1 && mbx >= 0 && mby >= 0 )
-                        {
-                            CLIP_ADD( ref_costs[list][idx0], (listamount*idx0weight+512)>>10 );
-                            CLIP_ADD( ref_costs[list][idx1], (listamount*idx1weight+512)>>10 );
-                            CLIP_ADD( ref_costs[list][idx2], (listamount*idx2weight+512)>>10 );
-                            CLIP_ADD( ref_costs[list][idx3], (listamount*idx3weight+512)>>10 );
-                        }
-                        else /* Check offsets individually */
-                        {
-                            if( mbx < h->mb.i_mb_width && mby < h->mb.i_mb_height && mbx >= 0 && mby >= 0 )
-                                CLIP_ADD( ref_costs[list][idx0], (listamount*idx0weight+512)>>10 );
-                            if( mbx+1 < h->mb.i_mb_width && mby < h->mb.i_mb_height && mbx+1 >= 0 && mby >= 0 )
-                                CLIP_ADD( ref_costs[list][idx1], (listamount*idx1weight+512)>>10 );
-                            if( mbx < h->mb.i_mb_width && mby+1 < h->mb.i_mb_height && mbx >= 0 && mby+1 >= 0 )
-                                CLIP_ADD( ref_costs[list][idx2], (listamount*idx2weight+512)>>10 );
-                            if( mbx+1 < h->mb.i_mb_width && mby+1 < h->mb.i_mb_height && mbx+1 >= 0 && mby+1 >= 0 )
-                                CLIP_ADD( ref_costs[list][idx3], (listamount*idx3weight+512)>>10 );
-                        }
-                    }
-            }
+            h->mc.mbtree_propagate_list( h, ref_costs[1], &mvs[1][mb_index], buf, &lowres_costs[mb_index],
+                                         bipred_weights[1], h->mb.i_mb_y, h->mb.i_mb_width, 1 );
         }
     }
diff --git a/tools/checkasm.c b/tools/checkasm.c
index adb73d51..f72b7a00 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1598,16 +1598,17 @@ static int check_mc( int cpu_ref, int cpu_new )
     INTEGRAL_INIT( integral_init8v, 9, sum, stride );
     report( "integral init :" );
 
+    ok = 1; used_asm = 0;
     if( mc_a.mbtree_propagate_cost != mc_ref.mbtree_propagate_cost )
     {
-        ok = 1; used_asm = 1;
+        used_asm = 1;
         x264_emms();
         for( int i = 0; i < 10; i++ )
         {
             float fps_factor = (rand()&65535) / 65535.0f;
-            set_func_name( "mbtree_propagate" );
-            int *dsta = (int*)buf3;
-            int *dstc = dsta+400;
+            set_func_name( "mbtree_propagate_cost" );
+            int16_t *dsta = (int16_t*)buf3;
+            int16_t *dstc = dsta+400;
             uint16_t *prop = (uint16_t*)buf1;
             uint16_t *intra = (uint16_t*)buf4;
             uint16_t *inter = intra+128;
@@ -1629,12 +1630,60 @@ static int check_mc( int cpu_ref, int cpu_new )
             {
                 ok &= abs( dstc[j]-dsta[j] ) <= 1 || fabs( (double)dstc[j]/dsta[j]-1 ) < 1e-4;
                 if( !ok )
-                    fprintf( stderr, "mbtree_propagate FAILED: %f !~= %f\n", (double)dstc[j], (double)dsta[j] );
+                    fprintf( stderr, "mbtree_propagate_cost FAILED: %f !~= %f\n", (double)dstc[j], (double)dsta[j] );
             }
         }
-        report( "mbtree propagate :" );
     }
 
+    if( mc_a.mbtree_propagate_list != mc_ref.mbtree_propagate_list )
+    {
+        used_asm = 1;
+        for( int i = 0; i < 8; i++ )
+        {
+            set_func_name( "mbtree_propagate_list" );
+            x264_t h;
+            int height = 4;
+            int width = 128;
+            int size = width*height;
+            h.mb.i_mb_stride = width;
+            h.mb.i_mb_width = width;
+            h.mb.i_mb_height = height;
+
+            uint16_t *ref_costsc = (uint16_t*)buf3;
+            uint16_t *ref_costsa = (uint16_t*)buf4;
+            int16_t (*mvs)[2] = (int16_t(*)[2])(ref_costsc + size);
+            int16_t *propagate_amount = (int16_t*)(mvs + width);
+            uint16_t *lowres_costs = (uint16_t*)(propagate_amount + width);
+            h.scratch_buffer2 = (uint8_t*)(ref_costsa + size);
+            int bipred_weight = (rand()%63)+1;
+            int list = i&1;
+            for( int j = 0; j < size; j++ )
+                ref_costsc[j] = ref_costsa[j] = rand()&32767;
+            for( int j = 0; j < width; j++ )
+            {
+                static const uint8_t list_dist[2][8] = {{0,1,1,1,1,1,1,1},{1,1,3,3,3,3,3,2}};
+                for( int k = 0; k < 2; k++ )
+                    mvs[j][k] = (rand()&127) - 64;
+                propagate_amount[j] = rand()&32767;
+                lowres_costs[j] = list_dist[list][rand()&7] << LOWRES_COST_SHIFT;
+            }
+
+            call_c1( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs,
+                     propagate_amount, lowres_costs, bipred_weight, 0, width, list );
+            call_a1( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs,
+                     propagate_amount, lowres_costs, bipred_weight, 0, width, list );
+
+            for( int j = 0; j < size && ok; j++ )
+            {
+                ok &= abs(ref_costsa[j] - ref_costsc[j]) <= 1;
+                if( !ok )
+                    fprintf( stderr, "mbtree_propagate_list FAILED at %d: %d !~= %d\n", j, ref_costsc[j], ref_costsa[j] );
+            }
+
+            call_c2( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs,
+                     propagate_amount, lowres_costs, bipred_weight, 0, width, list );
+            call_a2( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs,
+                     propagate_amount, lowres_costs, bipred_weight, 0, width, list );
+        }
+    }
+    report( "mbtree :" );
+
     if( mc_a.memcpy_aligned != mc_ref.memcpy_aligned )
     {
         set_func_name( "memcpy_aligned" );