From: Fiona Glaser
Date: Mon, 26 Oct 2009 19:53:07 +0000 (-0700)
Subject: Motion compensation optimizations
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=80a3909c1373ceceabed0f41eee366fc7de7cb1b;p=libx264

Motion compensation optimizations

Turning off inlining saves a whole boatload of code size for near-zero
speed cost.
Simplify offset calculation.
Various other optimizations.
---

diff --git a/common/macroblock.c b/common/macroblock.c
index 50867b03..27e010ba 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -462,62 +462,62 @@ static void setup_inverse_delta_pocs( x264_t *h )
     }
 }

-static inline void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int height )
+static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int height )
 {
     const int i8 = x264_scan8[0]+x+8*y;
     const int i_ref = h->mb.cache.ref[0][i8];
-    const int mvx = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
-    int mvy = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
+    const int mvx = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
+    int mvy = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;

     h->mc.mc_luma( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
                    h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0],
-                   mvx + 4*4*x, mvy + 4*4*y, 4*width, 4*height );
+                   mvx, mvy, 4*width, 4*height );

     // chroma is offset if MCing from a field of opposite parity
     if( h->mb.b_interlaced & i_ref )
         mvy += (h->mb.i_mb_y & 1)*4 - 2;

     h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
-                     &h->mb.pic.p_fref[0][i_ref][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+                     h->mb.pic.p_fref[0][i_ref][4], h->mb.pic.i_stride[1],
                      mvx, mvy, 2*width, 2*height );

     h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
-                     &h->mb.pic.p_fref[0][i_ref][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+                     h->mb.pic.p_fref[0][i_ref][5], h->mb.pic.i_stride[2],
                      mvx, mvy, 2*width, 2*height );
 }

-static inline void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height )
+static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height )
 {
     const int i8 = x264_scan8[0]+x+8*y;
     const int i_ref = h->mb.cache.ref[1][i8];
-    const int mvx = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
-    int mvy = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
+    const int mvx = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
+    int mvy = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;

     h->mc.mc_luma( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
                    h->mb.pic.p_fref[1][i_ref], h->mb.pic.i_stride[0],
-                   mvx + 4*4*x, mvy + 4*4*y, 4*width, 4*height );
+                   mvx, mvy, 4*width, 4*height );

     if( h->mb.b_interlaced & i_ref )
         mvy += (h->mb.i_mb_y & 1)*4 - 2;

     h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
-                     &h->mb.pic.p_fref[1][i_ref][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+                     h->mb.pic.p_fref[1][i_ref][4], h->mb.pic.i_stride[1],
                      mvx, mvy, 2*width, 2*height );

     h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
-                     &h->mb.pic.p_fref[1][i_ref][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+                     h->mb.pic.p_fref[1][i_ref][5], h->mb.pic.i_stride[2],
                      mvx, mvy, 2*width, 2*height );
 }

-static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height )
+static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height )
 {
     const int i8 = x264_scan8[0]+x+8*y;
     const int i_ref0 = h->mb.cache.ref[0][i8];
     const int i_ref1 = h->mb.cache.ref[1][i8];
     const int weight = h->mb.bipred_weight[i_ref0][i_ref1];
-    const int mvx0 = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
-    const int mvx1 = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
-    int mvy0 = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
-    int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
+    const int mvx0 = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
+    const int mvx1 = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
+    int mvy0 = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
+    int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
     int i_mode = x264_size2pixel[height][width];
     int i_stride0 = 16, i_stride1 = 16;
     ALIGNED_ARRAY_16( uint8_t, tmp0,[16*16] );
@@ -525,9 +525,9 @@ static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int he
     uint8_t *src0, *src1;

     src0 = h->mc.get_ref( tmp0, &i_stride0, h->mb.pic.p_fref[0][i_ref0], h->mb.pic.i_stride[0],
-                          mvx0 + 4*4*x, mvy0 + 4*4*y, 4*width, 4*height );
+                          mvx0, mvy0, 4*width, 4*height );
     src1 = h->mc.get_ref( tmp1, &i_stride1, h->mb.pic.p_fref[1][i_ref1], h->mb.pic.i_stride[0],
-                          mvx1 + 4*4*x, mvy1 + 4*4*y, 4*width, 4*height );
+                          mvx1, mvy1, 4*width, 4*height );

     h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
                        src0, i_stride0, src1, i_stride1, weight );
@@ -536,14 +536,14 @@ static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int he
     if( h->mb.b_interlaced & i_ref1 )
         mvy1 += (h->mb.i_mb_y & 1)*4 - 2;

-    h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[0][i_ref0][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+    h->mc.mc_chroma( tmp0, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1],
                      mvx0, mvy0, 2*width, 2*height );
-    h->mc.mc_chroma( tmp1, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+    h->mc.mc_chroma( tmp1, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1],
                      mvx1, mvy1, 2*width, 2*height );
     h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );

-    h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[0][i_ref0][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+    h->mc.mc_chroma( tmp0, 16, h->mb.pic.p_fref[0][i_ref0][5], h->mb.pic.i_stride[2],
                      mvx0, mvy0, 2*width, 2*height );
-    h->mc.mc_chroma( tmp1, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+    h->mc.mc_chroma( tmp1, 16, h->mb.pic.p_fref[1][i_ref1][5], h->mb.pic.i_stride[2],
                      mvx1, mvy1, 2*width, 2*height );
     h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
 }
diff --git a/encoder/analyse.c b/encoder/analyse.c
index f3fd473f..0c664dbe 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -1488,20 +1488,23 @@ static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a,

     if( pixel == PIXEL_4x4 )
     {
-        CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][0], 0,0 );
-        CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][1], 2,0 );
-        CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][2], 0,2 );
-        CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][3], 2,2 );
+        x264_me_t *m = a->l0.me4x4[i8x8];
+        CHROMA4x4MC( 2,2, m[0], 0,0 );
+        CHROMA4x4MC( 2,2, m[1], 2,0 );
+        CHROMA4x4MC( 2,2, m[2], 0,2 );
+        CHROMA4x4MC( 2,2, m[3], 2,2 );
     }
     else if( pixel == PIXEL_8x4 )
     {
-        CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][0], 0,0 );
-        CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][1], 0,2 );
+        x264_me_t *m = a->l0.me8x4[i8x8];
+        CHROMA4x4MC( 4,2, m[0], 0,0 );
+        CHROMA4x4MC( 4,2, m[1], 0,2 );
     }
     else
     {
-        CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][0], 0,0 );
-        CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][1], 2,0 );
+        x264_me_t *m = a->l0.me4x8[i8x8];
+        CHROMA4x4MC( 2,4, m[0], 0,0 );
+        CHROMA4x4MC( 2,4, m[1], 2,0 );
     }

     return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
@@ -1645,11 +1648,6 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
     }
 }

-#define WEIGHTED_AVG( size, pix, stride, src1, stride1, src2, stride2 ) \
-{ \
-    h->mc.avg[size]( pix, stride, src1, stride1, src2, stride2, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
-}
-
 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
 {
     ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
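
A note on the offset change above: mc_chroma takes its motion vector in luma
quarter-pel units (chroma is half resolution, so the same value is effectively
an eighth-pel chroma MV), so folding the 4*4*x / 4*4*y block offset into the
clipped MV once covers both the luma call, which previously added the offset
at the call site, and the chroma calls, which previously offset the source
pointer by 2*x / 2*y pixels instead. The standalone sketch below only
illustrates that arithmetic; clip3, the literal MV values and the bounds are
illustrative and not part of the patch. NOINLINE is presumably the usual
wrapper around the compiler's no-inline attribute (e.g. __attribute__((noinline))
on GCC), which is how the inlining is turned off here.

/* Standalone illustration of the offset folding; builds with any C99 compiler. */
#include <stdio.h>

static int clip3( int v, int lo, int hi )
{
    return v < lo ? lo : v > hi ? hi : v;
}

int main( void )
{
    int x = 2, y = 1;               /* block position in 4-pixel units */
    int mv[2] = { 37, -11 };        /* raw MV, luma quarter-pel units  */
    int mv_min[2] = { -512, -512 }, mv_max[2] = { 511, 511 };

    /* Old scheme: clip, then add the block offset at the luma call site
     * and offset the chroma source pointer by 2*x / 2*y pixels. */
    int mvx_old = clip3( mv[0], mv_min[0], mv_max[0] );
    int mvy_old = clip3( mv[1], mv_min[1], mv_max[1] );
    printf( "old luma args:   %d %d\n", mvx_old + 4*4*x, mvy_old + 4*4*y );

    /* New scheme: fold the offset in once. 4*4*x quarter-pel is 2*x chroma
     * pixels (16*x >> 3), so the chroma pointer arithmetic disappears and
     * every consumer takes the same pre-offset MV. */
    int mvx = clip3( mv[0], mv_min[0], mv_max[0] ) + 4*4*x;
    int mvy = clip3( mv[1], mv_min[1], mv_max[1] ) + 4*4*y;
    printf( "new luma args:   %d %d\n", mvx, mvy );
    printf( "new chroma args: %d %d\n", mvx, mvy );
    return 0;
}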