From: Fiona Glaser
Date: Mon, 26 Oct 2009 19:53:07 +0000 (-0700)
Subject: Motion compensation optimizations
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=80a3909c1373ceceabed0f41eee366fc7de7cb1b;p=libx264

Motion compensation optimizations

Turning off inlining saves a whole boatload of code size for near-zero
speed cost.
Simplify offset calculation.
Various other optimizations.
---

diff --git a/common/macroblock.c b/common/macroblock.c
index 50867b03..27e010ba 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -462,62 +462,62 @@ static void setup_inverse_delta_pocs( x264_t *h )
     }
 }

-static inline void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int height )
+static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int height )
 {
     const int i8 = x264_scan8[0]+x+8*y;
     const int i_ref = h->mb.cache.ref[0][i8];
-    const int mvx = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
-    int mvy = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
+    const int mvx = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
+    int mvy = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;

     h->mc.mc_luma( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
                    h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0],
-                   mvx + 4*4*x, mvy + 4*4*y, 4*width, 4*height );
+                   mvx, mvy, 4*width, 4*height );

     // chroma is offset if MCing from a field of opposite parity
     if( h->mb.b_interlaced & i_ref )
         mvy += (h->mb.i_mb_y & 1)*4 - 2;

     h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
-                     &h->mb.pic.p_fref[0][i_ref][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+                     h->mb.pic.p_fref[0][i_ref][4], h->mb.pic.i_stride[1],
                      mvx, mvy, 2*width, 2*height );

     h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
-                     &h->mb.pic.p_fref[0][i_ref][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+                     h->mb.pic.p_fref[0][i_ref][5], h->mb.pic.i_stride[2],
                      mvx, mvy, 2*width, 2*height );
 }

-static inline void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height )
+static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height )
 {
     const int i8 = x264_scan8[0]+x+8*y;
     const int i_ref = h->mb.cache.ref[1][i8];
-    const int mvx = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
-    int mvy = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
+    const int mvx = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
+    int mvy = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;

     h->mc.mc_luma( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
                    h->mb.pic.p_fref[1][i_ref], h->mb.pic.i_stride[0],
-                   mvx + 4*4*x, mvy + 4*4*y, 4*width, 4*height );
+                   mvx, mvy, 4*width, 4*height );

     if( h->mb.b_interlaced & i_ref )
         mvy += (h->mb.i_mb_y & 1)*4 - 2;

     h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
-                     &h->mb.pic.p_fref[1][i_ref][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+                     h->mb.pic.p_fref[1][i_ref][4], h->mb.pic.i_stride[1],
                      mvx, mvy, 2*width, 2*height );

     h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
-                     &h->mb.pic.p_fref[1][i_ref][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+                     h->mb.pic.p_fref[1][i_ref][5], h->mb.pic.i_stride[2],
                      mvx, mvy, 2*width, 2*height );
 }

-static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height )
+static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height )
 {
     const int i8 = x264_scan8[0]+x+8*y;
     const int i_ref0 = h->mb.cache.ref[0][i8];
     const int i_ref1 = h->mb.cache.ref[1][i8];
     const int weight = h->mb.bipred_weight[i_ref0][i_ref1];
-    const int mvx0 = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
-    const int mvx1 = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
-    int mvy0 = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
-    int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
+    const int mvx0 = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
+    const int mvx1 = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
+    int mvy0 = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
+    int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
     int i_mode = x264_size2pixel[height][width];
     int i_stride0 = 16, i_stride1 = 16;
     ALIGNED_ARRAY_16( uint8_t, tmp0,[16*16] );
@@ -525,9 +525,9 @@ static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int he
     uint8_t *src0, *src1;

     src0 = h->mc.get_ref( tmp0, &i_stride0, h->mb.pic.p_fref[0][i_ref0], h->mb.pic.i_stride[0],
-                          mvx0 + 4*4*x, mvy0 + 4*4*y, 4*width, 4*height );
+                          mvx0, mvy0, 4*width, 4*height );
     src1 = h->mc.get_ref( tmp1, &i_stride1, h->mb.pic.p_fref[1][i_ref1], h->mb.pic.i_stride[0],
-                          mvx1 + 4*4*x, mvy1 + 4*4*y, 4*width, 4*height );
+                          mvx1, mvy1, 4*width, 4*height );

     h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
                        src0, i_stride0, src1, i_stride1, weight );
@@ -536,14 +536,14 @@ static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int he
     if( h->mb.b_interlaced & i_ref1 )
         mvy1 += (h->mb.i_mb_y & 1)*4 - 2;

-    h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[0][i_ref0][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+    h->mc.mc_chroma( tmp0, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1],
                      mvx0, mvy0, 2*width, 2*height );
-    h->mc.mc_chroma( tmp1, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+    h->mc.mc_chroma( tmp1, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1],
                      mvx1, mvy1, 2*width, 2*height );
     h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );

-    h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[0][i_ref0][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+    h->mc.mc_chroma( tmp0, 16, h->mb.pic.p_fref[0][i_ref0][5], h->mb.pic.i_stride[2],
                      mvx0, mvy0, 2*width, 2*height );
-    h->mc.mc_chroma( tmp1, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+    h->mc.mc_chroma( tmp1, 16, h->mb.pic.p_fref[1][i_ref1][5], h->mb.pic.i_stride[2],
                      mvx1, mvy1, 2*width, 2*height );
     h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
 }
diff --git a/encoder/analyse.c b/encoder/analyse.c
index f3fd473f..0c664dbe 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -1488,20 +1488,23 @@ static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a,

     if( pixel == PIXEL_4x4 )
     {
-        CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][0], 0,0 );
-        CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][1], 2,0 );
-        CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][2], 0,2 );
-        CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][3], 2,2 );
+        x264_me_t *m = a->l0.me4x4[i8x8];
+        CHROMA4x4MC( 2,2, m[0], 0,0 );
+        CHROMA4x4MC( 2,2, m[1], 2,0 );
+        CHROMA4x4MC( 2,2, m[2], 0,2 );
+        CHROMA4x4MC( 2,2, m[3], 2,2 );
     }
     else if( pixel == PIXEL_8x4 )
     {
-        CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][0], 0,0 );
-        CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][1], 0,2 );
+        x264_me_t *m = a->l0.me8x4[i8x8];
+        CHROMA4x4MC( 4,2, m[0], 0,0 );
+        CHROMA4x4MC( 4,2, m[1], 0,2 );
     }
     else
     {
-        CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][0], 0,0 );
-        CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][1], 2,0 );
+        x264_me_t *m = a->l0.me4x8[i8x8];
+        CHROMA4x4MC( 2,4, m[0], 0,0 );
+        CHROMA4x4MC( 2,4, m[1], 2,0 );
     }

     return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
@@ -1645,11 +1648,6 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
     }
 }

-#define WEIGHTED_AVG( size, pix, stride, src1, stride1, src2, stride2 ) \
-{ \
-    h->mc.avg[size]( pix, stride, src1, stride1, src2, stride2, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
-}
-
 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
 {
     ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
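
A note on the offset change above: mc_chroma takes its motion vector in luma
quarter-pel units (chroma is half resolution, so the same value is effectively
an eighth-pel chroma MV), so folding the 4*4*x / 4*4*y block offset into the
clipped MV once covers both the luma call, which previously added the offset
at the call site, and the chroma calls, which previously offset the source
pointer by 2*x / 2*y pixels instead. The standalone sketch below only
illustrates that arithmetic; clip3, the literal MV values and the bounds are
illustrative and not part of the patch. NOINLINE is presumably the usual
wrapper around the compiler's no-inline attribute (e.g. __attribute__((noinline))
on GCC), which is how the inlining is turned off here.

/* Standalone illustration of the offset folding; builds with any C99 compiler. */
#include <stdio.h>

static int clip3( int v, int lo, int hi )
{
    return v < lo ? lo : v > hi ? hi : v;
}

int main( void )
{
    int x = 2, y = 1;               /* block position in 4-pixel units */
    int mv[2] = { 37, -11 };        /* raw MV, luma quarter-pel units  */
    int mv_min[2] = { -512, -512 }, mv_max[2] = { 511, 511 };

    /* Old scheme: clip, then add the block offset at the luma call site
     * and offset the chroma source pointer by 2*x / 2*y pixels. */
    int mvx_old = clip3( mv[0], mv_min[0], mv_max[0] );
    int mvy_old = clip3( mv[1], mv_min[1], mv_max[1] );
    printf( "old luma args:   %d %d\n", mvx_old + 4*4*x, mvy_old + 4*4*y );

    /* New scheme: fold the offset in once. 4*4*x quarter-pel is 2*x chroma
     * pixels (16*x >> 3), so the chroma pointer arithmetic disappears and
     * every consumer takes the same pre-offset MV. */
    int mvx = clip3( mv[0], mv_min[0], mv_max[0] ) + 4*4*x;
    int mvy = clip3( mv[1], mv_min[1], mv_max[1] ) + 4*4*y;
    printf( "new luma args:   %d %d\n", mvx, mvy );
    printf( "new chroma args: %d %d\n", mvx, mvy );
    return 0;
}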