From 32bd2d645c63c7cf55a2f9b33e39e63144c3e835 Mon Sep 17 00:00:00 2001
From: Fiona Glaser <fiona@x264.com>
Date: Thu, 15 May 2008 05:41:43 -0600
Subject: [PATCH] force unroll macroblock_load_pic_pointers and a few other
 minor optimizations

---
 common/macroblock.c  | 78 ++++++++++++++++++++++----------------------
 encoder/macroblock.c | 20 ++++--------
 2 files changed, 45 insertions(+), 53 deletions(-)
diff --git a/common/macroblock.c b/common/macroblock.c
index a68c8152..cd1f9cc7 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -1011,6 +1011,42 @@ static NOINLINE void copy_column8( uint8_t *dst, uint8_t *src )
         dst[i*FDEC_STRIDE] = src[i*FDEC_STRIDE];
 }
 
+static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int i_mb_x, int i_mb_y, int i) /* set up h->mb.pic pixels and pointers for plane i of macroblock (i_mb_x, i_mb_y); x264_macroblock_cache_load calls it with i = 0, 1, 2 */
+{
+    const int w = (i == 0 ? 16 : 8); /* block width/height in pixels: 16 for plane 0, 8 for planes 1 and 2 */
+    const int i_stride = h->fdec->i_stride[i];
+    const int i_stride2 = i_stride << h->mb.b_interlaced; /* stride is doubled when b_interlaced is 1 */
+    const int i_pix_offset = h->mb.b_interlaced /* offset of this macroblock's first pixel within the plane */
+                           ? w * (i_mb_x + (i_mb_y&~1) * i_stride) + (i_mb_y&1) * i_stride /* interlaced: start at the even row of the macroblock pair, plus one line when i_mb_y is odd */
+                           : w * (i_mb_x + i_mb_y * i_stride);
+    int ref_pix_offset[2] = { i_pix_offset, i_pix_offset }; /* [0] is used for even reference indices j, [1] for odd ones (see ref_pix_offset[j&1] below) */
+    const uint8_t *intra_fdec = &h->mb.intra_border_backup[i_mb_y & h->sh.b_mbaff][i][i_mb_x*16>>!!i]; /* x offset is i_mb_x*16 for plane 0 and i_mb_x*8 for the other planes */
+    x264_frame_t **fref[2] = { h->fref0, h->fref1 };
+    int j, k, l;
+    if( h->mb.b_interlaced )
+        ref_pix_offset[1] += (1-2*(i_mb_y&1)) * i_stride; /* odd reference indices: one line down when i_mb_y is even, one line up when it is odd */
+    h->mb.pic.i_stride[i] = i_stride2;
+    h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE,
+        &h->fenc->plane[i][i_pix_offset], i_stride2, w ); /* presumably copies w rows of the source block from the fenc plane into p_fenc[i] -- confirm against the mc.copy contract */
+    memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 ); /* w*3/2+1 bytes of saved border go to the row above the block, starting one column to its left */
+    if( h->mb.b_interlaced )
+    {
+        const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset];
+        for( j = 0; j < w; j++ ) /* interlaced only: refill the column left of the block (x = -1), one pixel per row, from the fdec plane */
+            h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
+    }
+    for( l=0; l<2; l++ ) /* l selects the reference list: fref[0] is h->fref0, fref[1] is h->fref1 */
+    {
+        for( j=0; j<h->mb.pic.i_fref[l]; j++ )
+        {
+            h->mb.pic.p_fref[l][j][i==0 ? 0:i+3] = &fref[l][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]]; /* slot 0 for plane 0, slots 4 and 5 for planes 1 and 2; frame index is j, or j>>1 when interlaced */
+            if( i == 0 )
+                for( k = 1; k < 4; k++ ) /* plane 0 only: slots 1..3 point into the frame's filtered[1..3] planes at the same offset */
+                    h->mb.pic.p_fref[l][j][k] = &fref[l][j >> h->mb.b_interlaced]->filtered[k][ref_pix_offset[j&1]];
+        }
+    }
+}
+
 void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
 {
     int i_mb_xy = i_mb_y * h->mb.i_mb_stride + i_mb_x;
@@ -1189,45 +1225,9 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
     }
 
     /* load picture pointers */
-    for( i = 0; i < 3; i++ )
-    {
-        const int w = (i == 0 ? 16 : 8);
-        const int i_stride = h->fdec->i_stride[i];
-        const int i_stride2 = i_stride << h->mb.b_interlaced;
-        const int i_pix_offset = h->mb.b_interlaced
-                               ? w * (i_mb_x + (i_mb_y&~1) * i_stride) + (i_mb_y&1) * i_stride
-                               : w * (i_mb_x + i_mb_y * i_stride);
-        int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
-        const uint8_t *intra_fdec = &h->mb.intra_border_backup[i_mb_y & h->sh.b_mbaff][i][i_mb_x*16>>!!i];
-        x264_frame_t **fref[2] = { h->fref0, h->fref1 };
-        int j, k, l;
-
-        if( h->mb.b_interlaced )
-            ref_pix_offset[1] += (1-2*(i_mb_y&1)) * i_stride;
-
-        h->mb.pic.i_stride[i] = i_stride2;
-
-        h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE,
-            &h->fenc->plane[i][i_pix_offset], i_stride2, w );
-        memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
-        if( h->mb.b_interlaced )
-        {
-            const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset];
-            for( j = 0; j < w; j++ )
-                h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
-        }
-
-        for( l=0; l<2; l++ )
-        {
-            for( j=0; j<h->mb.pic.i_fref[l]; j++ )
-            {
-                h->mb.pic.p_fref[l][j][i==0 ? 0:i+3] = &fref[l][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]];
-                if( i == 0 )
-                    for( k = 1; k < 4; k++ )
-                        h->mb.pic.p_fref[l][j][k] = &fref[l][j >> h->mb.b_interlaced]->filtered[k][ref_pix_offset[j&1]];
-            }
-        }
-    }
+    x264_macroblock_load_pic_pointers( h, i_mb_x, i_mb_y, 0 );
+    x264_macroblock_load_pic_pointers( h, i_mb_x, i_mb_y, 1 );
+    x264_macroblock_load_pic_pointers( h, i_mb_x, i_mb_y, 2 );
 
     if( h->fdec->integral )
     {
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 01d09bc6..33547146 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -64,7 +64,7 @@ static int x264_mb_decimate_score( int16_t *dct, int i_max )
     {
         int i_run;
 
-        if( abs( dct[idx--] ) > 1 )
+        if( (unsigned)(dct[idx--] + 1) > 2 )
             return 9;
 
         i_run = 0;
@@ -273,15 +273,9 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
 
 static void x264_macroblock_encode_skip( x264_t *h )
 {
-    int i;
     h->mb.i_cbp_luma = 0x00;
     h->mb.i_cbp_chroma = 0x00;
-
-    for( i = 0; i < 16+8; i++ )
-    {
-        h->mb.cache.non_zero_count[x264_scan8[i]] = 0;
-    }
-
+    memset( h->mb.cache.non_zero_count, 0, X264_SCAN8_SIZE ); /* zeroes the first X264_SCAN8_SIZE bytes of the cache, not just the 16+8 x264_scan8[] slots the removed loop cleared; NOTE(review): assumes 1-byte elements -- confirm against the declaration */
     /* store cbp */
     h->mb.cbp[h->mb.i_mb_xy] = 0;
 }
@@ -500,8 +494,8 @@ void x264_macroblock_encode( x264_t *h )
                         h->quantf.quant_4x4( dct4x4[idx], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
 
                     h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] );
-                    
-                    if( b_decimate )
+
+                    if( b_decimate && i_decimate_8x8 <= 6 )
                         i_decimate_8x8 += x264_mb_decimate_score( h->dct.luma4x4[idx], 16 );
                 }
 
@@ -799,10 +793,8 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
         int i4;
         DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
         h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
-        h->quantf.quant_4x4( dct4x4[0], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
-        h->quantf.quant_4x4( dct4x4[1], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
-        h->quantf.quant_4x4( dct4x4[2], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
-        h->quantf.quant_4x4( dct4x4[3], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
+        for( i4 = 0; i4 < 4; i4++ )
+            h->quantf.quant_4x4( dct4x4[i4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
         for( i4 = 0; i4 < 4; i4++ )
             h->zigzagf.scan_4x4( h->dct.luma4x4[i8*4+i4], dct4x4[i4] );
 
-- 
2.50.1