From dba0e5a2e089cd675e201cdf4e3358eb7a0e22cc Mon Sep 17 00:00:00 2001
From: Fiona Glaser <fiona@x264.com>
Date: Sun, 15 Jun 2008 11:50:17 -0600
Subject: [PATCH] Cosmetics and loop unrolling

GCC is not very good at loop unrolling in cases where it can perform
constant propagation, so the unrolling unfortunately has to be done manually.

---
 common/cabac.h      |   2 +-
 common/frame.c      | 225 +++++++++++++++++++++-----------------------
 common/macroblock.c |  33 ++++---
 3 files changed, 128 insertions(+), 132 deletions(-)

diff --git a/common/cabac.h b/common/cabac.h
index bfdc5b3f..8289d2b0 100644
--- a/common/cabac.h
+++ b/common/cabac.h
@@ -38,7 +38,7 @@ typedef struct
     uint8_t *p;
     uint8_t *p_end;
 
-    /* aligned for aligned_memcpy starting here */
+    /* aligned for memcpy_aligned starting here */
     DECLARE_ALIGNED_16( int f8_bits_encoded ); // only if using x264_cabac_size_decision()
     
     /* context */
diff --git a/common/frame.c b/common/frame.c
index 99565271..214d4fc7 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -594,7 +594,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
         const int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x;
         const int b_8x8_transform = h->mb.mb_transform_size[mb_xy];
         const int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 1 : 4;
-        int i_edge, i_dir;
+        int i_edge;
 
         int i_pix_y[3] = { 16*mb_y*h->fdec->i_stride[0] + 16*mb_x,
                             8*mb_y*h->fdec->i_stride[1] +  8*mb_x,
@@ -610,125 +610,116 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
 
         /* i_dir == 0 -> vertical edge
          * i_dir == 1 -> horizontal edge */
-        for( i_dir = 0; i_dir < 2; i_dir++ )
-        {
-            int i_start = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));
-            int i_qp, i_qpn;
-
-            for( i_edge = i_start; i_edge < i_edge_end; i_edge++ )
-            {
-                int mbn_xy, mbn_8x8, mbn_4x4;
-                int bS[4];  /* filtering strength */
-
-                if( b_8x8_transform && (i_edge&1) )
-                    continue;
-
-                mbn_xy  = i_edge > 0 ? mb_xy  : ( i_dir == 0 ? mb_xy  - 1 : mb_xy - h->mb.i_mb_stride );
-                mbn_8x8 = i_edge > 0 ? mb_8x8 : ( i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8 );
-                mbn_4x4 = i_edge > 0 ? mb_4x4 : ( i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4 );
-
-                if( b_interlaced && i_edge == 0 && i_dir == 1 )
-                {
-                    mbn_xy -= h->mb.i_mb_stride;
-                    mbn_8x8 -= 2 * s8x8;
-                    mbn_4x4 -= 4 * s4x4;
-                }
-
-                /* *** Get bS for each 4px for the current edge *** */
-                if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy] ) )
-                {
-                    bS[0] = bS[1] = bS[2] = bS[3] = ( i_edge == 0 && !(b_interlaced && i_dir) ? 4 : 3 );
-                }
-                else
-                {
-                    int i;
-                    for( i = 0; i < 4; i++ )
-                    {
-                        int x  = i_dir == 0 ? i_edge : i;
-                        int y  = i_dir == 0 ? i      : i_edge;
-                        int xn = (x - (i_dir == 0 ? 1 : 0 ))&0x03;
-                        int yn = (y - (i_dir == 0 ? 0 : 1 ))&0x03;
-
-                        if( h->mb.non_zero_count[mb_xy][block_idx_xy[x][y]] != 0 ||
-                            h->mb.non_zero_count[mbn_xy][block_idx_xy[xn][yn]] != 0 )
-                        {
-                            bS[i] = 2;
-                        }
-                        else
-                        {
-                            /* FIXME: A given frame may occupy more than one position in
-                             * the reference list. So we should compare the frame numbers,
-                             * not the indices in the ref list.
-                             * No harm yet, as we don't generate that case.*/
-
-                            int i8p= mb_8x8+(x/2)+(y/2)*s8x8;
-                            int i8q= mbn_8x8+(xn/2)+(yn/2)*s8x8;
-                            int i4p= mb_4x4+x+y*s4x4;
-                            int i4q= mbn_4x4+xn+yn*s4x4;
-                            int l;
-
-                            bS[i] = 0;
-
-                            for( l = 0; l < 1 + (h->sh.i_type == SLICE_TYPE_B); l++ )
-                            {
-                                if( h->mb.ref[l][i8p] != h->mb.ref[l][i8q] ||
-                                    abs( h->mb.mv[l][i4p][0] - h->mb.mv[l][i4q][0] ) >= 4 ||
-                                    abs( h->mb.mv[l][i4p][1] - h->mb.mv[l][i4q][1] ) >= mvy_limit )
-                                {
-                                    bS[i] = 1;
-                                    break;
-                                }
-                            }
-                        }
-                    }
-                }
 
-                /* *** filter *** */
-                /* Y plane */
-                i_qp = h->mb.qp[mb_xy];
-                i_qpn= h->mb.qp[mbn_xy];
-
-                if( i_dir == 0 )
-                {
-                    /* vertical edge */
-                    deblock_edge( h, &h->fdec->plane[0][i_pix_y[0] + 4*i_edge],
-                                  i_stride2[0], bS, (i_qp+i_qpn+1) >> 1, 0,
-                                  h->loopf.deblock_h_luma, h->loopf.deblock_h_luma_intra );
-                    if( !(i_edge & 1) )
-                    {
-                        /* U/V planes */
-                        int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +
-                                      i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;
-                        deblock_edge( h, &h->fdec->plane[1][i_pix_y[1] + 2*i_edge],
-                                      i_stride2[1], bS, i_qpc, 1,
-                                      h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra );
-                        deblock_edge( h, &h->fdec->plane[2][i_pix_y[2] + 2*i_edge],
-                                      i_stride2[2], bS, i_qpc, 1,
-                                      h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra );
-                    }
-                }
-                else
-                {
-                    /* horizontal edge */
-                    deblock_edge( h, &h->fdec->plane[0][i_pix_y[0] + 4*i_edge*i_stride2[0]],
-                                  i_stride2[0], bS, (i_qp+i_qpn+1) >> 1, 0,
-                                  h->loopf.deblock_v_luma, h->loopf.deblock_v_luma_intra );
-                    /* U/V planes */
-                    if( !(i_edge & 1) )
-                    {
-                        int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +
-                                      i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;
-                        deblock_edge( h, &h->fdec->plane[1][i_pix_y[1] + 2*i_edge*i_stride2[1]],
-                                      i_stride2[1], bS, i_qpc, 1,
-                                      h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra );
-                        deblock_edge( h, &h->fdec->plane[2][i_pix_y[2] + 2*i_edge*i_stride2[2]],
-                                      i_stride2[2], bS, i_qpc, 1,
-                                      h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra );
-                    }
-                }
-            }
+        #define deblock_dir(i_dir)\
+        {\
+            int i_start = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
+            int i_qp, i_qpn;\
+            for( i_edge = i_start; i_edge < i_edge_end; i_edge++ )\
+            {\
+                int mbn_xy, mbn_8x8, mbn_4x4;\
+                int bS[4];  /* filtering strength */\
+                if( b_8x8_transform && (i_edge&1) )\
+                    continue;\
+                mbn_xy  = i_edge > 0 ? mb_xy  : ( i_dir == 0 ? mb_xy  - 1 : mb_xy - h->mb.i_mb_stride );\
+                mbn_8x8 = i_edge > 0 ? mb_8x8 : ( i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8 );\
+                mbn_4x4 = i_edge > 0 ? mb_4x4 : ( i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4 );\
+                if( b_interlaced && i_edge == 0 && i_dir == 1 )\
+                {\
+                    mbn_xy -= h->mb.i_mb_stride;\
+                    mbn_8x8 -= 2 * s8x8;\
+                    mbn_4x4 -= 4 * s4x4;\
+                }\
+                /* *** Get bS for each 4px for the current edge *** */\
+                if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy] ) )\
+                    bS[0] = bS[1] = bS[2] = bS[3] = ( i_edge == 0 && !(b_interlaced && i_dir) ? 4 : 3 );\
+                else\
+                {\
+                    int i;\
+                    for( i = 0; i < 4; i++ )\
+                    {\
+                        int x  = i_dir == 0 ? i_edge : i;\
+                        int y  = i_dir == 0 ? i      : i_edge;\
+                        int xn = (x - (i_dir == 0 ? 1 : 0 ))&0x03;\
+                        int yn = (y - (i_dir == 0 ? 0 : 1 ))&0x03;\
+                        if( h->mb.non_zero_count[mb_xy][block_idx_xy[x][y]] != 0 ||\
+                            h->mb.non_zero_count[mbn_xy][block_idx_xy[xn][yn]] != 0 )\
+                        {\
+                            bS[i] = 2;\
+                        }\
+                        else\
+                        {\
+                            /* FIXME: A given frame may occupy more than one position in\
+                             * the reference list. So we should compare the frame numbers,\
+                             * not the indices in the ref list.\
+                             * No harm yet, as we don't generate that case.*/\
+                            int i8p= mb_8x8+(x/2)+(y/2)*s8x8;\
+                            int i8q= mbn_8x8+(xn/2)+(yn/2)*s8x8;\
+                            int i4p= mb_4x4+x+y*s4x4;\
+                            int i4q= mbn_4x4+xn+yn*s4x4;\
+                            int l;\
+                            bS[i] = 0;\
+                            for( l = 0; l < 1 + (h->sh.i_type == SLICE_TYPE_B); l++ )\
+                            {\
+                                if( h->mb.ref[l][i8p] != h->mb.ref[l][i8q] ||\
+                                    abs( h->mb.mv[l][i4p][0] - h->mb.mv[l][i4q][0] ) >= 4 ||\
+                                    abs( h->mb.mv[l][i4p][1] - h->mb.mv[l][i4q][1] ) >= mvy_limit )\
+                                {\
+                                    bS[i] = 1;\
+                                    break;\
+                                }\
+                            }\
+                        }\
+                    }\
+                }\
+                /* *** filter *** */\
+                /* Y plane */\
+                i_qp = h->mb.qp[mb_xy];\
+                i_qpn= h->mb.qp[mbn_xy];\
+                if( i_dir == 0 )\
+                {\
+                    /* vertical edge */\
+                    deblock_edge( h, &h->fdec->plane[0][i_pix_y[0] + 4*i_edge],\
+                                  i_stride2[0], bS, (i_qp+i_qpn+1) >> 1, 0,\
+                                  h->loopf.deblock_h_luma, h->loopf.deblock_h_luma_intra );\
+                    if( !(i_edge & 1) )\
+                    {\
+                        /* U/V planes */\
+                        int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +\
+                                      i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;\
+                        deblock_edge( h, &h->fdec->plane[1][i_pix_y[1] + 2*i_edge],\
+                                      i_stride2[1], bS, i_qpc, 1,\
+                                      h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra );\
+                        deblock_edge( h, &h->fdec->plane[2][i_pix_y[2] + 2*i_edge],\
+                                      i_stride2[2], bS, i_qpc, 1,\
+                                      h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra );\
+                    }\
+                }\
+                else\
+                {\
+                    /* horizontal edge */\
+                    deblock_edge( h, &h->fdec->plane[0][i_pix_y[0] + 4*i_edge*i_stride2[0]],\
+                                  i_stride2[0], bS, (i_qp+i_qpn+1) >> 1, 0,\
+                                  h->loopf.deblock_v_luma, h->loopf.deblock_v_luma_intra );\
+                    /* U/V planes */\
+                    if( !(i_edge & 1) )\
+                    {\
+                        int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +\
+                                      i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;\
+                        deblock_edge( h, &h->fdec->plane[1][i_pix_y[1] + 2*i_edge*i_stride2[1]],\
+                                      i_stride2[1], bS, i_qpc, 1,\
+                                      h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra );\
+                        deblock_edge( h, &h->fdec->plane[2][i_pix_y[2] + 2*i_edge*i_stride2[2]],\
+                                      i_stride2[2], bS, i_qpc, 1,\
+                                      h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra );\
+                    }\
+                }\
+            }\
         }
 
+        deblock_dir(0);
+        deblock_dir(1);
+
         /* next mb */
         if( !b_interlaced || (mb_y&1) )
             mb_x++;
diff --git a/common/macroblock.c b/common/macroblock.c
index 182b5e0d..df912ee3 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -467,16 +467,16 @@ void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
 {
     const int x = 2*(idx%2);
     const int y = 2*(idx/2);
-    int l;
     x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
     x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
-    for( l = 0; l < 2; l++ )
-    {
-        *(uint64_t*)h->mb.cache.mv[l][x264_scan8[idx*4]] =
-        *(uint64_t*)h->mb.cache.direct_mv[l][x264_scan8[idx*4]];
-        *(uint64_t*)h->mb.cache.mv[l][x264_scan8[idx*4]+8] =
-        *(uint64_t*)h->mb.cache.direct_mv[l][x264_scan8[idx*4]+8];
-    }
+    *(uint64_t*)h->mb.cache.mv[0][x264_scan8[idx*4]] =
+    *(uint64_t*)h->mb.cache.direct_mv[0][x264_scan8[idx*4]];
+    *(uint64_t*)h->mb.cache.mv[0][x264_scan8[idx*4]+8] =
+    *(uint64_t*)h->mb.cache.direct_mv[0][x264_scan8[idx*4]+8];
+    *(uint64_t*)h->mb.cache.mv[1][x264_scan8[idx*4]] =
+    *(uint64_t*)h->mb.cache.direct_mv[1][x264_scan8[idx*4]];
+    *(uint64_t*)h->mb.cache.mv[1][x264_scan8[idx*4]+8] =
+    *(uint64_t*)h->mb.cache.direct_mv[1][x264_scan8[idx*4]+8];
 }
 
 #define FIXED_SCALE 256
@@ -979,7 +979,7 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int i_mb
     int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
     const uint8_t *intra_fdec = &h->mb.intra_border_backup[i_mb_y & h->sh.b_mbaff][i][i_mb_x*16>>!!i];
     x264_frame_t **fref[2] = { h->fref0, h->fref1 };
-    int j, k, l;
+    int j, k;
     if( h->mb.b_interlaced )
         ref_pix_offset[1] += (1-2*(i_mb_y&1)) * i_stride;
     h->mb.pic.i_stride[i] = i_stride2;
@@ -992,16 +992,21 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int i_mb
         for( j = 0; j < w; j++ )
             h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
     }
-    for( l=0; l<2; l++ )
+    for( j = 0; j < h->mb.pic.i_fref[0]; j++ )
     {
-        for( j=0; j<h->mb.pic.i_fref[l]; j++ )
+        h->mb.pic.p_fref[0][j][i==0 ? 0:i+3] = &fref[0][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]];
+        if( i == 0 )
+            for( k = 1; k < 4; k++ )
+                h->mb.pic.p_fref[0][j][k] = &fref[0][j >> h->mb.b_interlaced]->filtered[k][ref_pix_offset[j&1]];
+    }
+    if( h->sh.i_type == SLICE_TYPE_B )
+        for( j = 0; j < h->mb.pic.i_fref[1]; j++ )
         {
-            h->mb.pic.p_fref[l][j][i==0 ? 0:i+3] = &fref[l][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]];
+            h->mb.pic.p_fref[1][j][i==0 ? 0:i+3] = &fref[1][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]];
             if( i == 0 )
                 for( k = 1; k < 4; k++ )
-                    h->mb.pic.p_fref[l][j][k] = &fref[l][j >> h->mb.b_interlaced]->filtered[k][ref_pix_offset[j&1]];
+                    h->mb.pic.p_fref[1][j][k] = &fref[1][j >> h->mb.b_interlaced]->filtered[k][ref_pix_offset[j&1]];
         }
-    }
 }
 
 void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
-- 
2.50.1