From: Fiona Glaser <fiona@x264.com>
Date: Sun, 6 Jul 2008 18:59:15 +0000 (-0600)
Subject: Various optimizations and cosmetics
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=c9c7edf3e6fa8fbdd4d7bf2beccb448bdcac9aa4;p=libx264

Various optimizations and cosmetics
Update AUTHORS file with Gabriel and me
Update XCHG macro to work correctly in if statements
Add new lookup tables for block_idx and fdec/fenc addresses
Slightly faster array_non_zero_count_mmx (patch by holger)
Eliminate branch in analyse_intra
Unroll loops in and clean up chroma encode
Convert some for loops to do/while loops for speed improvement
Do explicit write-combining on --me tesa mvsad_t struct
Shrink --me esa zero[] array
Speed up bime by reducing size of visited[][][] array
---

diff --git a/AUTHORS b/AUTHORS
index 4e7a6003..129d0115 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -39,11 +39,21 @@ S: France
 N: Francesco Corriga
 D: VfW
 
+N: Gabriel Bouvigne
+E: gabriel.bouvigne AT joost DOT com
+D: 2pass VBV
+
 N: Guillaume Poirier
 E: gpoirier CHEZ mplayerhq POINT hu
 D: Altivec optimizations
 S: Brittany, France
 
+N: Fiona Glaser
+E: fiona AT x264 DOT com
+D: x86 asm, 1pass VBV, adaptive quantization, inline asm
+D: various speed optimizations, bugfixes
+S: USA
+
 N: Justin Clay
 E: justin_clay AT hotmail DOT com
 C: wheatgerm
diff --git a/common/common.h b/common/common.h
index 376ec1e9..4095b8d7 100644
--- a/common/common.h
+++ b/common/common.h
@@ -33,7 +33,7 @@
 #define X264_MAX3(a,b,c) X264_MAX((a),X264_MAX((b),(c)))
 #define X264_MIN4(a,b,c,d) X264_MIN((a),X264_MIN3((b),(c),(d)))
 #define X264_MAX4(a,b,c,d) X264_MAX((a),X264_MAX3((b),(c),(d)))
-#define XCHG(type,a,b) { type t = a; a = b; b = t; }
+#define XCHG(type,a,b) do{ type t = a; a = b; b = t; } while(0)
 #define FIX8(f) ((int)(f*(1<<8)+.5))
 
 #define CHECKED_MALLOC( var, size )\
diff --git a/common/macroblock.h b/common/macroblock.h
index 14741fc2..9d9b2223 100644
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -219,6 +219,32 @@ static const uint8_t block_idx_xy[4][4] =
     { 4, 6, 12, 14 },
     { 5, 7, 13, 15 }
 };
+static const uint8_t block_idx_xy_1d[16] =
+{
+    0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15
+};
+static const uint8_t block_idx_xy_fenc[16] =
+{
+    0*4 + 0*4*FENC_STRIDE, 1*4 + 0*4*FENC_STRIDE,
+    0*4 + 1*4*FENC_STRIDE, 1*4 + 1*4*FENC_STRIDE,
+    2*4 + 0*4*FENC_STRIDE, 3*4 + 0*4*FENC_STRIDE,
+    2*4 + 1*4*FENC_STRIDE, 3*4 + 1*4*FENC_STRIDE,
+    0*4 + 2*4*FENC_STRIDE, 1*4 + 2*4*FENC_STRIDE,
+    0*4 + 3*4*FENC_STRIDE, 1*4 + 3*4*FENC_STRIDE,
+    2*4 + 2*4*FENC_STRIDE, 3*4 + 2*4*FENC_STRIDE,
+    2*4 + 3*4*FENC_STRIDE, 3*4 + 3*4*FENC_STRIDE
+};
+static const uint16_t block_idx_xy_fdec[16] =
+{
+    0*4 + 0*4*FDEC_STRIDE, 1*4 + 0*4*FDEC_STRIDE,
+    0*4 + 1*4*FDEC_STRIDE, 1*4 + 1*4*FDEC_STRIDE,
+    2*4 + 0*4*FDEC_STRIDE, 3*4 + 0*4*FDEC_STRIDE,
+    2*4 + 1*4*FDEC_STRIDE, 3*4 + 1*4*FDEC_STRIDE,
+    0*4 + 2*4*FDEC_STRIDE, 1*4 + 2*4*FDEC_STRIDE,
+    0*4 + 3*4*FDEC_STRIDE, 1*4 + 3*4*FDEC_STRIDE,
+    2*4 + 2*4*FDEC_STRIDE, 3*4 + 2*4*FDEC_STRIDE,
+    2*4 + 3*4*FDEC_STRIDE, 3*4 + 3*4*FDEC_STRIDE
+};
 
 static const uint8_t i_chroma_qp_table[52] =
 {
diff --git a/common/x86/util.h b/common/x86/util.h
index 2158c373..7d638c27 100644
--- a/common/x86/util.h
+++ b/common/x86/util.h
@@ -77,24 +77,22 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
 #define array_non_zero_count array_non_zero_count_mmx
 static inline int array_non_zero_count_mmx( int16_t *v )
 {
-    static const uint64_t pw_2 = 0x0202020202020202ULL;
     int count;
     asm(
         "pxor     %%mm7,  %%mm7 \n"
         "movq     (%1),   %%mm0 \n"
-        "movq     16(%1), %%mm1 \n"
-        "packsswb 8(%1),  %%mm0 \n"
+        "movq     8(%1),  %%mm1 \n"
+        "packsswb 16(%1), %%mm0 \n"
         "packsswb 24(%1), %%mm1 \n"
         "pcmpeqb  %%mm7,  %%mm0 \n"
         "pcmpeqb  %%mm7,  %%mm1 \n"
         "paddb    %%mm0,  %%mm1 \n"
-        "paddb    %2,     %%mm1 \n"
         "psadbw   %%mm7,  %%mm1 \n"
         "movd     %%mm1,  %0    \n"
         :"=r"(count)
-        :"r"(v), "m"(pw_2)
+        :"r"(v)
     );
-    return count;
+    return (count+0x10)&0xff;
 }
 #undef array_non_zero_int
 #define array_non_zero_int array_non_zero_int_mmx
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 7afa8531..d22412c9 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -606,10 +606,9 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
 
             if( b_merged_satd && i_max == 9 )
             {
-                int satd[3];
+                int satd[9];
                 h->pixf.intra_sa8d_x3_8x8( p_src_by, edge, satd );
-                if( i_pred_mode < 3 )
-                    satd[i_pred_mode] -= 3 * a->i_lambda;
+                satd[i_pred_mode] -= 3 * a->i_lambda;
                 for( i=2; i>=0; i-- )
                 {
                     int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda;
@@ -679,10 +678,8 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
 
         for( idx = 0;; idx++ )
         {
-            int x = block_idx_x[idx];
-            int y = block_idx_y[idx];
-            uint8_t *p_src_by = p_src + 4*x + 4*y*FENC_STRIDE;
-            uint8_t *p_dst_by = p_dst + 4*x + 4*y*FDEC_STRIDE;
+            uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
+            uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
             int i_best = COST_MAX;
             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
 
@@ -694,10 +691,9 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
 
             if( b_merged_satd && i_max >= 6 )
             {
-                int satd[3];
+                int satd[9];
                 h->pixf.intra_satd_x3_4x4( p_src_by, p_dst_by, satd );
-                if( i_pred_mode < 3 )
-                    satd[i_pred_mode] -= 3 * a->i_lambda;
+                satd[i_pred_mode] -= 3 * a->i_lambda;
                 for( i=2; i>=0; i-- )
                     COPY2_IF_LT( i_best, satd[i] + 4 * a->i_lambda,
                                  a->i_predict4x4[idx], i );
@@ -808,16 +804,11 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
         int i_nnz = 0;
         for( idx = 0; idx < 16; idx++ )
         {
-            uint8_t *p_src_by;
-            uint8_t *p_dst_by;
+            uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
             i_best = COST_MAX;
 
             i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
-            x = block_idx_x[idx];
-            y = block_idx_y[idx];
 
-            p_src_by = p_src + 4*x + 4*y*FENC_STRIDE;
-            p_dst_by = p_dst + 4*x + 4*y*FDEC_STRIDE;
             predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
 
             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 3eb29ecc..96c0db53 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -19,13 +19,12 @@
  *
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  *****************************************************************************/
 
 #include "common/common.h"
 #include "macroblock.h"
 
-
 #define ZIG(i,y,x) level[i] = dct[x][y];
 static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[2][2] )
 {
@@ -82,10 +81,8 @@ static int x264_mb_decimate_score( int16_t *dct, int i_max )
 
 void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale )
 {
-    int x = 4 * block_idx_x[idx];
-    int y = 4 * block_idx_y[idx];
-    uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
-    uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
+    uint8_t *p_src = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[idx]];
+    uint8_t *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[idx]];
     DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );
 
     if( h->mb.b_lossless )
@@ -147,10 +144,10 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
     {
         for( i = 0; i < 16; i++ )
         {
-            int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE;
-            int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE;
+            int oe = block_idx_xy_fenc[i];
+            int od = block_idx_xy_fdec[i];
             h->zigzagf.sub_4x4( h->dct.luma4x4[i], p_src+oe, p_dst+od );
-            dct_dc4x4[block_idx_x[i]][block_idx_y[i]] = h->dct.luma4x4[i][0];
+            dct_dc4x4[0][block_idx_xy_1d[i]] = h->dct.luma4x4[i][0];
             h->dct.luma4x4[i][0] = 0;
         }
         h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );
@@ -161,7 +158,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
     for( i = 0; i < 16; i++ )
     {
         /* copy dc coeff */
-        dct_dc4x4[block_idx_y[i]][block_idx_x[i]] = dct4x4[i][0][0];
+        dct_dc4x4[0][block_idx_xy_1d[i]] = dct4x4[i][0][0];
         dct4x4[i][0][0] = 0;
 
         /* quant/scan/dequant */
@@ -186,7 +183,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
     for( i = 0; i < 16; i++ )
     {
         /* copy dc coeff */
-        dct4x4[i][0][0] = dct_dc4x4[block_idx_y[i]][block_idx_x[i]];
+        dct4x4[i][0][0] = dct_dc4x4[0][block_idx_xy_1d[i]];
     }
     /* put pixels to fdec */
     h->dctf.add16x16_idct( p_dst, dct4x4 );
@@ -224,7 +221,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
         for( i = 0; i < 4; i++ )
         {
             /* copy dc coeff */
-            dct2x2[block_idx_y[i]][block_idx_x[i]] = dct4x4[i][0][0];
+            dct2x2[i>>1][i&1] = dct4x4[i][0][0];
             dct4x4[i][0][0] = 0;
 
             /* no trellis; it doesn't seem to help chroma noticeably */
@@ -258,9 +255,10 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
             for( i = 0; i < 4; i++ )
                 h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qscale );
         }
-
-        for( i = 0; i < 4; i++ )
-            dct4x4[i][0][0] = dct2x2[0][i];
+        dct4x4[0][0][0] = dct2x2[0][0];
+        dct4x4[1][0][0] = dct2x2[0][1];
+        dct4x4[2][0][0] = dct2x2[1][0];
+        dct4x4[3][0][0] = dct2x2[1][1];
         h->dctf.add8x8_idct( p_dst, dct4x4 );
     }
 
@@ -408,7 +406,7 @@ void x264_macroblock_encode( x264_t *h )
         }
         for( i = h->mb.i_skip_intra ? 15 : 0 ; i < 16; i++ )
         {
-            uint8_t  *p_dst = &h->mb.pic.p_fdec[0][4 * block_idx_x[i] + 4 * block_idx_y[i] * FDEC_STRIDE];
+            uint8_t  *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i]];
             int      i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
 
             if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
@@ -432,11 +430,9 @@ void x264_macroblock_encode( x264_t *h )
         {
             for( i4x4 = 0; i4x4 < 16; i4x4++ )
             {
-                int x = 4*block_idx_x[i4x4];
-                int y = 4*block_idx_y[i4x4];
                 h->zigzagf.sub_4x4( h->dct.luma4x4[i4x4],
-                                    h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE,
-                                    h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE );
+                                    h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4x4],
+                                    h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4x4] );
             }
         }
         else if( h->mb.b_transform_8x8 )
diff --git a/encoder/me.c b/encoder/me.c
index 5d1a2e6b..d4f3eaa6 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -196,8 +196,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
                 int my = x264_clip3( mvc[i][1], mv_y_min*4, mv_y_max*4 );
                 COST_MV_HPEL( mx, my );
             }
-            i++;
-        } while( i < i_mvc );
+        } while( ++i < i_mvc );
         bmx = ( bpred_mx + 2 ) >> 2;
         bmy = ( bpred_my + 2 ) >> 2;
         COST_MV( bmx, bmy );
@@ -223,8 +222,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
                 my = x264_clip3( my, mv_y_min, mv_y_max );
                 COST_MV( mx, my );
             }
-            i++;
-        } while( i < i_mvc );
+        } while( ++i < i_mvc );
     }
     COST_MV( 0, 0 );
 
@@ -232,14 +230,15 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
     {
     case X264_ME_DIA:
         /* diamond search, radius 1 */
-        for( i = 0; i < i_me_range; i++ )
+        i = 0;
+        do
         {
             DIA1_ITER( bmx, bmy );
             if( (bmx == omx) & (bmy == omy) )
                 break;
             if( !CHECK_MVRANGE(bmx, bmy) )
                 break;
-        }
+        } while( ++i < i_me_range );
         break;
 
     case X264_ME_HEX:
@@ -410,7 +409,9 @@ me_hex2:
 
             /* hexagon grid */
             omx = bmx; omy = bmy;
-            for( i = 1; i <= i_me_range/4; i++ )
+
+            i = 1;
+            do
             {
                 static const int hex4[16][2] = {
                     {-4, 2}, {-4, 1}, {-4, 0}, {-4,-1}, {-4,-2},
@@ -437,7 +438,7 @@ me_hex2:
                     COST_MV_X4(  4*i, 1*i,  4*i, 2*i,  2*i, 3*i,  0*i, 4*i );
                     COST_MV_X4( -2*i, 3*i, -2*i,-3*i,  0*i,-4*i,  2*i,-3*i );
                 }
-            }
+            } while( ++i <= i_me_range/4 );
             if( bmy <= mv_y_max )
                 goto me_hex2;
             break;
@@ -464,7 +465,10 @@ me_hex2:
              * because sum(abs(diff)) >= abs(diff(sum)). */
             const int stride = m->i_stride[0];
             uint16_t *sums_base = m->integral;
-            DECLARE_ALIGNED_16( static uint8_t zero[16*16] );
+            /* Due to a GCC bug on some platforms (win32?), zero[] may not actually be aligned.
+             * Unlike the similar case in ratecontrol.c, this is not a problem: zero[] is not used by any
+             * SSE instructions, so the only loss is a tiny bit of performance. */
+            DECLARE_ALIGNED_16( static uint8_t zero[8*FENC_STRIDE] );
             DECLARE_ALIGNED_16( int enc_dc[4] );
             int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
             int delta = x264_pixel_size[sad_size].w;
@@ -546,7 +550,13 @@ me_hex2:
                     for( i=0; i<nmvsad && mvsads[i].sad <= bsad; i++ );
                     for( j=i; j<nmvsad; j++ )
                         if( mvsads[j].sad <= bsad )
-                            mvsads[i++] = mvsads[j];
+                        {
+                            /* mvsad_t is not guaranteed to be 8 bytes on all archs, so check before using explicit write-combining */
+                            if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
+                                *(uint64_t*)&mvsads[i++] = *(uint64_t*)&mvsads[j];
+                            else
+                                mvsads[i++] = mvsads[j];
+                        }
                     nmvsad = i;
                 }
                 if( nmvsad > limit )
@@ -558,7 +568,12 @@ me_hex2:
                         for( j=i+1; j<nmvsad; j++ )
                             COPY2_IF_LT( bsad, mvsads[j].sad, bj, j );
                         if( bj > i )
-                            XCHG( mvsad_t, mvsads[i], mvsads[bj] );
+                        {
+                            if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
+                                XCHG( uint64_t, *(uint64_t*)&mvsads[i], *(uint64_t*)&mvsads[bj] );
+                            else
+                                XCHG( mvsad_t, mvsads[i], mvsads[bj] );
+                        }
                     }
                     nmvsad = limit;
                 }
@@ -781,12 +796,12 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
     BIME_CACHE(-(a),-(b))
 
 #define COST_BIMV_SATD( m0x, m0y, m1x, m1y ) \
-if( pass == 0 || !visited[(m0x)&7][(m0y)&7][(m1x)&7][(m1y)&7] ) \
+if( pass == 0 || !((visited[(m0x)&7][(m0y)&7][(m1x)&7] & (1<<((m1y)&7)))) ) \
 { \
     int cost; \
     int i0 = 4 + 3*(m0x-om0x) + (m0y-om0y); \
     int i1 = 4 + 3*(m1x-om1x) + (m1y-om1y); \
-    visited[(m0x)&7][(m0y)&7][(m1x)&7][(m1y)&7] = 1; \
+    visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7));\
     h->mc.memcpy_aligned( pix, pix0[i0], bs ); \
     if( i_weight == 32 ) \
         h->mc.avg[i_pixel]( pix, bw, pix1[i1], bw ); \
@@ -837,7 +852,8 @@ int x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight
     int bm1y = m1->mv[1], om1y = bm1y;
     int bcost = COST_MAX;
     int pass = 0;
-    uint8_t visited[8][8][8][8];
+    /* each byte of visited is a bitmask over the 8 possible (m1y&7) positions, so a 4D array isn't needed */
+    uint8_t visited[8][8][8];
     h->mc.memzero_aligned( visited, sizeof(visited) );
 
     BIME_CACHE( 0, 0 );
@@ -898,8 +914,7 @@ int x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight
     if( satd <= bsatd * SATD_THRESH )\
     { \
         int cost; \
-        cache_mv[0] = cache_mv2[0] = mx; \
-        cache_mv[1] = cache_mv2[1] = my; \
+        *(uint32_t*)cache_mv = *(uint32_t*)cache_mv2 = pack16to32_mask(mx,my); \
         cost = x264_rd_cost_part( h, i_lambda2, i8, m->i_pixel ); \
         COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \
     } \
@@ -937,7 +952,7 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i8 )
     p_cost_mvx = m->p_cost_mv - pmx;
     p_cost_mvy = m->p_cost_mv - pmy;
     COST_MV_SATD( bmx, bmy, bsatd );
-    COST_MV_RD( bmx, bmy, 0, 0, 0);
+    COST_MV_RD( bmx, bmy, 0, 0, 0 );
 
     /* check the predicted mv */
     if( (bmx != pmx || bmy != pmy)