From: Fiona Glaser <fiona@x264.com>
Date: Sat, 26 Apr 2008 03:41:40 +0000 (-0600)
Subject: remove some redundant nnz counts
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=36f80085d73652cbddfeb9de92ec6e41e6b6d34f;p=libx264

Remove some redundant nnz counts.
Move some nnz counts from macroblock_encode to CAVLC when CABAC doesn't need them.
---

diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 20bc52a8..de94536b 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -154,6 +154,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
     {
         /* copy dc coeff */
         dct4x4[0][block_idx_y[i]][block_idx_x[i]] = dct4x4[1+i][0][0];
+        dct4x4[1+i][0][0] = 0;
 
         /* quant/scan/dequant */
         if( h->mb.b_trellis )
@@ -215,6 +216,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
         {
             /* copy dc coeff */
             dct2x2[block_idx_y[i]][block_idx_x[i]] = dct4x4[i][0][0];
+            dct4x4[i][0][0] = 0;
 
             /* no trellis; it doesn't seem to help chroma noticeably */
             h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qscale], h->quant4_bias[CQM_4IC+b_inter][i_qscale] );
@@ -257,7 +259,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
     h->mb.i_cbp_chroma = 0;
     for( i = 0; i < 8; i++ )
     {
-        int nz = array_non_zero_count( h->dct.luma4x4[16+i]+1, 15 );
+        int nz = array_non_zero( h->dct.luma4x4[16+i] );
         h->mb.cache.non_zero_count[x264_scan8[16+i]] = nz;
         h->mb.i_cbp_chroma |= nz;
     }
@@ -321,7 +323,8 @@ void x264_macroblock_encode( x264_t *h )
     int i_qp = h->mb.i_qp;
     int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate;
     int b_force_no_skip = 0;
-    int i;
+    int i,j,idx;
+    uint8_t nnz8x8[4] = {1,1,1,1};
 
     if( h->sh.b_mbaff
         && h->mb.i_mb_xy == h->sh.i_first_mb + h->mb.i_mb_stride
@@ -386,6 +389,8 @@ void x264_macroblock_encode( x264_t *h )
             h->predict_8x8[i_mode]( p_dst, edge );
             x264_mb_encode_i8x8( h, i, i_qp );
         }
+        for( i = 0; i < 4; i++ )
+            nnz8x8[i] = array_non_zero( h->dct.luma8x8[i] );
     }
     else if( h->mb.i_type == I_4x4 )
     {
@@ -413,7 +418,7 @@ void x264_macroblock_encode( x264_t *h )
     }
     else    /* Inter MB */
     {
-        int i8x8, i4x4, idx;
+        int i8x8, i4x4;
         int i_decimate_mb = 0;
 
         /* Motion compensation */
@@ -433,7 +438,6 @@ void x264_macroblock_encode( x264_t *h )
         else if( h->mb.b_transform_8x8 )
         {
             DECLARE_ALIGNED_16( int16_t dct8x8[4][8][8] );
-            int nnz8x8[4] = {1,1,1,1};
             b_decimate &= !h->mb.b_trellis; // 8x8 trellis is inherently optimal decimation
             h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
 
@@ -453,18 +457,14 @@ void x264_macroblock_encode( x264_t *h )
                     int i_decimate_8x8 = x264_mb_decimate_score( h->dct.luma8x8[idx], 64 );
                     i_decimate_mb += i_decimate_8x8;
                     if( i_decimate_8x8 < 4 )
-                    {
-                        memset( h->dct.luma8x8[idx], 0, sizeof( h->dct.luma8x8[idx] ) );
-                        memset( dct8x8[idx], 0, sizeof( dct8x8[idx] ) );
                         nnz8x8[idx] = 0;
-                    }
                 }
                 else
                     nnz8x8[idx] = array_non_zero( dct8x8[idx] );
             }
 
             if( i_decimate_mb < 6 && b_decimate )
-                memset( h->dct.luma8x8, 0, sizeof( h->dct.luma8x8 ) );
+                *(uint32_t*)nnz8x8 = 0;
             else
             {
                 for( idx = 0; idx < 4; idx++ )
@@ -478,7 +478,6 @@ void x264_macroblock_encode( x264_t *h )
         else
         {
             DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
-            int nnz8x8[4] = {1,1,1,1};
             h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
 
             for( i8x8 = 0; i8x8 < 4; i8x8++ )
@@ -507,15 +506,11 @@ void x264_macroblock_encode( x264_t *h )
                 /* decimate this 8x8 block */
                 i_decimate_mb += i_decimate_8x8;
                 if( i_decimate_8x8 < 4 && b_decimate )
-                {
-                    memset( &dct4x4[i8x8*4], 0, 4 * sizeof( *dct4x4 ) );
-                    memset( &h->dct.luma4x4[i8x8*4], 0, 4 * sizeof( *h->dct.luma4x4 ) );
                     nnz8x8[i8x8] = 0;
-                }
             }
 
             if( i_decimate_mb < 6 && b_decimate )
-                memset( h->dct.luma4x4, 0, 16 * sizeof( *h->dct.luma4x4 ) );
+                *(uint32_t*)nnz8x8 = 0;
             else
             {
                 for( i8x8 = 0; i8x8 < 4; i8x8++ )
@@ -546,34 +541,35 @@ void x264_macroblock_encode( x264_t *h )
     {
         for( i = 0; i < 16; i++ )
         {
-            const int nz = array_non_zero_count( h->dct.luma4x4[i]+1, 15 );
+            int nz = array_non_zero( h->dct.luma4x4[i] );
             h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
-            if( nz > 0 )
-                h->mb.i_cbp_luma = 0x0f;
-        }
-    }
-    else if( h->mb.b_transform_8x8 )
-    {
-        /* coded_block_flag is enough for CABAC.
-         * the full non_zero_count is done only in CAVLC. */
-        for( i = 0; i < 4; i++ )
-        {
-            const int nz = array_non_zero( h->dct.luma8x8[i] );
-            int j;
-            for( j = 0; j < 4; j++ )
-                h->mb.cache.non_zero_count[x264_scan8[4*i+j]] = nz;
-            if( nz > 0 )
-                h->mb.i_cbp_luma |= 1 << i;
+            h->mb.i_cbp_luma |= nz;
         }
+        h->mb.i_cbp_luma *= 0xf;
     }
     else
     {
-        for( i = 0; i < 16; i++ )
+        for( i = 0; i < 4; i++)
         {
-            const int nz = array_non_zero_count( h->dct.luma4x4[i], 16 );
-            h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
-            if( nz > 0 )
-                h->mb.i_cbp_luma |= 1 << (i/4);
+            if(!nnz8x8[i])
+                for( j = 0; j < 4; j++ )
+                    h->mb.cache.non_zero_count[x264_scan8[j+i*4]] = 0;
+            else if( h->mb.b_transform_8x8 )
+            {
+                int nz = nnz8x8[i];
+                for( j = 0; j < 4; j++ )
+                    h->mb.cache.non_zero_count[x264_scan8[j+4*i]] = nz;
+                h->mb.i_cbp_luma |= nz << i;
+            }
+            else
+            {
+                for( j = 0; j < 4; j++ )
+                {
+                    int nz = array_non_zero( h->dct.luma4x4[j+i*4] );
+                    h->mb.cache.non_zero_count[x264_scan8[j+i*4]] = nz;
+                    h->mb.i_cbp_luma |= nz << i;
+                }
+            }
         }
     }
 
@@ -657,10 +653,7 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir )
             i_decimate_mb += x264_mb_decimate_score( dctscan, 16 );
 
             if( i_decimate_mb >= 6 )
-            {
-                /* not as P_SKIP */
                 return 0;
-            }
         }
     }
 
@@ -688,11 +681,8 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir )
         dct2x2[1][1] = dct4x4[3][0][0];
         h->dctf.dct2x2dc( dct2x2 );
         h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 );
-        if( dct2x2[0][0] || dct2x2[0][1] || dct2x2[1][0] || dct2x2[1][1]  )
-        {
-            /* can't be */
+        if( *(uint64_t*)dct2x2 )
             return 0;
-        }
 
         /* calculate dct coeffs */
         for( i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
@@ -702,9 +692,7 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir )
 
             i_decimate_mb += x264_mb_decimate_score( dctscan+1, 15 );
             if( i_decimate_mb >= 7 )
-            {
                 return 0;
-            }
         }
     }
 
@@ -852,9 +840,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
         }
     }
 
-    if( nnz8x8 )
-        h->mb.i_cbp_luma |= (1 << i8);
-    else
-        h->mb.i_cbp_luma &= ~(1 << i8);
+    h->mb.i_cbp_luma &= ~(1 << i8);
+    h->mb.i_cbp_luma |= nnz8x8 << i8;
     h->mb.i_cbp_chroma = 0x02;
 }