From: Fiona Glaser Date: Tue, 24 Jun 2008 18:23:50 +0000 (-0600) Subject: Convert NNZ to raster order and other optimizations X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ec3d09554addbcecb8cf82f3ff33ac737a6f996b;p=libx264 Convert NNZ to raster order and other optimizations Converting NNZ to raster order simplifies a lot of the load/store code and allows more use of write-combining. More use of write-combining throughout load/save code in common/macroblock.c GCC has aliasing issues in the case of stores to 8-bit heap-allocated arrays; dereferencing the pointer once avoids this problem and significantly increases performance. More manual loop unrolling and such. Move all packXtoY functions to macroblock.h so any function can use them. Add pack8to32. Minor optimizations to encoder/macroblock.c --- diff --git a/common/common.h b/common/common.h index 06363941..33b71c40 100644 --- a/common/common.h +++ b/common/common.h @@ -405,7 +405,8 @@ struct x264_t int8_t *type; /* mb type */ int8_t *qp; /* mb qp */ int16_t *cbp; /* mb cbp: 0x0?: luma, 0x?0: chroma, 0x100: luma dc, 0x0200 and 0x0400: chroma dc (all set for PCM)*/ - int8_t (*intra4x4_pred_mode)[7]; /* intra4x4 pred mode. for non I4x4 set to I_PRED_4x4_DC(2) */ + int8_t (*intra4x4_pred_mode)[8]; /* intra4x4 pred mode. for non I4x4 set to I_PRED_4x4_DC(2) */ + /* actually has only 7 entries; set to 8 for write-combining optimizations */ uint8_t (*non_zero_count)[16+4+4]; /* nzc. for I_PCM set to 16 */ int8_t *chroma_pred_mode; /* chroma_pred_mode. cabac only. for non intra I_PRED_CHROMA_DC(0) */ int16_t (*mv[2])[2]; /* mb mv. 
set to 0 for intra mb */ diff --git a/common/frame.c b/common/frame.c index 214d4fc7..0c7a56c5 100644 --- a/common/frame.c +++ b/common/frame.c @@ -306,16 +306,16 @@ void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] ) { uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width; int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width; - int x; + int x, nnz; for( x=0; x<h->sps->i_mb_width; x++ ) { memcpy( buf+x, src+x, 16 ); if( transform[x] ) { - if( src[x][0] ) src[x][0] = 0x01010101; - if( src[x][1] ) src[x][1] = 0x01010101; - if( src[x][2] ) src[x][2] = 0x01010101; - if( src[x][3] ) src[x][3] = 0x01010101; + nnz = src[x][0] | src[x][1]; + src[x][0] = src[x][1] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0); + nnz = src[x][2] | src[x][3]; + src[x][2] = src[x][3] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0); } } } @@ -642,8 +642,8 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) int y = i_dir == 0 ? i : i_edge;\ int xn = (x - (i_dir == 0 ? 1 : 0 ))&0x03;\ int yn = (y - (i_dir == 0 ?
0 : 1 ))&0x03;\ - if( h->mb.non_zero_count[mb_xy][block_idx_xy[x][y]] != 0 ||\ - h->mb.non_zero_count[mbn_xy][block_idx_xy[xn][yn]] != 0 )\ + if( h->mb.non_zero_count[mb_xy][x+y*4] != 0 ||\ + h->mb.non_zero_count[mbn_xy][xn+yn*4] != 0 )\ {\ bS[i] = 2;\ }\ diff --git a/common/macroblock.c b/common/macroblock.c index df912ee3..b08d4303 100644 --- a/common/macroblock.c +++ b/common/macroblock.c @@ -855,7 +855,7 @@ int x264_macroblock_cache_init( x264_t *h ) CHECKED_MALLOC( h->mb.mb_transform_size, i_mb_count * sizeof(int8_t) ); /* 0 -> 3 top(4), 4 -> 6 : left(3) */ - CHECKED_MALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 7 * sizeof(int8_t) ); + CHECKED_MALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 8 * sizeof(int8_t) ); /* all coeffs */ CHECKED_MALLOC( h->mb.non_zero_count, i_mb_count * 24 * sizeof(uint8_t) ); @@ -1045,27 +1045,18 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y ) h->mb.i_neighbour |= MB_TOP; /* load intra4x4 */ - h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] = h->mb.intra4x4_pred_mode[i_top_xy][0]; - h->mb.cache.intra4x4_pred_mode[x264_scan8[1] - 8] = h->mb.intra4x4_pred_mode[i_top_xy][1]; - h->mb.cache.intra4x4_pred_mode[x264_scan8[4] - 8] = h->mb.intra4x4_pred_mode[i_top_xy][2]; - h->mb.cache.intra4x4_pred_mode[x264_scan8[5] - 8] = h->mb.intra4x4_pred_mode[i_top_xy][3]; + *(uint32_t*)&h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] = *(uint32_t*)&h->mb.intra4x4_pred_mode[i_top_xy][0]; /* load non_zero_count */ - h->mb.cache.non_zero_count[x264_scan8[0] - 8] = h->mb.non_zero_count[i_top_xy][10]; - h->mb.cache.non_zero_count[x264_scan8[1] - 8] = h->mb.non_zero_count[i_top_xy][11]; - h->mb.cache.non_zero_count[x264_scan8[4] - 8] = h->mb.non_zero_count[i_top_xy][14]; - h->mb.cache.non_zero_count[x264_scan8[5] - 8] = h->mb.non_zero_count[i_top_xy][15]; - - h->mb.cache.non_zero_count[x264_scan8[16+0] - 8] = h->mb.non_zero_count[i_top_xy][16+2]; - h->mb.cache.non_zero_count[x264_scan8[16+1] - 8] = 
h->mb.non_zero_count[i_top_xy][16+3]; - - h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 8] = h->mb.non_zero_count[i_top_xy][16+4+2]; - h->mb.cache.non_zero_count[x264_scan8[16+4+1] - 8] = h->mb.non_zero_count[i_top_xy][16+4+3]; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0] - 8] = *(uint32_t*)&h->mb.non_zero_count[i_top_xy][12]; + /* shift because x264_scan8[16] is misaligned */ + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16] - 9] = *(uint16_t*)&h->mb.non_zero_count[i_top_xy][18] << 8; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] = *(uint16_t*)&h->mb.non_zero_count[i_top_xy][22] << 8; } else { h->mb.i_mb_type_top = -1; - + /* load intra4x4 */ h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] = h->mb.cache.intra4x4_pred_mode[x264_scan8[1] - 8] = @@ -1081,7 +1072,6 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y ) h->mb.cache.non_zero_count[x264_scan8[16+1] - 8] = h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 8] = h->mb.cache.non_zero_count[x264_scan8[16+4+1] - 8] = 0x80; - } if( i_mb_x > 0 && i_mb_xy > h->sh.i_first_mb ) @@ -1099,9 +1089,9 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y ) h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = h->mb.intra4x4_pred_mode[i_left_xy][3]; /* load non_zero_count */ - h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = h->mb.non_zero_count[i_left_xy][5]; + h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = h->mb.non_zero_count[i_left_xy][3]; h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = h->mb.non_zero_count[i_left_xy][7]; - h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = h->mb.non_zero_count[i_left_xy][13]; + h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = h->mb.non_zero_count[i_left_xy][11]; h->mb.cache.non_zero_count[x264_scan8[10] - 1] = h->mb.non_zero_count[i_left_xy][15]; h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] = h->mb.non_zero_count[i_left_xy][16+1]; @@ -1329,13 +1319,15 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, 
int i_mb_y ) memset( h->mb.cache.skip, 0, X264_SCAN8_SIZE * sizeof( int8_t ) ); if( i_left_type >= 0 ) { - h->mb.cache.skip[x264_scan8[0] - 1] = h->mb.skipbp[i_left_xy] & 0x2; - h->mb.cache.skip[x264_scan8[8] - 1] = h->mb.skipbp[i_left_xy] & 0x8; + uint8_t skipbp = h->mb.skipbp[i_left_xy]; + h->mb.cache.skip[x264_scan8[0] - 1] = skipbp & 0x2; + h->mb.cache.skip[x264_scan8[8] - 1] = skipbp & 0x8; } if( i_top_type >= 0 ) { - h->mb.cache.skip[x264_scan8[0] - 8] = h->mb.skipbp[i_top_xy] & 0x4; - h->mb.cache.skip[x264_scan8[4] - 8] = h->mb.skipbp[i_top_xy] & 0x8; + uint8_t skipbp = h->mb.skipbp[i_top_xy]; + h->mb.cache.skip[x264_scan8[0] - 8] = skipbp & 0x4; + h->mb.cache.skip[x264_scan8[4] - 8] = skipbp & 0x8; } } @@ -1367,6 +1359,19 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y ) h->mb.i_neighbour4[14] = MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT; } +static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int i) +{ + int w = i ? 8 : 16; + int i_stride = h->fdec->i_stride[i]; + int i_stride2 = i_stride << h->mb.b_interlaced; + int i_pix_offset = h->mb.b_interlaced + ? w * (h->mb.i_mb_x + (h->mb.i_mb_y&~1) * i_stride) + (h->mb.i_mb_y&1) * i_stride + : w * (h->mb.i_mb_x + h->mb.i_mb_y * i_stride); + h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( + &h->fdec->plane[i][i_pix_offset], i_stride2, + h->mb.pic.p_fdec[i], FDEC_STRIDE, w ); +} + void x264_macroblock_cache_save( x264_t *h ) { const int i_mb_xy = h->mb.i_mb_xy; @@ -1376,20 +1381,16 @@ void x264_macroblock_cache_save( x264_t *h ) const int i_mb_4x4 = h->mb.i_b4_xy; const int i_mb_8x8 = h->mb.i_b8_xy; - int i; + /* GCC pessimizes direct stores to heap-allocated 8-bit arrays due to aliasing.*/ + /* By only dereferencing them once, we avoid this issue. */ + int8_t *intra4x4_pred_mode = h->mb.intra4x4_pred_mode[i_mb_xy]; + uint8_t *non_zero_count = h->mb.non_zero_count[i_mb_xy]; - for( i = 0; i < 3; i++ ) - { - int w = i ? 
8 : 16; - int i_stride = h->fdec->i_stride[i]; - int i_stride2 = i_stride << h->mb.b_interlaced; - int i_pix_offset = h->mb.b_interlaced - ? w * (h->mb.i_mb_x + (h->mb.i_mb_y&~1) * i_stride) + (h->mb.i_mb_y&1) * i_stride - : w * (h->mb.i_mb_x + h->mb.i_mb_y * i_stride); - h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( - &h->fdec->plane[i][i_pix_offset], i_stride2, - h->mb.pic.p_fdec[i], FDEC_STRIDE, w ); - } + int i, y; + + x264_macroblock_store_pic( h, 0 ); + x264_macroblock_store_pic( h, 1 ); + x264_macroblock_store_pic( h, 2 ); x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y ); @@ -1406,40 +1407,28 @@ void x264_macroblock_cache_save( x264_t *h ) /* save intra4x4 */ if( i_mb_type == I_4x4 ) { - h->mb.intra4x4_pred_mode[i_mb_xy][0] = h->mb.cache.intra4x4_pred_mode[x264_scan8[10] ]; - h->mb.intra4x4_pred_mode[i_mb_xy][1] = h->mb.cache.intra4x4_pred_mode[x264_scan8[11] ]; - h->mb.intra4x4_pred_mode[i_mb_xy][2] = h->mb.cache.intra4x4_pred_mode[x264_scan8[14] ]; - h->mb.intra4x4_pred_mode[i_mb_xy][3] = h->mb.cache.intra4x4_pred_mode[x264_scan8[15] ]; - h->mb.intra4x4_pred_mode[i_mb_xy][4] = h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ]; - h->mb.intra4x4_pred_mode[i_mb_xy][5] = h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ]; - h->mb.intra4x4_pred_mode[i_mb_xy][6] = h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ]; + *(uint32_t*)&intra4x4_pred_mode[0] = *(uint32_t*)&h->mb.cache.intra4x4_pred_mode[x264_scan8[10] ]; + *(uint32_t*)&intra4x4_pred_mode[4] = pack8to32(h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ], + h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ], + h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0); } else - { - h->mb.intra4x4_pred_mode[i_mb_xy][0] = - h->mb.intra4x4_pred_mode[i_mb_xy][1] = - h->mb.intra4x4_pred_mode[i_mb_xy][2] = - h->mb.intra4x4_pred_mode[i_mb_xy][3] = - h->mb.intra4x4_pred_mode[i_mb_xy][4] = - h->mb.intra4x4_pred_mode[i_mb_xy][5] = - h->mb.intra4x4_pred_mode[i_mb_xy][6] = I_PRED_4x4_DC; - } + *(uint64_t*)intra4x4_pred_mode = 
I_PRED_4x4_DC * 0x0101010101010101ULL; if( i_mb_type == I_PCM ) { h->mb.cbp[i_mb_xy] = 0x72f; /* all set */ for( i = 0; i < 16 + 2*4; i++ ) - { - h->mb.non_zero_count[i_mb_xy][i] = 16; - } + non_zero_count[i] = 16; } else { /* save non zero count */ - for( i = 0; i < 16 + 2*4; i++ ) - { - h->mb.non_zero_count[i_mb_xy][i] = h->mb.cache.non_zero_count[x264_scan8[i]]; - } + for( y = 0; y < 4; y++ ) + *(uint32_t*)&non_zero_count[y*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+y*8]; + for( y = 0; y < 4; y++ ) + *(uint16_t*)&non_zero_count[16+y*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+y*2]-1] >> 8; + } if( h->mb.i_cbp_luma == 0 && h->mb.i_type != I_8x8 ) @@ -1448,20 +1437,25 @@ void x264_macroblock_cache_save( x264_t *h ) if( !IS_INTRA( i_mb_type ) ) { - int i_list; - for( i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_B ? 2 : 1 ); i_list++ ) + h->mb.ref[0][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[0][x264_scan8[0]]; + h->mb.ref[0][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[0][x264_scan8[4]]; + h->mb.ref[0][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[0][x264_scan8[8]]; + h->mb.ref[0][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]]; + for( y = 0; y < 4; y++ ) { - int y; - - h->mb.ref[i_list][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[i_list][x264_scan8[0]]; - h->mb.ref[i_list][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[i_list][x264_scan8[4]]; - h->mb.ref[i_list][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[i_list][x264_scan8[8]]; - h->mb.ref[i_list][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[i_list][x264_scan8[12]]; - + *(uint64_t*)h->mb.mv[0][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[0][x264_scan8[0]+8*y+0]; + *(uint64_t*)h->mb.mv[0][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[0][x264_scan8[0]+8*y+2]; + } + if(h->sh.i_type == SLICE_TYPE_B) + { + h->mb.ref[1][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[1][x264_scan8[0]]; + h->mb.ref[1][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[1][x264_scan8[4]]; + h->mb.ref[1][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[1][x264_scan8[8]]; + 
h->mb.ref[1][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]]; for( y = 0; y < 4; y++ ) { - *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[i_list][x264_scan8[0]+8*y+0]; - *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[i_list][x264_scan8[0]+8*y+2]; + *(uint64_t*)h->mb.mv[1][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[1][x264_scan8[0]+8*y+0]; + *(uint64_t*)h->mb.mv[1][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[1][x264_scan8[0]+8*y+2]; } } } @@ -1470,11 +1464,8 @@ void x264_macroblock_cache_save( x264_t *h ) int i_list; for( i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_B ? 2 : 1 ); i_list++ ) { - int y; - *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+0*s8x8] = (uint8_t)(-1) * 0x0101; *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+1*s8x8] = (uint8_t)(-1) * 0x0101; - for( y = 0; y < 4; y++ ) { *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] = 0; @@ -1492,32 +1483,33 @@ void x264_macroblock_cache_save( x264_t *h ) if( !IS_INTRA( i_mb_type ) && !IS_SKIP( i_mb_type ) && !IS_DIRECT( i_mb_type ) ) { - int i_list; - for( i_list = 0; i_list < 2; i_list++ ) + for( y = 0; y < 4; y++ ) { - const int s4x4 = 4 * h->mb.i_mb_stride; - int y; + *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mvd[0][x264_scan8[0]+8*y+0]; + *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mvd[0][x264_scan8[0]+8*y+2]; + } + if( h->sh.i_type == SLICE_TYPE_B ) for( y = 0; y < 4; y++ ) { - *(uint64_t*)h->mb.mvd[i_list][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mvd[i_list][x264_scan8[0]+8*y+0]; - *(uint64_t*)h->mb.mvd[i_list][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mvd[i_list][x264_scan8[0]+8*y+2]; + *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mvd[1][x264_scan8[0]+8*y+0]; + *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mvd[1][x264_scan8[0]+8*y+2]; } - } } else { - int i_list; - for( i_list = 0; i_list < 2; i_list++ ) + for( y = 0; y < 4; y++ ) 
{ - const int s4x4 = 4 * h->mb.i_mb_stride; - int y; + *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+0] = 0; + *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+2] = 0; + } + if( h->sh.i_type == SLICE_TYPE_B ) for( y = 0; y < 4; y++ ) { - *(uint64_t*)h->mb.mvd[i_list][i_mb_4x4+y*s4x4+0] = 0; - *(uint64_t*)h->mb.mvd[i_list][i_mb_4x4+y*s4x4+2] = 0; + *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+0] = 0; + *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+2] = 0; } - } } + if( h->sh.i_type == SLICE_TYPE_B ) { if( i_mb_type == B_SKIP || i_mb_type == B_DIRECT ) diff --git a/common/macroblock.h b/common/macroblock.h index 660978a9..a59b4f76 100644 --- a/common/macroblock.h +++ b/common/macroblock.h @@ -293,6 +293,30 @@ int x264_mb_transform_8x8_allowed( x264_t *h ); void x264_mb_mc( x264_t *h ); void x264_mb_mc_8x8( x264_t *h, int i8 ); +static ALWAYS_INLINE uint32_t pack16to32( int a, int b ) +{ +#ifdef WORDS_BIGENDIAN + return b + (a<<16); +#else + return a + (b<<16); +#endif +} +static ALWAYS_INLINE uint32_t pack8to16( int a, int b ) +{ +#ifdef WORDS_BIGENDIAN + return b + (a<<8); +#else + return a + (b<<8); +#endif +} +static ALWAYS_INLINE uint32_t pack8to32( int a, int b, int c, int d ) +{ +#ifdef WORDS_BIGENDIAN + return d + (c<<8) + (b<<16) + (a<<24); +#else + return a + (b<<8) + (c<<16) + (d<<24); +#endif +} static ALWAYS_INLINE uint32_t pack16to32_mask( int a, int b ) { #ifdef WORDS_BIGENDIAN diff --git a/common/predict.c b/common/predict.c index d2538798..7f7a5f68 100644 --- a/common/predict.c +++ b/common/predict.c @@ -37,24 +37,6 @@ # include "ppc/predict.h" #endif -static ALWAYS_INLINE uint32_t pack16to32( int a, int b ) -{ -#ifdef WORDS_BIGENDIAN - return b + (a<<16); -#else - return a + (b<<16); -#endif -} - -static ALWAYS_INLINE uint32_t pack8to16( int a, int b ) -{ -#ifdef WORDS_BIGENDIAN - return b + (a<<8); -#else - return a + (b<<8); -#endif -} - /**************************************************************************** * 16x16 prediction for intra luma block 
****************************************************************************/ diff --git a/encoder/macroblock.c b/encoder/macroblock.c index 7ac93b11..7d03e41b 100644 --- a/encoder/macroblock.c +++ b/encoder/macroblock.c @@ -549,23 +549,26 @@ void x264_macroblock_encode( x264_t *h ) for( i = 0; i < 4; i++) { if(!nnz8x8[i]) - for( j = 0; j < 4; j++ ) - h->mb.cache.non_zero_count[x264_scan8[j+i*4]] = 0; + { + *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[0+i*4]] = 0; + *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[2+i*4]] = 0; + } else if( h->mb.b_transform_8x8 ) { - int nz = nnz8x8[i]; - for( j = 0; j < 4; j++ ) - h->mb.cache.non_zero_count[x264_scan8[j+4*i]] = nz; - h->mb.i_cbp_luma |= nz << i; + *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[0+4*i]] = nnz8x8[i] * 0x0101; + *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[2+4*i]] = nnz8x8[i] * 0x0101; + h->mb.i_cbp_luma |= nnz8x8[i] << i; } else { + int nz, cbp = 0; for( j = 0; j < 4; j++ ) { - int nz = array_non_zero( h->dct.luma4x4[j+i*4] ); - h->mb.cache.non_zero_count[x264_scan8[j+i*4]] = nz; - h->mb.i_cbp_luma |= nz << i; + nz = array_non_zero( h->dct.luma4x4[j+4*i] ); + h->mb.cache.non_zero_count[x264_scan8[j+4*i]] = nz; + cbp |= nz; } + h->mb.i_cbp_luma |= cbp << i; } } }