granicus.if.org Git - libx264/commitdiff
Convert NNZ to raster order and other optimizations
author Fiona Glaser <fiona@x264.com>
Tue, 24 Jun 2008 18:23:50 +0000 (12:23 -0600)
committer Fiona Glaser <fiona@x264.com>
Tue, 24 Jun 2008 18:23:50 +0000 (12:23 -0600)
Converting NNZ to raster order simplifies a lot of the load/store code and allows more use of write-combining.
More use of write-combining throughout load/save code in common/macroblock.c
GCC has aliasing issues in the case of stores to 8-bit heap-allocated arrays; dereferencing the pointer once avoids this problem and significantly increases performance.
More manual loop unrolling and such.
Move all packXtoY functions to macroblock.h so any function can use them.
Add pack8to32.
Minor optimizations to encoder/macroblock.c

common/common.h
common/frame.c
common/macroblock.c
common/macroblock.h
common/predict.c
encoder/macroblock.c

index 0636394164aee0c615ce93da7ce44ecb4927dbe0..33b71c40c77c6acf1aa2b1aa0bc2b328c603d584 100644 (file)
@@ -405,7 +405,8 @@ struct x264_t
         int8_t  *type;                      /* mb type */
         int8_t  *qp;                        /* mb qp */
         int16_t *cbp;                       /* mb cbp: 0x0?: luma, 0x?0: chroma, 0x100: luma dc, 0x0200 and 0x0400: chroma dc  (all set for PCM)*/
-        int8_t  (*intra4x4_pred_mode)[7];   /* intra4x4 pred mode. for non I4x4 set to I_PRED_4x4_DC(2) */
+        int8_t  (*intra4x4_pred_mode)[8];   /* intra4x4 pred mode. for non I4x4 set to I_PRED_4x4_DC(2) */
+                                            /* actually has only 7 entries; set to 8 for write-combining optimizations */
         uint8_t (*non_zero_count)[16+4+4];  /* nzc. for I_PCM set to 16 */
         int8_t  *chroma_pred_mode;          /* chroma_pred_mode. cabac only. for non intra I_PRED_CHROMA_DC(0) */
         int16_t (*mv[2])[2];                /* mb mv. set to 0 for intra mb */
index 214d4fc7f007fad4483c67e84eaece23bb9f23da..0c7a56c5b17d581e4160963a1e60a8ccc269a5a7 100644 (file)
@@ -306,16 +306,16 @@ void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
 {
     uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
     int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width;
-    int x;
+    int x, nnz;
     for( x=0; x<h->sps->i_mb_width; x++ )
     {
         memcpy( buf+x, src+x, 16 );
         if( transform[x] )
         {
-            if( src[x][0] ) src[x][0] = 0x01010101;
-            if( src[x][1] ) src[x][1] = 0x01010101;
-            if( src[x][2] ) src[x][2] = 0x01010101;
-            if( src[x][3] ) src[x][3] = 0x01010101;
+            nnz = src[x][0] | src[x][1];
+            src[x][0] = src[x][1] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
+            nnz = src[x][2] | src[x][3];
+            src[x][2] = src[x][3] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
         }
     }
 }
@@ -642,8 +642,8 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
                         int y  = i_dir == 0 ? i      : i_edge;\
                         int xn = (x - (i_dir == 0 ? 1 : 0 ))&0x03;\
                         int yn = (y - (i_dir == 0 ? 0 : 1 ))&0x03;\
-                        if( h->mb.non_zero_count[mb_xy][block_idx_xy[x][y]] != 0 ||\
-                            h->mb.non_zero_count[mbn_xy][block_idx_xy[xn][yn]] != 0 )\
+                        if( h->mb.non_zero_count[mb_xy][x+y*4] != 0 ||\
+                            h->mb.non_zero_count[mbn_xy][xn+yn*4] != 0 )\
                         {\
                             bS[i] = 2;\
                         }\
index df912ee302dbbb2aa888f85ade588b811a920e60..b08d4303bb6bd2d4ec9c5b8032fd57e29a4a160e 100644 (file)
@@ -855,7 +855,7 @@ int x264_macroblock_cache_init( x264_t *h )
     CHECKED_MALLOC( h->mb.mb_transform_size, i_mb_count * sizeof(int8_t) );
 
     /* 0 -> 3 top(4), 4 -> 6 : left(3) */
-    CHECKED_MALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 7 * sizeof(int8_t) );
+    CHECKED_MALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 8 * sizeof(int8_t) );
 
     /* all coeffs */
     CHECKED_MALLOC( h->mb.non_zero_count, i_mb_count * 24 * sizeof(uint8_t) );
@@ -1045,27 +1045,18 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
         h->mb.i_neighbour |= MB_TOP;
 
         /* load intra4x4 */
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] = h->mb.intra4x4_pred_mode[i_top_xy][0];
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[1] - 8] = h->mb.intra4x4_pred_mode[i_top_xy][1];
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[4] - 8] = h->mb.intra4x4_pred_mode[i_top_xy][2];
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[5] - 8] = h->mb.intra4x4_pred_mode[i_top_xy][3];
+        *(uint32_t*)&h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] = *(uint32_t*)&h->mb.intra4x4_pred_mode[i_top_xy][0];
 
         /* load non_zero_count */
-        h->mb.cache.non_zero_count[x264_scan8[0] - 8] = h->mb.non_zero_count[i_top_xy][10];
-        h->mb.cache.non_zero_count[x264_scan8[1] - 8] = h->mb.non_zero_count[i_top_xy][11];
-        h->mb.cache.non_zero_count[x264_scan8[4] - 8] = h->mb.non_zero_count[i_top_xy][14];
-        h->mb.cache.non_zero_count[x264_scan8[5] - 8] = h->mb.non_zero_count[i_top_xy][15];
-
-        h->mb.cache.non_zero_count[x264_scan8[16+0] - 8] = h->mb.non_zero_count[i_top_xy][16+2];
-        h->mb.cache.non_zero_count[x264_scan8[16+1] - 8] = h->mb.non_zero_count[i_top_xy][16+3];
-
-        h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 8] = h->mb.non_zero_count[i_top_xy][16+4+2];
-        h->mb.cache.non_zero_count[x264_scan8[16+4+1] - 8] = h->mb.non_zero_count[i_top_xy][16+4+3];
+        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0] - 8] = *(uint32_t*)&h->mb.non_zero_count[i_top_xy][12];
+        /* shift because x264_scan8[16] is misaligned */
+        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16] - 9] = *(uint16_t*)&h->mb.non_zero_count[i_top_xy][18] << 8;
+        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] = *(uint16_t*)&h->mb.non_zero_count[i_top_xy][22] << 8;
     }
     else
     {
         h->mb.i_mb_type_top = -1;
-        
+
         /* load intra4x4 */
         h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] =
         h->mb.cache.intra4x4_pred_mode[x264_scan8[1] - 8] =
@@ -1081,7 +1072,6 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
         h->mb.cache.non_zero_count[x264_scan8[16+1] - 8] =
         h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 8] =
         h->mb.cache.non_zero_count[x264_scan8[16+4+1] - 8] = 0x80;
-
     }
 
     if( i_mb_x > 0 && i_mb_xy > h->sh.i_first_mb )
@@ -1099,9 +1089,9 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
         h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = h->mb.intra4x4_pred_mode[i_left_xy][3];
 
         /* load non_zero_count */
-        h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = h->mb.non_zero_count[i_left_xy][5];
+        h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = h->mb.non_zero_count[i_left_xy][3];
         h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = h->mb.non_zero_count[i_left_xy][7];
-        h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = h->mb.non_zero_count[i_left_xy][13];
+        h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = h->mb.non_zero_count[i_left_xy][11];
         h->mb.cache.non_zero_count[x264_scan8[10] - 1] = h->mb.non_zero_count[i_left_xy][15];
 
         h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] = h->mb.non_zero_count[i_left_xy][16+1];
@@ -1329,13 +1319,15 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
             memset( h->mb.cache.skip, 0, X264_SCAN8_SIZE * sizeof( int8_t ) );
             if( i_left_type >= 0 )
             {
-                h->mb.cache.skip[x264_scan8[0] - 1] = h->mb.skipbp[i_left_xy] & 0x2;
-                h->mb.cache.skip[x264_scan8[8] - 1] = h->mb.skipbp[i_left_xy] & 0x8;
+                uint8_t skipbp = h->mb.skipbp[i_left_xy];
+                h->mb.cache.skip[x264_scan8[0] - 1] = skipbp & 0x2;
+                h->mb.cache.skip[x264_scan8[8] - 1] = skipbp & 0x8;
             }
             if( i_top_type >= 0 )
             {
-                h->mb.cache.skip[x264_scan8[0] - 8] = h->mb.skipbp[i_top_xy] & 0x4;
-                h->mb.cache.skip[x264_scan8[4] - 8] = h->mb.skipbp[i_top_xy] & 0x8;
+                uint8_t skipbp = h->mb.skipbp[i_top_xy];
+                h->mb.cache.skip[x264_scan8[0] - 8] = skipbp & 0x4;
+                h->mb.cache.skip[x264_scan8[4] - 8] = skipbp & 0x8;
             }
         }
 
@@ -1367,6 +1359,19 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
     h->mb.i_neighbour4[14] = MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT;
 }
 
+static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int i)
+{
+    int w = i ? 8 : 16;
+    int i_stride = h->fdec->i_stride[i];
+    int i_stride2 = i_stride << h->mb.b_interlaced;
+    int i_pix_offset = h->mb.b_interlaced
+                     ? w * (h->mb.i_mb_x + (h->mb.i_mb_y&~1) * i_stride) + (h->mb.i_mb_y&1) * i_stride
+                     : w * (h->mb.i_mb_x + h->mb.i_mb_y * i_stride);
+    h->mc.copy[i?PIXEL_8x8:PIXEL_16x16](
+        &h->fdec->plane[i][i_pix_offset], i_stride2,
+        h->mb.pic.p_fdec[i], FDEC_STRIDE, w );
+}
+
 void x264_macroblock_cache_save( x264_t *h )
 {
     const int i_mb_xy = h->mb.i_mb_xy;
@@ -1376,20 +1381,16 @@ void x264_macroblock_cache_save( x264_t *h )
     const int i_mb_4x4 = h->mb.i_b4_xy;
     const int i_mb_8x8 = h->mb.i_b8_xy;
 
-    int i;
+    /* GCC pessimizes direct stores to heap-allocated 8-bit arrays due to aliasing.*/
+    /* By only dereferencing them once, we avoid this issue. */
+    int8_t *intra4x4_pred_mode = h->mb.intra4x4_pred_mode[i_mb_xy];
+    uint8_t *non_zero_count = h->mb.non_zero_count[i_mb_xy];
 
-    for( i = 0; i < 3; i++ )
-    {
-        int w = i ? 8 : 16;
-        int i_stride = h->fdec->i_stride[i];
-        int i_stride2 = i_stride << h->mb.b_interlaced;
-        int i_pix_offset = h->mb.b_interlaced
-                         ? w * (h->mb.i_mb_x + (h->mb.i_mb_y&~1) * i_stride) + (h->mb.i_mb_y&1) * i_stride
-                         : w * (h->mb.i_mb_x + h->mb.i_mb_y * i_stride);
-        h->mc.copy[i?PIXEL_8x8:PIXEL_16x16](
-            &h->fdec->plane[i][i_pix_offset], i_stride2,
-            h->mb.pic.p_fdec[i], FDEC_STRIDE, w );
-    }
+    int i, y;
+
+    x264_macroblock_store_pic( h, 0 );
+    x264_macroblock_store_pic( h, 1 );
+    x264_macroblock_store_pic( h, 2 );
 
     x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y );
 
@@ -1406,40 +1407,28 @@ void x264_macroblock_cache_save( x264_t *h )
     /* save intra4x4 */
     if( i_mb_type == I_4x4 )
     {
-        h->mb.intra4x4_pred_mode[i_mb_xy][0] = h->mb.cache.intra4x4_pred_mode[x264_scan8[10] ];
-        h->mb.intra4x4_pred_mode[i_mb_xy][1] = h->mb.cache.intra4x4_pred_mode[x264_scan8[11] ];
-        h->mb.intra4x4_pred_mode[i_mb_xy][2] = h->mb.cache.intra4x4_pred_mode[x264_scan8[14] ];
-        h->mb.intra4x4_pred_mode[i_mb_xy][3] = h->mb.cache.intra4x4_pred_mode[x264_scan8[15] ];
-        h->mb.intra4x4_pred_mode[i_mb_xy][4] = h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ];
-        h->mb.intra4x4_pred_mode[i_mb_xy][5] = h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ];
-        h->mb.intra4x4_pred_mode[i_mb_xy][6] = h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ];
+        *(uint32_t*)&intra4x4_pred_mode[0] = *(uint32_t*)&h->mb.cache.intra4x4_pred_mode[x264_scan8[10] ];
+        *(uint32_t*)&intra4x4_pred_mode[4] = pack8to32(h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ],
+                                                       h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ],
+                                                       h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0);
     }
     else
-    {
-        h->mb.intra4x4_pred_mode[i_mb_xy][0] =
-        h->mb.intra4x4_pred_mode[i_mb_xy][1] =
-        h->mb.intra4x4_pred_mode[i_mb_xy][2] =
-        h->mb.intra4x4_pred_mode[i_mb_xy][3] =
-        h->mb.intra4x4_pred_mode[i_mb_xy][4] =
-        h->mb.intra4x4_pred_mode[i_mb_xy][5] =
-        h->mb.intra4x4_pred_mode[i_mb_xy][6] = I_PRED_4x4_DC;
-    }
+        *(uint64_t*)intra4x4_pred_mode = I_PRED_4x4_DC * 0x0101010101010101ULL;
 
     if( i_mb_type == I_PCM )
     {
         h->mb.cbp[i_mb_xy] = 0x72f;   /* all set */
         for( i = 0; i < 16 + 2*4; i++ )
-        {
-            h->mb.non_zero_count[i_mb_xy][i] = 16;
-        }
+            non_zero_count[i] = 16;
     }
     else
     {
         /* save non zero count */
-        for( i = 0; i < 16 + 2*4; i++ )
-        {
-            h->mb.non_zero_count[i_mb_xy][i] = h->mb.cache.non_zero_count[x264_scan8[i]];
-        }
+        for( y = 0; y < 4; y++ )
+            *(uint32_t*)&non_zero_count[y*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+y*8];
+        for( y = 0; y < 4; y++ )
+            *(uint16_t*)&non_zero_count[16+y*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+y*2]-1] >> 8;
+
     }
 
     if( h->mb.i_cbp_luma == 0 && h->mb.i_type != I_8x8 )
@@ -1448,20 +1437,25 @@ void x264_macroblock_cache_save( x264_t *h )
 
     if( !IS_INTRA( i_mb_type ) )
     {
-        int i_list;
-        for( i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_B ? 2  : 1 ); i_list++ )
+        h->mb.ref[0][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[0][x264_scan8[0]];
+        h->mb.ref[0][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[0][x264_scan8[4]];
+        h->mb.ref[0][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[0][x264_scan8[8]];
+        h->mb.ref[0][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]];
+        for( y = 0; y < 4; y++ )
         {
-            int y;
-
-            h->mb.ref[i_list][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[i_list][x264_scan8[0]];
-            h->mb.ref[i_list][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[i_list][x264_scan8[4]];
-            h->mb.ref[i_list][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[i_list][x264_scan8[8]];
-            h->mb.ref[i_list][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[i_list][x264_scan8[12]];
-
+            *(uint64_t*)h->mb.mv[0][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[0][x264_scan8[0]+8*y+0];
+            *(uint64_t*)h->mb.mv[0][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[0][x264_scan8[0]+8*y+2];
+        }
+        if(h->sh.i_type == SLICE_TYPE_B)
+        {
+            h->mb.ref[1][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[1][x264_scan8[0]];
+            h->mb.ref[1][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[1][x264_scan8[4]];
+            h->mb.ref[1][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[1][x264_scan8[8]];
+            h->mb.ref[1][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]];
             for( y = 0; y < 4; y++ )
             {
-                *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[i_list][x264_scan8[0]+8*y+0];
-                *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[i_list][x264_scan8[0]+8*y+2];
+                *(uint64_t*)h->mb.mv[1][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[1][x264_scan8[0]+8*y+0];
+                *(uint64_t*)h->mb.mv[1][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[1][x264_scan8[0]+8*y+2];
             }
         }
     }
@@ -1470,11 +1464,8 @@ void x264_macroblock_cache_save( x264_t *h )
         int i_list;
         for( i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_B ? 2  : 1 ); i_list++ )
         {
-            int y;
-
             *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+0*s8x8] = (uint8_t)(-1) * 0x0101;
             *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+1*s8x8] = (uint8_t)(-1) * 0x0101;
-
             for( y = 0; y < 4; y++ )
             {
                 *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] = 0;
@@ -1492,32 +1483,33 @@ void x264_macroblock_cache_save( x264_t *h )
 
         if( !IS_INTRA( i_mb_type ) && !IS_SKIP( i_mb_type ) && !IS_DIRECT( i_mb_type ) )
         {
-            int i_list;
-            for( i_list  = 0; i_list < 2; i_list++ )
+            for( y = 0; y < 4; y++ )
             {
-                const int s4x4 = 4 * h->mb.i_mb_stride;
-                int y;
+                *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mvd[0][x264_scan8[0]+8*y+0];
+                *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mvd[0][x264_scan8[0]+8*y+2];
+            }
+            if( h->sh.i_type == SLICE_TYPE_B )
                 for( y = 0; y < 4; y++ )
                 {
-                    *(uint64_t*)h->mb.mvd[i_list][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mvd[i_list][x264_scan8[0]+8*y+0];
-                    *(uint64_t*)h->mb.mvd[i_list][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mvd[i_list][x264_scan8[0]+8*y+2];
+                    *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mvd[1][x264_scan8[0]+8*y+0];
+                    *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mvd[1][x264_scan8[0]+8*y+2];
                 }
-            }
         }
         else
         {
-            int i_list;
-            for( i_list  = 0; i_list < 2; i_list++ )
+            for( y = 0; y < 4; y++ )
             {
-                const int s4x4 = 4 * h->mb.i_mb_stride;
-                int y;
+                *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+0] = 0;
+                *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+2] = 0;
+            }
+            if( h->sh.i_type == SLICE_TYPE_B )
                 for( y = 0; y < 4; y++ )
                 {
-                    *(uint64_t*)h->mb.mvd[i_list][i_mb_4x4+y*s4x4+0] = 0;
-                    *(uint64_t*)h->mb.mvd[i_list][i_mb_4x4+y*s4x4+2] = 0;
+                    *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+0] = 0;
+                    *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+2] = 0;
                 }
-            }
         }
+
         if( h->sh.i_type == SLICE_TYPE_B )
         {
             if( i_mb_type == B_SKIP || i_mb_type == B_DIRECT )
index 660978a9b15145f6199e8a054da43fc37d2a373b..a59b4f76ce58c1180e237bfc9dae3ff528e09484 100644 (file)
@@ -293,6 +293,30 @@ int  x264_mb_transform_8x8_allowed( x264_t *h );
 void x264_mb_mc( x264_t *h );
 void x264_mb_mc_8x8( x264_t *h, int i8 );
 
+static ALWAYS_INLINE uint32_t pack16to32( int a, int b )
+{
+#ifdef WORDS_BIGENDIAN
+   return b + (a<<16);
+#else
+   return a + (b<<16);
+#endif
+}
+static ALWAYS_INLINE uint32_t pack8to16( int a, int b )
+{
+#ifdef WORDS_BIGENDIAN
+   return b + (a<<8);
+#else
+   return a + (b<<8);
+#endif
+}
+static ALWAYS_INLINE uint32_t pack8to32( int a, int b, int c, int d )
+{
+#ifdef WORDS_BIGENDIAN
+   return d + (c<<8) + (b<<16) + (a<<24);
+#else
+   return a + (b<<8) + (c<<16) + (d<<24);
+#endif
+}
 static ALWAYS_INLINE uint32_t pack16to32_mask( int a, int b )
 {
 #ifdef WORDS_BIGENDIAN
index d253879878b400eabb342a63100572b9a93ace0b..7f7a5f6830bfcea68174af1469b2fa479ee4950e 100644 (file)
 #   include "ppc/predict.h"
 #endif
 
-static ALWAYS_INLINE uint32_t pack16to32( int a, int b )
-{
-#ifdef WORDS_BIGENDIAN
-   return b + (a<<16);
-#else
-   return a + (b<<16);
-#endif
-}
-
-static ALWAYS_INLINE uint32_t pack8to16( int a, int b )
-{
-#ifdef WORDS_BIGENDIAN
-   return b + (a<<8);
-#else
-   return a + (b<<8);
-#endif
-}
-
 /****************************************************************************
  * 16x16 prediction for intra luma block
  ****************************************************************************/
index 7ac93b11902cb5161597c5b3b877318d7252bd14..7d03e41b21c6008950ea831b3ab6ae8e1de5eda5 100644 (file)
@@ -549,23 +549,26 @@ void x264_macroblock_encode( x264_t *h )
         for( i = 0; i < 4; i++)
         {
             if(!nnz8x8[i])
-                for( j = 0; j < 4; j++ )
-                    h->mb.cache.non_zero_count[x264_scan8[j+i*4]] = 0;
+            {
+                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[0+i*4]] = 0;
+                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[2+i*4]] = 0;
+            }
             else if( h->mb.b_transform_8x8 )
             {
-                int nz = nnz8x8[i];
-                for( j = 0; j < 4; j++ )
-                    h->mb.cache.non_zero_count[x264_scan8[j+4*i]] = nz;
-                h->mb.i_cbp_luma |= nz << i;
+                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[0+4*i]] = nnz8x8[i] * 0x0101;
+                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[2+4*i]] = nnz8x8[i] * 0x0101;
+                h->mb.i_cbp_luma |= nnz8x8[i] << i;
             }
             else
             {
+                int nz, cbp = 0;
                 for( j = 0; j < 4; j++ )
                 {
-                    int nz = array_non_zero( h->dct.luma4x4[j+i*4] );
-                    h->mb.cache.non_zero_count[x264_scan8[j+i*4]] = nz;
-                    h->mb.i_cbp_luma |= nz << i;
+                    nz = array_non_zero( h->dct.luma4x4[j+4*i] );
+                    h->mb.cache.non_zero_count[x264_scan8[j+4*i]] = nz;
+                    cbp |= nz;
                 }
+                h->mb.i_cbp_luma |= cbp << i;
             }
         }
     }