Convert NNZ to raster order and other optimizations

author Fiona Glaser <fiona@x264.com>
Tue, 24 Jun 2008 18:23:50 +0000 (12:23 -0600)
committer Fiona Glaser <fiona@x264.com>
Tue, 24 Jun 2008 18:23:50 +0000 (12:23 -0600)
diff --git a/common/common.h b/common/common.h

index 0636394164aee0c615ce93da7ce44ecb4927dbe0..33b71c40c77c6acf1aa2b1aa0bc2b328c603d584 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -405,7 +405,8 @@ struct x264_t
          int8_t  *type;                      /* mb type */
          int8_t  *qp;                        /* mb qp */
          int16_t *cbp;                       /* mb cbp: 0x0?: luma, 0x?0: chroma, 0x100: luma dc, 0x0200 and 0x0400: chroma dc  (all set for PCM)*/
-        int8_t  (*intra4x4_pred_mode)[7];   /* intra4x4 pred mode. for non I4x4 set to I_PRED_4x4_DC(2) */
+        int8_t  (*intra4x4_pred_mode)[8];   /* intra4x4 pred mode. for non I4x4 set to I_PRED_4x4_DC(2) */
+                                            /* actually has only 7 entries; set to 8 for write-combining optimizations */
          uint8_t (*non_zero_count)[16+4+4];  /* nzc. for I_PCM set to 16 */
          int8_t  *chroma_pred_mode;          /* chroma_pred_mode. cabac only. for non intra I_PRED_CHROMA_DC(0) */
          int16_t (*mv[2])[2];                /* mb mv. set to 0 for intra mb */
diff --git a/common/frame.c b/common/frame.c

index 214d4fc7f007fad4483c67e84eaece23bb9f23da..0c7a56c5b17d581e4160963a1e60a8ccc269a5a7 100644 (file)
--- a/common/frame.c
+++ b/common/frame.c
@@ -306,16 +306,16 @@ void munge_cavlc_nnz_row( x264_t *h, int mb_y, uint8_t (*buf)[16] )
  {
      uint32_t (*src)[6] = (uint32_t(*)[6])h->mb.non_zero_count + mb_y * h->sps->i_mb_width;
      int8_t *transform = h->mb.mb_transform_size + mb_y * h->sps->i_mb_width;
-    int x;
+    int x, nnz;
      for( x=0; x<h->sps->i_mb_width; x++ )
      {
          memcpy( buf+x, src+x, 16 );
          if( transform[x] )
          {
-            if( src[x][0] ) src[x][0] = 0x01010101;
-            if( src[x][1] ) src[x][1] = 0x01010101;
-            if( src[x][2] ) src[x][2] = 0x01010101;
-            if( src[x][3] ) src[x][3] = 0x01010101;
+            nnz = src[x][0] | src[x][1];
+            src[x][0] = src[x][1] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
+            nnz = src[x][2] | src[x][3];
+            src[x][2] = src[x][3] = ((uint16_t)nnz ? 0x0101 : 0) + (nnz>>16 ? 0x01010000 : 0);
          }
      }
  }
@@ -642,8 +642,8 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
                          int y  = i_dir == 0 ? i      : i_edge;\
                          int xn = (x - (i_dir == 0 ? 1 : 0 ))&0x03;\
                          int yn = (y - (i_dir == 0 ? 0 : 1 ))&0x03;\
-                        if( h->mb.non_zero_count[mb_xy][block_idx_xy[x][y]] != 0 ||\
-                            h->mb.non_zero_count[mbn_xy][block_idx_xy[xn][yn]] != 0 )\
+                        if( h->mb.non_zero_count[mb_xy][x+y*4] != 0 ||\
+                            h->mb.non_zero_count[mbn_xy][xn+yn*4] != 0 )\
                          {\
                              bS[i] = 2;\
                          }\
diff --git a/common/macroblock.c b/common/macroblock.c

index df912ee302dbbb2aa888f85ade588b811a920e60..b08d4303bb6bd2d4ec9c5b8032fd57e29a4a160e 100644 (file)
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -855,7 +855,7 @@ int x264_macroblock_cache_init( x264_t *h )
      CHECKED_MALLOC( h->mb.mb_transform_size, i_mb_count * sizeof(int8_t) );
  
      /* 0 -> 3 top(4), 4 -> 6 : left(3) */
-    CHECKED_MALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 7 * sizeof(int8_t) );
+    CHECKED_MALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 8 * sizeof(int8_t) );
  
      /* all coeffs */
      CHECKED_MALLOC( h->mb.non_zero_count, i_mb_count * 24 * sizeof(uint8_t) );
@@ -1045,27 +1045,18 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
          h->mb.i_neighbour |= MB_TOP;
  
          /* load intra4x4 */
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] = h->mb.intra4x4_pred_mode[i_top_xy][0];
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[1] - 8] = h->mb.intra4x4_pred_mode[i_top_xy][1];
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[4] - 8] = h->mb.intra4x4_pred_mode[i_top_xy][2];
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[5] - 8] = h->mb.intra4x4_pred_mode[i_top_xy][3];
+        *(uint32_t*)&h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] = *(uint32_t*)&h->mb.intra4x4_pred_mode[i_top_xy][0];
  
          /* load non_zero_count */
-        h->mb.cache.non_zero_count[x264_scan8[0] - 8] = h->mb.non_zero_count[i_top_xy][10];
-        h->mb.cache.non_zero_count[x264_scan8[1] - 8] = h->mb.non_zero_count[i_top_xy][11];
-        h->mb.cache.non_zero_count[x264_scan8[4] - 8] = h->mb.non_zero_count[i_top_xy][14];
-        h->mb.cache.non_zero_count[x264_scan8[5] - 8] = h->mb.non_zero_count[i_top_xy][15];
-
-        h->mb.cache.non_zero_count[x264_scan8[16+0] - 8] = h->mb.non_zero_count[i_top_xy][16+2];
-        h->mb.cache.non_zero_count[x264_scan8[16+1] - 8] = h->mb.non_zero_count[i_top_xy][16+3];
-
-        h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 8] = h->mb.non_zero_count[i_top_xy][16+4+2];
-        h->mb.cache.non_zero_count[x264_scan8[16+4+1] - 8] = h->mb.non_zero_count[i_top_xy][16+4+3];
+        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0] - 8] = *(uint32_t*)&h->mb.non_zero_count[i_top_xy][12];
+        /* shift because x264_scan8[16] is misaligned */
+        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16] - 9] = *(uint16_t*)&h->mb.non_zero_count[i_top_xy][18] << 8;
+        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] = *(uint16_t*)&h->mb.non_zero_count[i_top_xy][22] << 8;
      }
      else
      {
          h->mb.i_mb_type_top = -1;
-        
+
          /* load intra4x4 */
          h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] =
          h->mb.cache.intra4x4_pred_mode[x264_scan8[1] - 8] =
@@ -1081,7 +1072,6 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
          h->mb.cache.non_zero_count[x264_scan8[16+1] - 8] =
          h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 8] =
          h->mb.cache.non_zero_count[x264_scan8[16+4+1] - 8] = 0x80;
-
      }
  
      if( i_mb_x > 0 && i_mb_xy > h->sh.i_first_mb )
@@ -1099,9 +1089,9 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
          h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = h->mb.intra4x4_pred_mode[i_left_xy][3];
  
          /* load non_zero_count */
-        h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = h->mb.non_zero_count[i_left_xy][5];
+        h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = h->mb.non_zero_count[i_left_xy][3];
          h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = h->mb.non_zero_count[i_left_xy][7];
-        h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = h->mb.non_zero_count[i_left_xy][13];
+        h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = h->mb.non_zero_count[i_left_xy][11];
          h->mb.cache.non_zero_count[x264_scan8[10] - 1] = h->mb.non_zero_count[i_left_xy][15];
  
          h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] = h->mb.non_zero_count[i_left_xy][16+1];
@@ -1329,13 +1319,15 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
              memset( h->mb.cache.skip, 0, X264_SCAN8_SIZE * sizeof( int8_t ) );
              if( i_left_type >= 0 )
              {
-                h->mb.cache.skip[x264_scan8[0] - 1] = h->mb.skipbp[i_left_xy] & 0x2;
-                h->mb.cache.skip[x264_scan8[8] - 1] = h->mb.skipbp[i_left_xy] & 0x8;
+                uint8_t skipbp = h->mb.skipbp[i_left_xy];
+                h->mb.cache.skip[x264_scan8[0] - 1] = skipbp & 0x2;
+                h->mb.cache.skip[x264_scan8[8] - 1] = skipbp & 0x8;
              }
              if( i_top_type >= 0 )
              {
-                h->mb.cache.skip[x264_scan8[0] - 8] = h->mb.skipbp[i_top_xy] & 0x4;
-                h->mb.cache.skip[x264_scan8[4] - 8] = h->mb.skipbp[i_top_xy] & 0x8;
+                uint8_t skipbp = h->mb.skipbp[i_top_xy];
+                h->mb.cache.skip[x264_scan8[0] - 8] = skipbp & 0x4;
+                h->mb.cache.skip[x264_scan8[4] - 8] = skipbp & 0x8;
              }
          }
  
@@ -1367,6 +1359,19 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
      h->mb.i_neighbour4[14] = MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT;
  }
  
+static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int i)
+{
+    int w = i ? 8 : 16;
+    int i_stride = h->fdec->i_stride[i];
+    int i_stride2 = i_stride << h->mb.b_interlaced;
+    int i_pix_offset = h->mb.b_interlaced
+                     ? w * (h->mb.i_mb_x + (h->mb.i_mb_y&~1) * i_stride) + (h->mb.i_mb_y&1) * i_stride
+                     : w * (h->mb.i_mb_x + h->mb.i_mb_y * i_stride);
+    h->mc.copy[i?PIXEL_8x8:PIXEL_16x16](
+        &h->fdec->plane[i][i_pix_offset], i_stride2,
+        h->mb.pic.p_fdec[i], FDEC_STRIDE, w );
+}
+
  void x264_macroblock_cache_save( x264_t *h )
  {
      const int i_mb_xy = h->mb.i_mb_xy;
@@ -1376,20 +1381,16 @@ void x264_macroblock_cache_save( x264_t *h )
      const int i_mb_4x4 = h->mb.i_b4_xy;
      const int i_mb_8x8 = h->mb.i_b8_xy;
  
-    int i;
+    /* GCC pessimizes direct stores to heap-allocated 8-bit arrays due to aliasing.*/
+    /* By only dereferencing them once, we avoid this issue. */
+    int8_t *intra4x4_pred_mode = h->mb.intra4x4_pred_mode[i_mb_xy];
+    uint8_t *non_zero_count = h->mb.non_zero_count[i_mb_xy];
  
-    for( i = 0; i < 3; i++ )
-    {
-        int w = i ? 8 : 16;
-        int i_stride = h->fdec->i_stride[i];
-        int i_stride2 = i_stride << h->mb.b_interlaced;
-        int i_pix_offset = h->mb.b_interlaced
-                         ? w * (h->mb.i_mb_x + (h->mb.i_mb_y&~1) * i_stride) + (h->mb.i_mb_y&1) * i_stride
-                         : w * (h->mb.i_mb_x + h->mb.i_mb_y * i_stride);
-        h->mc.copy[i?PIXEL_8x8:PIXEL_16x16](
-            &h->fdec->plane[i][i_pix_offset], i_stride2,
-            h->mb.pic.p_fdec[i], FDEC_STRIDE, w );
-    }
+    int i, y;
+
+    x264_macroblock_store_pic( h, 0 );
+    x264_macroblock_store_pic( h, 1 );
+    x264_macroblock_store_pic( h, 2 );
  
      x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y );
  
@@ -1406,40 +1407,28 @@ void x264_macroblock_cache_save( x264_t *h )
      /* save intra4x4 */
      if( i_mb_type == I_4x4 )
      {
-        h->mb.intra4x4_pred_mode[i_mb_xy][0] = h->mb.cache.intra4x4_pred_mode[x264_scan8[10] ];
-        h->mb.intra4x4_pred_mode[i_mb_xy][1] = h->mb.cache.intra4x4_pred_mode[x264_scan8[11] ];
-        h->mb.intra4x4_pred_mode[i_mb_xy][2] = h->mb.cache.intra4x4_pred_mode[x264_scan8[14] ];
-        h->mb.intra4x4_pred_mode[i_mb_xy][3] = h->mb.cache.intra4x4_pred_mode[x264_scan8[15] ];
-        h->mb.intra4x4_pred_mode[i_mb_xy][4] = h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ];
-        h->mb.intra4x4_pred_mode[i_mb_xy][5] = h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ];
-        h->mb.intra4x4_pred_mode[i_mb_xy][6] = h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ];
+        *(uint32_t*)&intra4x4_pred_mode[0] = *(uint32_t*)&h->mb.cache.intra4x4_pred_mode[x264_scan8[10] ];
+        *(uint32_t*)&intra4x4_pred_mode[4] = pack8to32(h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ],
+                                                       h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ],
+                                                       h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0);
      }
      else
-    {
-        h->mb.intra4x4_pred_mode[i_mb_xy][0] =
-        h->mb.intra4x4_pred_mode[i_mb_xy][1] =
-        h->mb.intra4x4_pred_mode[i_mb_xy][2] =
-        h->mb.intra4x4_pred_mode[i_mb_xy][3] =
-        h->mb.intra4x4_pred_mode[i_mb_xy][4] =
-        h->mb.intra4x4_pred_mode[i_mb_xy][5] =
-        h->mb.intra4x4_pred_mode[i_mb_xy][6] = I_PRED_4x4_DC;
-    }
+        *(uint64_t*)intra4x4_pred_mode = I_PRED_4x4_DC * 0x0101010101010101ULL;
  
      if( i_mb_type == I_PCM )
      {
          h->mb.cbp[i_mb_xy] = 0x72f;   /* all set */
          for( i = 0; i < 16 + 2*4; i++ )
-        {
-            h->mb.non_zero_count[i_mb_xy][i] = 16;
-        }
+            non_zero_count[i] = 16;
      }
      else
      {
          /* save non zero count */
-        for( i = 0; i < 16 + 2*4; i++ )
-        {
-            h->mb.non_zero_count[i_mb_xy][i] = h->mb.cache.non_zero_count[x264_scan8[i]];
-        }
+        for( y = 0; y < 4; y++ )
+            *(uint32_t*)&non_zero_count[y*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+y*8];
+        for( y = 0; y < 4; y++ )
+            *(uint16_t*)&non_zero_count[16+y*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+y*2]-1] >> 8;
+
      }
  
      if( h->mb.i_cbp_luma == 0 && h->mb.i_type != I_8x8 )
@@ -1448,20 +1437,25 @@ void x264_macroblock_cache_save( x264_t *h )
  
      if( !IS_INTRA( i_mb_type ) )
      {
-        int i_list;
-        for( i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_B ? 2  : 1 ); i_list++ )
+        h->mb.ref[0][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[0][x264_scan8[0]];
+        h->mb.ref[0][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[0][x264_scan8[4]];
+        h->mb.ref[0][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[0][x264_scan8[8]];
+        h->mb.ref[0][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]];
+        for( y = 0; y < 4; y++ )
          {
-            int y;
-
-            h->mb.ref[i_list][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[i_list][x264_scan8[0]];
-            h->mb.ref[i_list][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[i_list][x264_scan8[4]];
-            h->mb.ref[i_list][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[i_list][x264_scan8[8]];
-            h->mb.ref[i_list][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[i_list][x264_scan8[12]];
-
+            *(uint64_t*)h->mb.mv[0][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[0][x264_scan8[0]+8*y+0];
+            *(uint64_t*)h->mb.mv[0][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[0][x264_scan8[0]+8*y+2];
+        }
+        if(h->sh.i_type == SLICE_TYPE_B)
+        {
+            h->mb.ref[1][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[1][x264_scan8[0]];
+            h->mb.ref[1][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[1][x264_scan8[4]];
+            h->mb.ref[1][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[1][x264_scan8[8]];
+            h->mb.ref[1][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]];
              for( y = 0; y < 4; y++ )
              {
-                *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[i_list][x264_scan8[0]+8*y+0];
-                *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[i_list][x264_scan8[0]+8*y+2];
+                *(uint64_t*)h->mb.mv[1][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[1][x264_scan8[0]+8*y+0];
+                *(uint64_t*)h->mb.mv[1][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[1][x264_scan8[0]+8*y+2];
              }
          }
      }
@@ -1470,11 +1464,8 @@ void x264_macroblock_cache_save( x264_t *h )
          int i_list;
          for( i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_B ? 2  : 1 ); i_list++ )
          {
-            int y;
-
              *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+0*s8x8] = (uint8_t)(-1) * 0x0101;
              *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+1*s8x8] = (uint8_t)(-1) * 0x0101;
-
              for( y = 0; y < 4; y++ )
              {
                  *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] = 0;
@@ -1492,32 +1483,33 @@ void x264_macroblock_cache_save( x264_t *h )
  
          if( !IS_INTRA( i_mb_type ) && !IS_SKIP( i_mb_type ) && !IS_DIRECT( i_mb_type ) )
          {
-            int i_list;
-            for( i_list  = 0; i_list < 2; i_list++ )
+            for( y = 0; y < 4; y++ )
              {
-                const int s4x4 = 4 * h->mb.i_mb_stride;
-                int y;
+                *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mvd[0][x264_scan8[0]+8*y+0];
+                *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mvd[0][x264_scan8[0]+8*y+2];
+            }
+            if( h->sh.i_type == SLICE_TYPE_B )
                  for( y = 0; y < 4; y++ )
                  {
-                    *(uint64_t*)h->mb.mvd[i_list][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mvd[i_list][x264_scan8[0]+8*y+0];
-                    *(uint64_t*)h->mb.mvd[i_list][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mvd[i_list][x264_scan8[0]+8*y+2];
+                    *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mvd[1][x264_scan8[0]+8*y+0];
+                    *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mvd[1][x264_scan8[0]+8*y+2];
                  }
-            }
          }
          else
          {
-            int i_list;
-            for( i_list  = 0; i_list < 2; i_list++ )
+            for( y = 0; y < 4; y++ )
              {
-                const int s4x4 = 4 * h->mb.i_mb_stride;
-                int y;
+                *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+0] = 0;
+                *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+2] = 0;
+            }
+            if( h->sh.i_type == SLICE_TYPE_B )
                  for( y = 0; y < 4; y++ )
                  {
-                    *(uint64_t*)h->mb.mvd[i_list][i_mb_4x4+y*s4x4+0] = 0;
-                    *(uint64_t*)h->mb.mvd[i_list][i_mb_4x4+y*s4x4+2] = 0;
+                    *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+0] = 0;
+                    *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+2] = 0;
                  }
-            }
          }
+
          if( h->sh.i_type == SLICE_TYPE_B )
          {
              if( i_mb_type == B_SKIP || i_mb_type == B_DIRECT )
diff --git a/common/macroblock.h b/common/macroblock.h

index 660978a9b15145f6199e8a054da43fc37d2a373b..a59b4f76ce58c1180e237bfc9dae3ff528e09484 100644 (file)
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -293,6 +293,30 @@ int  x264_mb_transform_8x8_allowed( x264_t *h );
  void x264_mb_mc( x264_t *h );
  void x264_mb_mc_8x8( x264_t *h, int i8 );
  
+static ALWAYS_INLINE uint32_t pack16to32( int a, int b )
+{
+#ifdef WORDS_BIGENDIAN
+   return b + (a<<16);
+#else
+   return a + (b<<16);
+#endif
+}
+static ALWAYS_INLINE uint32_t pack8to16( int a, int b )
+{
+#ifdef WORDS_BIGENDIAN
+   return b + (a<<8);
+#else
+   return a + (b<<8);
+#endif
+}
+static ALWAYS_INLINE uint32_t pack8to32( int a, int b, int c, int d )
+{
+#ifdef WORDS_BIGENDIAN
+   return d + (c<<8) + (b<<16) + (a<<24);
+#else
+   return a + (b<<8) + (c<<16) + (d<<24);
+#endif
+}
  static ALWAYS_INLINE uint32_t pack16to32_mask( int a, int b )
  {
  #ifdef WORDS_BIGENDIAN
diff --git a/common/predict.c b/common/predict.c

index d253879878b400eabb342a63100572b9a93ace0b..7f7a5f6830bfcea68174af1469b2fa479ee4950e 100644 (file)
--- a/common/predict.c
+++ b/common/predict.c
@@ -37,24 +37,6 @@
  #   include "ppc/predict.h"
  #endif
  
-static ALWAYS_INLINE uint32_t pack16to32( int a, int b )
-{
-#ifdef WORDS_BIGENDIAN
-   return b + (a<<16);
-#else
-   return a + (b<<16);
-#endif
-}
-
-static ALWAYS_INLINE uint32_t pack8to16( int a, int b )
-{
-#ifdef WORDS_BIGENDIAN
-   return b + (a<<8);
-#else
-   return a + (b<<8);
-#endif
-}
-
  /****************************************************************************
   * 16x16 prediction for intra luma block
   ****************************************************************************/
diff --git a/encoder/macroblock.c b/encoder/macroblock.c

index 7ac93b11902cb5161597c5b3b877318d7252bd14..7d03e41b21c6008950ea831b3ab6ae8e1de5eda5 100644 (file)
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -549,23 +549,26 @@ void x264_macroblock_encode( x264_t *h )
          for( i = 0; i < 4; i++)
          {
              if(!nnz8x8[i])
-                for( j = 0; j < 4; j++ )
-                    h->mb.cache.non_zero_count[x264_scan8[j+i*4]] = 0;
+            {
+                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[0+i*4]] = 0;
+                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[2+i*4]] = 0;
+            }
              else if( h->mb.b_transform_8x8 )
              {
-                int nz = nnz8x8[i];
-                for( j = 0; j < 4; j++ )
-                    h->mb.cache.non_zero_count[x264_scan8[j+4*i]] = nz;
-                h->mb.i_cbp_luma |= nz << i;
+                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[0+4*i]] = nnz8x8[i] * 0x0101;
+                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[2+4*i]] = nnz8x8[i] * 0x0101;
+                h->mb.i_cbp_luma |= nnz8x8[i] << i;
              }
              else
              {
+                int nz, cbp = 0;
                  for( j = 0; j < 4; j++ )
                  {
-                    int nz = array_non_zero( h->dct.luma4x4[j+i*4] );
-                    h->mb.cache.non_zero_count[x264_scan8[j+i*4]] = nz;
-                    h->mb.i_cbp_luma |= nz << i;
+                    nz = array_non_zero( h->dct.luma4x4[j+4*i] );
+                    h->mb.cache.non_zero_count[x264_scan8[j+4*i]] = nz;
+                    cbp |= nz;
                  }
+                h->mb.i_cbp_luma |= cbp << i;
              }
          }
      }
author	Fiona Glaser <fiona@x264.com>
	Tue, 24 Jun 2008 18:23:50 +0000 (12:23 -0600)
committer	Fiona Glaser <fiona@x264.com>
	Tue, 24 Jun 2008 18:23:50 +0000 (12:23 -0600)
common/common.h		patch | blob | history
common/frame.c		patch | blob | history
common/macroblock.c		patch | blob | history
common/macroblock.h		patch | blob | history
common/predict.c		patch | blob | history
encoder/macroblock.c		patch | blob | history