~15% faster chroma encode by reorganizing the CBP calculation and adding a special-case idct_dc function, since most coded chroma blocks are DC-only.
Small optimization in cache_save (skipbp)
Fix array_non_zero so it doesn't violate strict aliasing (should eliminate miscompilation issues in the future)
Add automatic substitutions for some asm instructions that have an equivalent smaller encoding.
add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
}
+static inline void add4x4_idct_dc( uint8_t *p_dst, int16_t dc )
+{
+ int i;
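+ /* The iDCT of a DC-only block is a constant residual, so the whole
+  * transform collapses to one rounded shift added to every pixel. */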
+ dc = (dc + 32) >> 6;
+ for( i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
+ {
+ p_dst[0] = x264_clip_uint8( p_dst[0] + dc );
+ p_dst[1] = x264_clip_uint8( p_dst[1] + dc );
+ p_dst[2] = x264_clip_uint8( p_dst[2] + dc );
+ p_dst[3] = x264_clip_uint8( p_dst[3] + dc );
+ }
+}
+
+static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[2][2] )
+{
+ add4x4_idct_dc( &p_dst[0], dct[0][0] );
+ add4x4_idct_dc( &p_dst[4], dct[0][1] );
+ add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[1][0] );
+ add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[1][1] );
+}
+
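For example, a dequantized chroma DC of 130 gives (130 + 32) >> 6 = 2, so add4x4_idct_dc just adds 2 to all 16 pixels of its quadrant (with clipping); no row/column transform passes are needed.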
/****************************************************************************
* x264_dct_init:
dctf->sub8x8_dct = sub8x8_dct;
dctf->add8x8_idct = add8x8_idct;
+ dctf->add8x8_idct_dc = add8x8_idct_dc;
dctf->sub16x16_dct = sub16x16_dct;
dctf->add16x16_idct = add16x16_idct;
{
dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
dctf->add4x4_idct = x264_add4x4_idct_mmx;
+ dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx;
dctf->dct4x4dc = x264_dct4x4dc_mmx;
dctf->idct4x4dc = x264_idct4x4dc_mmx;
dctf->add8x8_idct = x264_add8x8_idct_sse2;
dctf->add16x16_idct = x264_add16x16_idct_sse2;
}
+
+ if( cpu&X264_CPU_SSSE3 )
+ dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
#endif //HAVE_MMX
#ifdef ARCH_PPC
void (*sub8x8_dct) ( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 );
void (*add8x8_idct) ( uint8_t *p_dst, int16_t dct[4][4][4] );
+ void (*add8x8_idct_dc) ( uint8_t *p_dst, int16_t dct[2][2] );
void (*sub16x16_dct) ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
void (*add16x16_idct)( uint8_t *p_dst, int16_t dct[16][4][4] );
h->mb.skipbp[i_mb_xy] = 0xf;
else if( i_mb_type == B_8x8 )
{
- int skipbp = 0;
- for( i = 0; i < 4; i++ )
- skipbp |= ( h->mb.i_sub_partition[i] == D_DIRECT_8x8 ) << i;
+ int skipbp = ( h->mb.i_sub_partition[0] == D_DIRECT_8x8 ) << 0;
+ skipbp |= ( h->mb.i_sub_partition[1] == D_DIRECT_8x8 ) << 1;
+ skipbp |= ( h->mb.i_sub_partition[2] == D_DIRECT_8x8 ) << 2;
+ skipbp |= ( h->mb.i_sub_partition[3] == D_DIRECT_8x8 ) << 3;
h->mb.skipbp[i_mb_xy] = skipbp;
}
else
#define array_non_zero_int array_non_zero_int_c
static ALWAYS_INLINE int array_non_zero_int_c( void *v, int i_count )
{
- uint64_t *x = v;
+ union {uint16_t s[4]; uint64_t l;} *x = v;
if(i_count == 8)
- return !!x[0];
+ return !!x[0].l;
else if(i_count == 16)
- return !!(x[0]|x[1]);
+ return !!(x[0].l|x[1].l);
else if(i_count == 32)
- return !!(x[0]|x[1]|x[2]|x[3]);
+ return !!(x[0].l|x[1].l|x[2].l|x[3].l);
else
{
int i;
i_count /= sizeof(uint64_t);
for( i = 0; i < i_count; i++ )
- if( x[i] ) return 1;
+ if( x[i].l ) return 1;
return 0;
}
}
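A minimal sketch of the hazard the union removes (hypothetical helper names, not part of the patch): dereferencing an int16_t coefficient array through a plain uint64_t* is undefined behavior, so GCC's alias analysis is free to assume the 16-bit stores and the 64-bit load don't conflict; punning through a union is the alternative GCC documents as well-defined.

    static int nz_bad( int16_t *coef )
    {   /* UB: int16_t data accessed through an incompatible uint64_t lvalue */
        return !!*(uint64_t*)coef;
    }
    static int nz_ok( int16_t *coef )
    {   /* union type punning, documented as supported by GCC */
        union { uint16_t s[4]; uint64_t l; } *x = (void*)coef;
        return !!x->l;
    }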
pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11
pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
+pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
SECTION .text
SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2, 128, 8, 0, 0
ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 0
+;-----------------------------------------------------------------------------
+; void add8x8_idct_dc( uint8_t *p_dst, int16_t *dct2x2 )
+;-----------------------------------------------------------------------------
+
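+; %1 holds max(dc,0) and %2 holds max(-dc,0), broadcast as bytes; paddusb
+; followed by psubusb then implements a signed DC add with free clipping to
+; [0,255], since MMX has no packed-byte signed add with unsigned saturation.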
+%macro ADD_DC 3
+ movq mm4, [%3+FDEC_STRIDE*0]
+ movq mm5, [%3+FDEC_STRIDE*1]
+ movq mm6, [%3+FDEC_STRIDE*2]
+ paddusb mm4, %1
+ paddusb mm5, %1
+ paddusb mm6, %1
+ paddusb %1, [%3+FDEC_STRIDE*3]
+ psubusb mm4, %2
+ psubusb mm5, %2
+ psubusb mm6, %2
+ psubusb %1, %2
+ movq [%3+FDEC_STRIDE*0], mm4
+ movq [%3+FDEC_STRIDE*1], mm5
+ movq [%3+FDEC_STRIDE*2], mm6
+ movq [%3+FDEC_STRIDE*3], %1
+%endmacro
+
+cglobal x264_add8x8_idct_dc_mmx, 2,2
+ movq mm0, [r1]
+ pxor mm1, mm1
+ add r0, FDEC_STRIDE*4
+ paddw mm0, [pw_32 GLOBAL]
+ psraw mm0, 6
+ psubw mm1, mm0
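+ ; mm0 = dc, mm1 = -dc; packuswb clamps each word to a byte, yielding
+ ; max(dc,0) and max(-dc,0), and the unpack/shuffle steps below broadcast
+ ; each DC across its 4-pixel-wide sub-block (mm0/mm1 top, mm2/mm3 bottom)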
+ packuswb mm0, mm0
+ packuswb mm1, mm1
+ punpcklbw mm0, mm0
+ punpcklbw mm1, mm1
+ pshufw mm2, mm0, 0xFA
+ pshufw mm3, mm1, 0xFA
+ punpcklbw mm0, mm0
+ punpcklbw mm1, mm1
+ ADD_DC mm0, mm1, r0-FDEC_STRIDE*4
+ ADD_DC mm2, mm3, r0
+ ret
+
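+; Same DC-splitting trick as the MMX version, but pshufb broadcasts all four
+; DCs in one step and each xmm register pairs a top-half row (movq) with the
+; matching bottom-half row (movhps).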
+cglobal x264_add8x8_idct_dc_ssse3, 2,2
+ movq xmm0, [r1]
+ pxor xmm1, xmm1
+ add r0, FDEC_STRIDE*4
+ paddw xmm0, [pw_32 GLOBAL]
+ psraw xmm0, 6
+ psubw xmm1, xmm0
+ movdqa xmm5, [pb_idctdc_unpack GLOBAL]
+ packuswb xmm0, xmm0
+ packuswb xmm1, xmm1
+ pshufb xmm0, xmm5
+ pshufb xmm1, xmm5
+ movq xmm2, [r0+FDEC_STRIDE*-4]
+ movq xmm3, [r0+FDEC_STRIDE*-3]
+ movq xmm4, [r0+FDEC_STRIDE*-2]
+ movq xmm5, [r0+FDEC_STRIDE*-1]
+ movhps xmm2, [r0+FDEC_STRIDE* 0]
+ movhps xmm3, [r0+FDEC_STRIDE* 1]
+ movhps xmm4, [r0+FDEC_STRIDE* 2]
+ movhps xmm5, [r0+FDEC_STRIDE* 3]
+ paddusb xmm2, xmm0
+ paddusb xmm3, xmm0
+ paddusb xmm4, xmm0
+ paddusb xmm5, xmm0
+ psubusb xmm2, xmm1
+ psubusb xmm3, xmm1
+ psubusb xmm4, xmm1
+ psubusb xmm5, xmm1
+ movq [r0+FDEC_STRIDE*-4], xmm2
+ movq [r0+FDEC_STRIDE*-3], xmm3
+ movq [r0+FDEC_STRIDE*-2], xmm4
+ movq [r0+FDEC_STRIDE*-1], xmm5
+ movhps [r0+FDEC_STRIDE* 0], xmm2
+ movhps [r0+FDEC_STRIDE* 1], xmm3
+ movhps [r0+FDEC_STRIDE* 2], xmm4
+ movhps [r0+FDEC_STRIDE* 3], xmm5
+ ret
+
;-----------------------------------------------------------------------------
; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4] );
void x264_add8x8_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
+void x264_add8x8_idct_dc_mmx ( uint8_t *p_dst, int16_t dct[2][2] );
void x264_add16x16_idct_mmx ( uint8_t *p_dst, int16_t dct[16][4][4] );
void x264_add8x8_idct_sse2 ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
void x264_add16x16_idct_sse2 ( uint8_t *p_dst, int16_t dct[16][4][4] );
+void x264_add8x8_idct_dc_ssse3( uint8_t *p_dst, int16_t dct[2][2] );
void x264_dct4x4dc_mmx ( int16_t d[4][4] );
void x264_idct4x4dc_mmx ( int16_t d[4][4] );
%endif
%endmacro
+; Substitutions that reduce instruction size but are functionally equivalent
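+; e.g. "add r0, 128" needs a 4-byte immediate because 128 doesn't fit in a
+; signed byte, while the equivalent "sub r0, -128" uses the sign-extended
+; imm8 form, typically saving 3 bytes per instruction.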
+%macro add 2
+ %ifnum %2
+ %if %2==128
+ sub %1, -128
+ %else
+ add %1, %2
+ %endif
+ %else
+ add %1, %2
+ %endif
+%endmacro
+
+%macro sub 2
+ %ifnum %2
+ %if %2==128
+ add %1, -128
+ %else
+ sub %1, %2
+ %endif
+ %else
+ sub %1, %2
+ %endif
+%endmacro
}
#undef ZIG
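+/* The 2x2 chroma DC dequant shift is qp/6 - 5; when positive it is folded
+ * into the multiplier up front so that the single ">> -qbits" used by both
+ * callers is always a valid non-negative right shift. */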
+#define IDCT_DEQUANT_START \
+ int d0 = dct[0][0] + dct[0][1]; \
+ int d1 = dct[1][0] + dct[1][1]; \
+ int d2 = dct[0][0] - dct[0][1]; \
+ int d3 = dct[1][0] - dct[1][1]; \
+ int dmf = dequant_mf[i_qp%6][0][0]; \
+ int qbits = i_qp/6 - 5; \
+ if( qbits > 0 ) \
+ { \
+ dmf <<= qbits; \
+ qbits = 0; \
+ }
+
static inline void idct_dequant_2x2_dc( int16_t dct[2][2], int16_t dct4x4[4][4][4], int dequant_mf[6][4][4], int i_qp )
{
- int d0 = dct[0][0] + dct[0][1];
- int d1 = dct[1][0] + dct[1][1];
- int d2 = dct[0][0] - dct[0][1];
- int d3 = dct[1][0] - dct[1][1];
- int dmf = dequant_mf[i_qp%6][0][0];
- int qbits = i_qp/6 - 5;
- if( qbits > 0 )
- {
- dmf <<= qbits;
- qbits = 0;
- }
+ IDCT_DEQUANT_START
dct4x4[0][0][0] = (d0 + d1) * dmf >> -qbits;
dct4x4[1][0][0] = (d0 - d1) * dmf >> -qbits;
dct4x4[2][0][0] = (d2 + d3) * dmf >> -qbits;
dct4x4[3][0][0] = (d2 - d3) * dmf >> -qbits;
}
+static inline void idct_dequant_2x2_dconly( int16_t dct[2][2], int dequant_mf[6][4][4], int i_qp )
+{
+ IDCT_DEQUANT_START
+ dct[0][0] = (d0 + d1) * dmf >> -qbits;
+ dct[0][1] = (d0 - d1) * dmf >> -qbits;
+ dct[1][0] = (d2 + d3) * dmf >> -qbits;
+ dct[1][1] = (d2 - d3) * dmf >> -qbits;
+}
+
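Both helpers are the 2x2 inverse Hadamard plus scaling: for input [[a,b],[c,d]] the outputs are (a+b+c+d), (a+b-c-d), (a-b+c-d) and (a-b-c+d), each multiplied by dmf and shifted. The _dconly variant writes the results back into the 2x2 array itself, which is exactly what add8x8_idct_dc consumes in the DC-only path below.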
static inline void dct2x2dc( int16_t d[2][2], int16_t dct4x4[4][4][4] )
{
int d0 = dct4x4[0][0][0] + dct4x4[1][0][0];
void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
{
- int i, ch;
+ int i, ch, nz;
int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate);
+ h->mb.i_cbp_chroma = 0;
for( ch = 0; ch < 2; ch++ )
{
h->zigzagf.sub_4x4( h->dct.luma4x4[16+i+ch*4], p_src+oe, p_dst+od );
h->dct.chroma_dc[ch][i] = h->dct.luma4x4[16+i+ch*4][0];
h->dct.luma4x4[16+i+ch*4][0] = 0;
+ nz = array_non_zero( h->dct.luma4x4[16+i+ch*4] );
+ h->mb.cache.non_zero_count[x264_scan8[16+i+ch*4]] = nz;
+ h->mb.i_cbp_chroma |= nz;
}
+ h->mb.cache.non_zero_count[x264_scan8[25]+ch] = array_non_zero( h->dct.chroma_dc[ch] );
continue;
}
if( b_decimate && i_decimate_score < 7 )
{
- /* Near null chroma 8x8 block so make it null (bits saving) */
- memset( &h->dct.luma4x4[16+ch*4], 0, 4 * sizeof( *h->dct.luma4x4 ) );
- if( !array_non_zero( dct2x2 ) )
+ /* Decimate the block */
+ h->mb.cache.non_zero_count[x264_scan8[16+0]+24*ch] = 0;
+ h->mb.cache.non_zero_count[x264_scan8[16+1]+24*ch] = 0;
+ h->mb.cache.non_zero_count[x264_scan8[16+2]+24*ch] = 0;
+ h->mb.cache.non_zero_count[x264_scan8[16+3]+24*ch] = 0;
+ if( !array_non_zero( dct2x2 ) ) /* Whole block is empty */
{
- memset( h->dct.chroma_dc[ch], 0, sizeof( h->dct.chroma_dc[ch] ) );
+ h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 0;
continue;
}
- memset( dct4x4, 0, sizeof( dct4x4 ) );
+ /* DC-only */
+ h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 1;
+ zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
+ idct_dequant_2x2_dconly( dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+ h->dctf.add8x8_idct_dc( p_dst, dct2x2 );
}
else
{
for( i = 0; i < 4; i++ )
- h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+ {
+ nz = array_non_zero( h->dct.luma4x4[16+ch*4+i] );
+ h->mb.cache.non_zero_count[x264_scan8[16+ch*4+i]] = nz;
+ h->mb.i_cbp_chroma |= nz;
+ if( nz )
+ h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+ }
+ /* Don't optimize for the AC-only case--it's very rare */
+ h->mb.cache.non_zero_count[x264_scan8[25]+ch] = array_non_zero( dct2x2 );
+ zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
+ idct_dequant_2x2_dc( dct2x2, dct4x4, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+ h->dctf.add8x8_idct( p_dst, dct4x4 );
}
-
- zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
- idct_dequant_2x2_dc( dct2x2, dct4x4, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
- h->dctf.add8x8_idct( p_dst, dct4x4 );
}
- /* coded block pattern */
- h->mb.i_cbp_chroma = 0;
- for( i = 0; i < 8; i++ )
- {
- int nz = array_non_zero( h->dct.luma4x4[16+i] );
- h->mb.cache.non_zero_count[x264_scan8[16+i]] = nz;
- h->mb.i_cbp_chroma |= nz;
- }
- h->mb.cache.non_zero_count[x264_scan8[25]] = array_non_zero( h->dct.chroma_dc[0] );
- h->mb.cache.non_zero_count[x264_scan8[26]] = array_non_zero( h->dct.chroma_dc[1] );
if( h->mb.i_cbp_chroma )
h->mb.i_cbp_chroma = 2; /* dc+ac (we can't do only ac) */
else if( h->mb.cache.non_zero_count[x264_scan8[25]] |
ok = 1; used_asm = 0;
TEST_IDCT( add4x4_idct, dct4 );
TEST_IDCT( add8x8_idct, dct4 );
+ TEST_IDCT( add8x8_idct_dc, dct4 );
TEST_IDCT( add16x16_idct, dct4 );
report( "add_idct4 :" );