Keep DCT coefficients in transposed order. ~1% overall speedup.

author Loren Merritt <pengvado@videolan.org>

Thu, 9 Mar 2006 05:30:08 +0000 (05:30 +0000)

committer Loren Merritt <pengvado@videolan.org>

Thu, 9 Mar 2006 05:30:08 +0000 (05:30 +0000)
author Loren Merritt <pengvado@videolan.org>
Thu, 9 Mar 2006 05:30:08 +0000 (05:30 +0000)
committer Loren Merritt <pengvado@videolan.org>
Thu, 9 Mar 2006 05:30:08 +0000 (05:30 +0000)
diff --git a/common/amd64/dct-a.asm b/common/amd64/dct-a.asm

index a4b12f71e32906027f602adde6fbdcd7c46afc09..1bb352faae54671f3426844fe777feabc40bd274 100644 (file)
--- a/common/amd64/dct-a.asm
+++ b/common/amd64/dct-a.asm
@@ -177,21 +177,19 @@ x264_dct4x4dc_mmxext:
      MMX_SUMSUB_BADC     mm2, mm3, mm0, mm4          ; mm2=s01  mm3=d01  mm0=s23  mm4=d23
      MMX_SUMSUB_BADC     mm0, mm2, mm4, mm3          ; mm0=s01+s23  mm2=s01-s23  mm4=d01+d23  mm3=d01-d23
  
-    MMX_TRANSPOSE       mm0, mm2, mm3, mm4, mm1     ; in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3
-
      movq    mm6,        [pw_1 GLOBAL]
      paddw   mm0,        mm6
-    paddw   mm4,        mm6
+    paddw   mm2,        mm6
      psraw   mm0,        1
      movq    [parm1q+ 0],mm0
-    psraw   mm4,        1
-    movq    [parm1q+ 8],mm4
-    paddw   mm1,        mm6
+    psraw   mm2,        1
+    movq    [parm1q+ 8],mm2
      paddw   mm3,        mm6
-    psraw   mm1,        1
-    movq    [parm1q+16],mm1
+    paddw   mm4,        mm6
      psraw   mm3,        1
-    movq    [parm1q+24],mm3
+    movq    [parm1q+16],mm3
+    psraw   mm4,        1
+    movq    [parm1q+24],mm4
      ret
  
  cglobal x264_idct4x4dc_mmxext
@@ -214,12 +212,10 @@ x264_idct4x4dc_mmxext:
      MMX_SUMSUB_BADC     mm2, mm3, mm0, mm4          ; mm2=s01  mm3=d01  mm0=s23  mm4=d23
      MMX_SUMSUB_BADC     mm0, mm2, mm4, mm3          ; mm0=s01+s23  mm2=s01-s23  mm4=d01+d23  mm3=d01-d23
  
-    MMX_TRANSPOSE       mm0, mm2, mm3, mm4, mm1     ; in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3
-
      movq    [parm1q+ 0], mm0
-    movq    [parm1q+ 8], mm4
-    movq    [parm1q+16], mm1
-    movq    [parm1q+24], mm3
+    movq    [parm1q+ 8], mm2
+    movq    [parm1q+16], mm3
+    movq    [parm1q+24], mm4
      ret
  
  cglobal x264_sub4x4_dct_mmxext
@@ -267,13 +263,10 @@ x264_sub4x4_dct_mmxext:
      MMX_SUMSUB_BA       mm1, mm3                    ; mm1=s03+s12      mm3=s03-s12
      MMX_SUMSUB2_AB      mm2, mm4, mm0               ; mm2=2.d03+d12    mm0=d03-2.d12
  
-    ; transpose in: mm1, mm2, mm3, mm0, out: mm1, mm0, mm4, mm3
-    MMX_TRANSPOSE       mm1, mm2, mm3, mm0, mm4
-
      movq    [r10+ 0],   mm1 ; dct
-    movq    [r10+ 8],   mm0
-    movq    [r10+16],   mm4
-    movq    [r10+24],   mm3
+    movq    [r10+ 8],   mm2
+    movq    [r10+16],   mm3
+    movq    [r10+24],   mm0
  
      pop     rbx
      ret
@@ -288,17 +281,14 @@ ALIGN 16
  x264_add4x4_idct_mmxext:
      ; Load dct coeffs
      movq    mm0, [parm3q+ 0] ; dct
-    movq    mm4, [parm3q+ 8]
-    movq    mm3, [parm3q+16]
-    movq    mm1, [parm3q+24]
+    movq    mm1, [parm3q+ 8]
+    movq    mm2, [parm3q+16]
+    movq    mm3, [parm3q+24]
      
      mov     rax, parm1q      ; p_dst
      movsxd  rcx, parm2d      ; i_dst
      lea     rdx, [rcx+rcx*2]
  
-    ; out:mm0, mm1, mm2, mm3
-    MMX_TRANSPOSE       mm0, mm4, mm3, mm1, mm2
-
      MMX_SUMSUB_BA       mm2, mm0                        ; mm2=s02  mm0=d02
      MMX_SUMSUBD2_AB     mm1, mm3, mm5, mm4              ; mm1=s13  mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
  
@@ -408,19 +398,18 @@ x264_sub8x8_dct8_sse2:
      MMX_LOAD_DIFF_8P  xmm6, xmm8, xmm9, [rsi+r9   ], [rcx+r10]
      MMX_LOAD_DIFF_8P  xmm7, xmm8, xmm9, [rsi+rdx*4], [rcx+r8*4]
  
-    SSE2_TRANSPOSE8x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
-    DCT8_1D           xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1, xmm6, xmm9
-    SSE2_TRANSPOSE8x8 xmm4, xmm5, xmm7, xmm2, xmm8, xmm3, xmm1, xmm6, xmm0
-    DCT8_1D           xmm4, xmm3, xmm6, xmm2, xmm0, xmm8, xmm7, xmm5, xmm1, xmm9
+    DCT8_1D           xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9
+    SSE2_TRANSPOSE8x8 xmm5, xmm1, xmm2, xmm6, xmm4, xmm3, xmm7, xmm8, xmm0
+    DCT8_1D           xmm5, xmm3, xmm8, xmm6, xmm0, xmm4, xmm2, xmm1, xmm7, xmm9
  
-    movdqa  [rdi+0x00], xmm8
+    movdqa  [rdi+0x00], xmm4
      movdqa  [rdi+0x10], xmm3
-    movdqa  [rdi+0x20], xmm6
-    movdqa  [rdi+0x30], xmm7
+    movdqa  [rdi+0x20], xmm8
+    movdqa  [rdi+0x30], xmm2
      movdqa  [rdi+0x40], xmm0
-    movdqa  [rdi+0x50], xmm2
-    movdqa  [rdi+0x60], xmm5
-    movdqa  [rdi+0x70], xmm1
+    movdqa  [rdi+0x50], xmm6
+    movdqa  [rdi+0x60], xmm1
+    movdqa  [rdi+0x70], xmm7
  
      ret
  
@@ -494,22 +483,21 @@ x264_add8x8_idct8_sse2:
      movdqa  xmm6, [rdx+0x60]
      movdqa  xmm7, [rdx+0x70]
  
-    SSE2_TRANSPOSE8x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
-    IDCT8_1D          xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1, xmm9, xmm6
-    SSE2_TRANSPOSE8x8 xmm9, xmm5, xmm1, xmm3, xmm8, xmm0, xmm7, xmm2, xmm4
+    IDCT8_1D          xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm9, xmm8
+    SSE2_TRANSPOSE8x8 xmm9, xmm1, xmm7, xmm3, xmm4, xmm0, xmm2, xmm6, xmm5
      paddw             xmm9, [pw_32 GLOBAL] ; rounding for the >>6 at the end
-    IDCT8_1D          xmm9, xmm0, xmm2, xmm3, xmm4, xmm8, xmm1, xmm5, xmm6, xmm7
+    IDCT8_1D          xmm9, xmm0, xmm6, xmm3, xmm5, xmm4, xmm7, xmm1, xmm8, xmm2
   
      MMX_ZERO  xmm15
-    MMX_STORE_DIFF_8P   xmm6, xmm14, xmm15, [rdi]
+    MMX_STORE_DIFF_8P   xmm8, xmm14, xmm15, [rdi]
      MMX_STORE_DIFF_8P   xmm0, xmm14, xmm15, [rdi+rsi]
-    MMX_STORE_DIFF_8P   xmm5, xmm14, xmm15, [rdi+rsi*2]
+    MMX_STORE_DIFF_8P   xmm1, xmm14, xmm15, [rdi+rsi*2]
      lea  rax, [rsi+rsi*2]
      add  rdi, rax
      MMX_STORE_DIFF_8P   xmm3, xmm14, xmm15, [rdi]
-    MMX_STORE_DIFF_8P   xmm4, xmm14, xmm15, [rdi+rsi]
+    MMX_STORE_DIFF_8P   xmm5, xmm14, xmm15, [rdi+rsi]
      MMX_STORE_DIFF_8P   xmm9, xmm14, xmm15, [rdi+rsi*2]
-    MMX_STORE_DIFF_8P   xmm2, xmm14, xmm15, [rdi+rax]
-    MMX_STORE_DIFF_8P   xmm1, xmm14, xmm15, [rdi+rsi*4]
+    MMX_STORE_DIFF_8P   xmm6, xmm14, xmm15, [rdi+rax]
+    MMX_STORE_DIFF_8P   xmm7, xmm14, xmm15, [rdi+rsi*4]
  
      ret
diff --git a/common/dct.c b/common/dct.c

index 42a2c9bd49955d02ee07b37d3353f1b6a7b51d49..6212e323c60bb98fb11d10ba6df9d33878aa056d 100644 (file)
--- a/common/dct.c
+++ b/common/dct.c
@@ -52,8 +52,8 @@ static void dct2x2dc( int16_t d[2][2] )
      tmp[1][1] = d[1][0] - d[1][1];
  
      d[0][0] = tmp[0][0] + tmp[0][1];
-    d[0][1] = tmp[1][0] + tmp[1][1];
-    d[1][0] = tmp[0][0] - tmp[0][1];
+    d[1][0] = tmp[1][0] + tmp[1][1];
+    d[0][1] = tmp[0][0] - tmp[0][1];
      d[1][1] = tmp[1][0] - tmp[1][1];
  }
  
@@ -84,10 +84,10 @@ static void dct4x4dc( int16_t d[4][4] )
          s23 = tmp[i][2] + tmp[i][3];
          d23 = tmp[i][2] - tmp[i][3];
  
-        d[0][i] = ( s01 + s23 + 1 ) >> 1;
-        d[1][i] = ( s01 - s23 + 1 ) >> 1;
-        d[2][i] = ( d01 - d23 + 1 ) >> 1;
-        d[3][i] = ( d01 + d23 + 1 ) >> 1;
+        d[i][0] = ( s01 + s23 + 1 ) >> 1;
+        d[i][1] = ( s01 - s23 + 1 ) >> 1;
+        d[i][2] = ( d01 - d23 + 1 ) >> 1;
+        d[i][3] = ( d01 + d23 + 1 ) >> 1;
      }
  }
  
@@ -100,10 +100,10 @@ static void idct4x4dc( int16_t d[4][4] )
  
      for( i = 0; i < 4; i++ )
      {
-        s01 = d[0][i] + d[1][i];
-        d01 = d[0][i] - d[1][i];
-        s23 = d[2][i] + d[3][i];
-        d23 = d[2][i] - d[3][i];
+        s01 = d[i][0] + d[i][1];
+        d01 = d[i][0] - d[i][1];
+        s23 = d[i][2] + d[i][3];
+        d23 = d[i][2] - d[i][3];
  
          tmp[0][i] = s01 + s23;
          tmp[1][i] = s01 - s23;
@@ -168,10 +168,10 @@ static void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *p
          const int d03 = tmp[i][0] - tmp[i][3];
          const int d12 = tmp[i][1] - tmp[i][2];
  
-        dct[0][i] =   s03 +   s12;
-        dct[1][i] = 2*d03 +   d12;
-        dct[2][i] =   s03 -   s12;
-        dct[3][i] =   d03 - 2*d12;
+        dct[i][0] =   s03 +   s12;
+        dct[i][1] = 2*d03 +   d12;
+        dct[i][2] =   s03 -   s12;
+        dct[i][3] =   d03 - 2*d12;
      }
  }
  
@@ -201,10 +201,10 @@ static void add4x4_idct( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
  
      for( i = 0; i < 4; i++ )
      {
-        const int s02 =  dct[i][0]     +  dct[i][2];
-        const int d02 =  dct[i][0]     -  dct[i][2];
-        const int s13 =  dct[i][1]     + (dct[i][3]>>1);
-        const int d13 = (dct[i][1]>>1) -  dct[i][3];
+        const int s02 =  dct[0][i]     +  dct[2][i];
+        const int d02 =  dct[0][i]     -  dct[2][i];
+        const int s13 =  dct[1][i]     + (dct[3][i]>>1);
+        const int d13 = (dct[1][i]>>1) -  dct[3][i];
  
          tmp[i][0] = s02 + s13;
          tmp[i][1] = d02 + d13;
@@ -217,7 +217,7 @@ static void add4x4_idct( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
          const int s02 =  tmp[0][i]     +  tmp[2][i];
          const int d02 =  tmp[0][i]     -  tmp[2][i];
          const int s13 =  tmp[1][i]     + (tmp[3][i]>>1);
-        const int d13 = (tmp[1][i]>>1) -   tmp[3][i];
+        const int d13 = (tmp[1][i]>>1) -  tmp[3][i];
  
          d[0][i] = ( s02 + s13 + 32 ) >> 6;
          d[1][i] = ( d02 + d13 + 32 ) >> 6;
@@ -273,31 +273,36 @@ static void add16x16_idct( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] )
      const int a5 = d07 - d34 - (d25 + (d25>>1));\
      const int a6 = d07 + d34 - (d16 + (d16>>1));\
      const int a7 = d16 - d25 + (d34 + (d34>>1));\
-    SRC(0) =  a0 + a1     ;\
-    SRC(1) =  a4 + (a7>>2);\
-    SRC(2) =  a2 + (a3>>1);\
-    SRC(3) =  a5 + (a6>>2);\
-    SRC(4) =  a0 - a1     ;\
-    SRC(5) =  a6 - (a5>>2);\
-    SRC(6) = (a2>>1) - a3 ;\
-    SRC(7) = (a4>>2) - a7 ;\
+    DST(0) =  a0 + a1     ;\
+    DST(1) =  a4 + (a7>>2);\
+    DST(2) =  a2 + (a3>>1);\
+    DST(3) =  a5 + (a6>>2);\
+    DST(4) =  a0 - a1     ;\
+    DST(5) =  a6 - (a5>>2);\
+    DST(6) = (a2>>1) - a3 ;\
+    DST(7) = (a4>>2) - a7 ;\
  }
  
  static void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
  {
      int i;
+    int16_t tmp[8][8];
  
-    pixel_sub_wxh( (int16_t*)dct, 8, pix1, i_pix1, pix2, i_pix2 );
+    pixel_sub_wxh( (int16_t*)tmp, 8, pix1, i_pix1, pix2, i_pix2 );
  
-#define SRC(x) dct[x][i]
+#define SRC(x) tmp[x][i]
+#define DST(x) tmp[x][i]
      for( i = 0; i < 8; i++ )
          DCT8_1D
  #undef SRC
+#undef DST
  
-#define SRC(x) dct[i][x]
+#define SRC(x) tmp[i][x]
+#define DST(x) dct[x][i]
      for( i = 0; i < 8; i++ )
          DCT8_1D
  #undef SRC
+#undef DST
  }
  
  static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
@@ -341,14 +346,14 @@ static void add8x8_idct8( uint8_t *dst, int i_dst, int16_t dct[8][8] )
  
      dct[0][0] += 32; // rounding for the >>6 at the end
  
-#define SRC(x)     dct[i][x]
-#define DST(x,rhs) dct[i][x] = (rhs)
+#define SRC(x)     dct[x][i]
+#define DST(x,rhs) dct[x][i] = (rhs)
      for( i = 0; i < 8; i++ )
          IDCT8_1D
  #undef SRC
  #undef DST
  
-#define SRC(x)     dct[x][i]
+#define SRC(x)     dct[i][x]
  #define DST(x,rhs) dst[i + x*i_dst] = clip_uint8( dst[i + x*i_dst] + ((rhs) >> 6) );
      for( i = 0; i < 8; i++ )
          IDCT8_1D
@@ -404,16 +409,19 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
  
          dctf->dct4x4dc  = x264_dct4x4dc_mmxext;
          dctf->idct4x4dc = x264_idct4x4dc_mmxext;
+    }
  
  #ifndef ARCH_X86_64
-        dctf->sub8x8_dct8   = x264_sub8x8_dct8_mmxext;
-        dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmxext;
+    if( cpu&X264_CPU_MMX )
+    {
+        dctf->sub8x8_dct8   = x264_sub8x8_dct8_mmx;
+        dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
  
-        dctf->add8x8_idct8  = x264_add8x8_idct8_mmxext;
-        dctf->add16x16_idct8= x264_add16x16_idct8_mmxext;
-#endif
+        dctf->add8x8_idct8  = x264_add8x8_idct8_mmx;
+        dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
      }
  #endif
+#endif
  
  #if defined(HAVE_SSE2) && defined(ARCH_X86_64)
      if( cpu&X264_CPU_SSE2 )
@@ -425,7 +433,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
          dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
      }
  #endif
-
+/* FIXME: the AltiVec DCT does not produce transposed coefficients yet, so it is disabled for now
  #ifdef ARCH_PPC
      if( cpu&X264_CPU_ALTIVEC )
      {
@@ -434,5 +442,6 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
          dctf->sub16x16_dct  = x264_sub16x16_dct_altivec;
      }
  #endif
+*/
  }
  
diff --git a/common/i386/dct-a.asm b/common/i386/dct-a.asm

index 801ba764316bbb1014f052d2a4452d34523bd26c..504e1336aa4fa8229fed6dc2f4047c33770cc87c 100644 (file)
--- a/common/i386/dct-a.asm
+++ b/common/i386/dct-a.asm
@@ -167,21 +167,19 @@ x264_dct4x4dc_mmxext:
      MMX_SUMSUB_BADC     mm2, mm3, mm0, mm4          ; mm2=s01  mm3=d01  mm0=s23  mm4=d23
      MMX_SUMSUB_BADC     mm0, mm2, mm4, mm3          ; mm0=s01+s23  mm2=s01-s23  mm4=d01+d23  mm3=d01-d23
  
-    MMX_TRANSPOSE       mm0, mm2, mm3, mm4, mm1     ; in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3
-
      movq    mm6,        [x264_mmx_1 GOT_ebx]
      paddw   mm0,        mm6
-    paddw   mm4,        mm6
+    paddw   mm2,        mm6
      psraw   mm0,        1
      movq    [eax+ 0],   mm0
-    psraw   mm4,        1
-    movq    [eax+ 8],   mm4
-    paddw   mm1,        mm6
+    psraw   mm2,        1
+    movq    [eax+ 8],   mm2
      paddw   mm3,        mm6
-    psraw   mm1,        1
-    movq    [eax+16],   mm1
+    paddw   mm4,        mm6
      psraw   mm3,        1
-    movq    [eax+24],   mm3
+    movq    [eax+16],   mm3
+    psraw   mm4,        1
+    movq    [eax+24],   mm4
      picpop  ebx
      ret
  
@@ -206,12 +204,10 @@ x264_idct4x4dc_mmxext:
      MMX_SUMSUB_BADC     mm2, mm3, mm0, mm4          ; mm2=s01  mm3=d01  mm0=s23  mm4=d23
      MMX_SUMSUB_BADC     mm0, mm2, mm4, mm3          ; mm0=s01+s23  mm2=s01-s23  mm4=d01+d23  mm3=d01-d23
  
-    MMX_TRANSPOSE       mm0, mm2, mm3, mm4, mm1     ; in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3
-
      movq    [eax+ 0],   mm0
-    movq    [eax+ 8],   mm4
-    movq    [eax+16],   mm1
-    movq    [eax+24],   mm3
+    movq    [eax+ 8],   mm2
+    movq    [eax+16],   mm3
+    movq    [eax+24],   mm4
      ret
  
  cglobal x264_sub4x4_dct_mmxext
@@ -250,14 +246,11 @@ x264_sub4x4_dct_mmxext:
      MMX_SUMSUB_BA       mm1, mm3                    ; mm1=s03+s12      mm3=s03-s12
      MMX_SUMSUB2_AB      mm2, mm4, mm0               ; mm2=2.d03+d12    mm0=d03-2.d12
  
-    ; transpose in: mm1, mm2, mm3, mm0, out: mm1, mm0, mm4, mm3
-    MMX_TRANSPOSE       mm1, mm2, mm3, mm0, mm4
-
      mov     eax, [esp+ 8]   ; dct
      movq    [eax+ 0],   mm1
-    movq    [eax+ 8],   mm0
-    movq    [eax+16],   mm4
-    movq    [eax+24],   mm3
+    movq    [eax+ 8],   mm2
+    movq    [eax+16],   mm3
+    movq    [eax+24],   mm0
  
      pop     ebx
      ret
@@ -272,9 +265,9 @@ x264_add4x4_idct_mmxext:
      ; Load dct coeffs
      mov     eax, [esp+12]   ; dct
      movq    mm0, [eax+ 0]
-    movq    mm4, [eax+ 8]
-    movq    mm3, [eax+16]
-    movq    mm1, [eax+24]
+    movq    mm1, [eax+ 8]
+    movq    mm2, [eax+16]
+    movq    mm3, [eax+24]
      
      mov     eax, [esp+ 4]   ; p_dst
      mov     ecx, [esp+ 8]   ; i_dst
@@ -283,9 +276,6 @@ x264_add4x4_idct_mmxext:
      picpush ebx
      picgetgot ebx
  
-    ; out:mm0, mm1, mm2, mm3
-    MMX_TRANSPOSE       mm0, mm4, mm3, mm1, mm2
-
      MMX_SUMSUB_BA       mm2, mm0                        ; mm2=s02  mm0=d02
      MMX_SUMSUBD2_AB     mm1, mm3, mm5, mm4              ; mm1=s13  mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
  
@@ -338,24 +328,11 @@ x264_add4x4_idct_mmxext:
      MMX_SUMSUB_BA   %1, %2
  %endmacro
  
-%macro MMX_STORE_DIFF_8P 6
-    movq            %1, %3
-    movq            %2, %1
-    punpcklbw       %1, %6
-    punpckhbw       %2, %6
-    paddw           %1, %4
-    paddw           %2, %5
-    packuswb        %1, %2
-    movq            %3, %1
-%endmacro
-
  cglobal x264_pixel_sub_8x8_mmx
-cglobal x264_xdct8_mmxext
+cglobal x264_pixel_add_8x8_mmx
+cglobal x264_transpose_8x8_mmx
  cglobal x264_ydct8_mmx
-
-cglobal x264_xidct8_mmxext
  cglobal x264_yidct8_mmx
-cglobal x264_pixel_add_8x8_mmx
  
  ALIGN 16
  ;-----------------------------------------------------------------------------
@@ -387,78 +364,6 @@ x264_pixel_sub_8x8_mmx:
      pop         ebx
      ret
  
-ALIGN 16
-;-----------------------------------------------------------------------------
-;   void __cdecl x264_xdct8_mmxext( int16_t dest[8][8] );
-;-----------------------------------------------------------------------------
-x264_xdct8_mmxext:
-    mov         eax, [esp+04]           ; dest
-
-    picpush     ebx
-    picgetgot   ebx
-
-    movq        mm5, [x264_mmx_PPNN GOT_ebx]
-    movq        mm6, [x264_mmx_PNNP GOT_ebx]
-    movq        mm4, [x264_mmx_PPPN GOT_ebx]
-    movq        mm7, [x264_mmx_PPNP GOT_ebx]
-
-    ;-------------------------------------------------------------------------
-    ; horizontal dct ( compute 1 row at a time -> 8 loops )
-    ;-------------------------------------------------------------------------
-
-    %assign disp 0
-    %rep 8
-    
-    movq        mm0, [eax+disp]
-    movq        mm1, [eax+disp+8]
-
-    pshufw      mm2, mm1, 00011011b
-    movq        mm1, mm0
-    paddw       mm0, mm2                ; (low)s07/s16/d25/s34(high)
-    psubw       mm1, mm2                ; (low)d07/d16/d25/d34(high)
-
-    pshufw      mm2, mm0, 00011011b     ; (low)s34/s25/s16/s07(high)
-    pmullw      mm0, mm5                ; (low)s07/s16/-s25/-s34(high)
-    paddw       mm0, mm2                ; (low)a0/a1/a3/a2(high)
-
-    movq        mm3, mm1
-    psraw       mm1, 1                  ; (low)d07/d16/d25/d34(high) (x>>1)
-    pshufw      mm2, mm3, 10110001b     ; (low)d16/d07/d34/d25(high)
-    paddw       mm1, mm3                ; (low)d07/d16/d25/d34(high) (x+(x>>1))
-    pshufw      mm3, mm2, 00011011b     ; (low)d25/d34/d07/d16(high)
-    pmullw      mm2, mm5                ; (low)d16/d07/-d34/-d25(high)
-    pmullw      mm1, mm6                ; (low)d07/-d16/-d25/d34(high) (x+(x>>1))
-    paddw       mm3, mm2
-    paddw       mm1, mm3                ; (low)a4/a6/a5/a7(high)
-
-
-    pshufw      mm2, mm0, 11001001b     ; (low)a1/a3/a0/a2(high)
-    pshufw      mm0, mm0, 10011100b     ; (low)a0/a2/a1/a3(high)
-    pmullw      mm2, [x264_mmx_2121 GOT_ebx]
-    pmullw      mm0, mm5                ; (low)a0/a2/-a1/-a3(high)
-    psraw       mm2, 1                  ; (low)a1/a3>>1/a0/a2>>1(high)
-    paddw       mm0, mm2                ; (low)dst0/dst2/dst4/dst6(high)
-
-    pshufw      mm1, mm1, 00100111b     ; (low)a7/a6/a5/a4(high)
-    pshufw      mm2, mm1, 00011011b     ; (low)a4/a5/a6/a7(high)
-    psraw       mm1, 2                  ; (low)a7>>2/a6>>2/a5>>2/a4>>2(high)
-    pmullw      mm2, mm4                ; (low)a4/a5/a6/-a7(high)
-    pmullw      mm1, mm7                ; (low)a7>>2/a6>>2/-a5>>2/a4>>2(high)
-    paddw       mm1, mm2                ; (low)dst1/dst3/dst5/dst7(high)
-
-    movq        mm2, mm0
-    punpcklwd   mm0, mm1                ; (low)dst0/dst1/dst2/dst3(high)
-    punpckhwd   mm2, mm1                ; (low)dst4/dst5/dst6/dst7(high)
-
-    movq        [eax+disp], mm0
-    movq        [eax+disp+8], mm2
-
-    %assign disp disp+16
-    %endrep
-
-    picpop      ebx
-    ret
-
  ALIGN 16
  ;-----------------------------------------------------------------------------
  ;   void __cdecl x264_ydct8_mmx( int16_t dest[8][8] );
@@ -544,73 +449,6 @@ x264_ydct8_mmx:
  
      ret
  
-ALIGN 16
-;-----------------------------------------------------------------------------
-;   void __cdecl x264_xidct8_mmxext( int16_t dest[8][8] );
-;-----------------------------------------------------------------------------
-x264_xidct8_mmxext:
-    mov         eax, [esp+04]           ; dest
-
-    picpush     ebx
-    picgetgot   ebx
-
-    movq        mm4, [x264_mmx_PPNN GOT_ebx]
-    movq        mm5, [x264_mmx_PNPN GOT_ebx]
-    movq        mm6, [x264_mmx_PPNP GOT_ebx]
-    movq        mm7, [x264_mmx_PPPN GOT_ebx]
-
-    ;-------------------------------------------------------------------------
-    ; horizontal idct ( compute 1 row at a time -> 8 loops )
-    ;-------------------------------------------------------------------------
-
-    %assign disp 0
-    %rep 8
-
-    pshufw      mm0, [eax+disp], 11011000b      ; (low)d0,d2,d1,d3(high)
-    pshufw      mm2, [eax+disp+8], 11011000b    ; (low)d4,d6,d5,d7(high)
-    movq        mm1, mm0
-    punpcklwd   mm0, mm2                ; (low)d0,d4,d2,d6(high)
-    punpckhwd   mm1, mm2                ; (low)d1,d5,d3,d7(high)
-
-    pshufw      mm2, mm0, 10110001b     ; (low)d4,d0,d6,d2(high)
-    pmullw      mm0, [x264_mmx_p2n2p1p1 GOT_ebx]; (low)2*d0,-2*d4,d2,d6(high)
-    pmullw      mm2, mm6                ; (low)d4,d0,-d6,d2(high)
-    psraw       mm0, 1                  ; (low)d0,-d4,d2>>1,d6>>1(high)
-    paddw       mm0, mm2                ; (low)e0,e2,e4,e6(high)
-
-    movq        mm3, mm1                ; (low)d1,d5,d3,d7(high)
-    psraw       mm1, 1                  ; (low)d1>>1,d5>>1,d3>>1,d7>>1(high)
-    pshufw      mm2, mm3, 10110001b     ; (low)d5,d1,d7,d3(high)
-    paddw       mm1, mm3                ; (low)d1+(d1>>1),d5+(d5>>1),d3+(d3>>1),d7+(d7>>1)(high)
-    pshufw      mm3, mm2, 00011011b     ; (low)d3,d7,d1,d5(high)
-    pmullw      mm1, mm4                ; (low)d1+(d1>>1),d5+(d5>>1),-d3-(d3>>1),-d7-(d7>>1)(high)
-    pmullw      mm2, mm5                ; (low)d5,-d1,d7,-d3(high)
-    paddw       mm1, mm3
-    paddw       mm1, mm2                ; (low)e7,e5,e3,e1(high)
-
-    pshufw      mm2, mm0, 00011011b     ; (low)e6,e4,e2,e0(high)
-    pmullw      mm0, mm4                ; (low)e0,e2,-e4,-e6(high)
-    pshufw      mm3, mm1, 00011011b     ; (low)e1,e3,e5,e7(high)
-    psraw       mm1, 2                  ; (low)e7>>2,e5>>2,e3>>2,e1>>2(high)
-    pmullw      mm3, mm6                ; (low)e1,e3,-e5,e7(high)
-    pmullw      mm1, mm7                ; (low)e7>>2,e5>>2,e3>>2,-e1>>2(high)
-    paddw       mm0, mm2                ; (low)f0,f2,f4,f6(high)
-    paddw       mm1, mm3                ; (low)f1,f3,f5,f7(high)
-
-    pshufw      mm3, mm0, 00011011b     ; (low)f6,f4,f2,f0(high)
-    pshufw      mm2, mm1, 00011011b     ; (low)f7,f5,f3,f1(high)
-    psubw       mm3, mm1
-    paddw       mm0, mm2
-
-    movq        [eax+disp], mm0
-    movq        [eax+disp+8], mm3
-
-    %assign disp disp+16
-    %endrep
-
-    picpop      ebx
-    ret
-
  ALIGN 16
  ;-----------------------------------------------------------------------------
  ;   void __cdecl x264_yidct8_mmx( int16_t dest[8][8] );
@@ -691,15 +529,6 @@ x264_yidct8_mmx:
      MMX_SUMSUB_BA   mm3, mm2                ; mm3 = g2, mm2 = g5
      MMX_SUMSUB_BA   mm1, mm0                ; mm1 = g3, mm0 = g4
  
-    psraw       mm7, 6
-    psraw       mm6, 6
-    psraw       mm5, 6
-    psraw       mm4, 6
-    psraw       mm3, 6
-    psraw       mm2, 6
-    psraw       mm1, 6
-    psraw       mm0, 6
-
      movq        [eax+disp+0*16], mm7
      movq        [eax+disp+1*16], mm5
      movq        [eax+disp+2*16], mm3
@@ -716,7 +545,7 @@ x264_yidct8_mmx:
  
  ALIGN 16
  ;-----------------------------------------------------------------------------
-;   void __cdecl x264_pixel_add_8x8_mmx( unit8_t *dst, int i_dst, int16_t src[8][8] );
+;   void __cdecl x264_pixel_add_8x8_mmx( uint8_t *dst, int i_dst, int16_t src[8][8] );
  ;-----------------------------------------------------------------------------
  x264_pixel_add_8x8_mmx:
      mov         eax, [esp+04]       ; dst
@@ -727,9 +556,69 @@ x264_pixel_add_8x8_mmx:
  
      %assign disp 0
      %rep 8
-    MMX_STORE_DIFF_8P   mm0, mm1, [eax], [edx+disp], [edx+disp+8], mm7
+    movq        mm0, [eax]
+    movq        mm2, [edx+disp]
+    movq        mm3, [edx+disp+8]
+    movq        mm1, mm0
+    psraw       mm2, 6
+    psraw       mm3, 6
+    punpcklbw   mm0, mm7
+    punpckhbw   mm1, mm7
+    paddw       mm0, mm2
+    paddw       mm1, mm3
+    packuswb    mm0, mm1
+    movq      [eax], mm0
      add         eax, ecx
      %assign disp disp+16
      %endrep
      ret
  
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_transpose_8x8_mmx( int16_t src[8][8] );
+;-----------------------------------------------------------------------------
+x264_transpose_8x8_mmx:
+    mov   eax, [esp+4]              ; eax = src
+    ; top-left 4x4 quadrant (rows 0-3, cols 0-3; one row = 16 bytes): transposed in place
+    movq  mm0, [eax    ]
+    movq  mm1, [eax+ 16]
+    movq  mm2, [eax+ 32]
+    movq  mm3, [eax+ 48]
+    MMX_TRANSPOSE  mm0, mm1, mm2, mm3, mm4 ; macro defined elsewhere; result rows are taken from mm0, mm3, mm4, mm2
+    movq  [eax    ], mm0
+    movq  [eax+ 16], mm3
+    movq  [eax+ 32], mm4
+    movq  [eax+ 48], mm2
+    ; bottom-right 4x4 quadrant (rows 4-7, cols 4-7): transposed in place
+    movq  mm0, [eax+ 72]
+    movq  mm1, [eax+ 88]
+    movq  mm2, [eax+104]
+    movq  mm3, [eax+120]
+    MMX_TRANSPOSE  mm0, mm1, mm2, mm3, mm4
+    movq  [eax+ 72], mm0
+    movq  [eax+ 88], mm3
+    movq  [eax+104], mm4
+    movq  [eax+120], mm2
+    ; off-diagonal quadrants: each is transposed and written to the other's position
+    movq  mm0, [eax+  8]            ; top-right quadrant (rows 0-3, cols 4-7)
+    movq  mm1, [eax+ 24]
+    movq  mm2, [eax+ 40]
+    movq  mm3, [eax+ 56]
+    MMX_TRANSPOSE  mm0, mm1, mm2, mm3, mm4
+    movq  mm1, [eax+ 64]            ; read the bottom-left quadrant (rows 4-7, cols 0-3) before it is overwritten
+    movq  mm5, [eax+ 80]
+    movq  mm6, [eax+ 96]
+    movq  mm7, [eax+112]
+    ; transposed top-right quadrant goes into the bottom-left position
+    movq  [eax+ 64], mm0
+    movq  [eax+ 80], mm3
+    movq  [eax+ 96], mm4
+    movq  [eax+112], mm2
+    MMX_TRANSPOSE  mm1, mm5, mm6, mm7, mm4 ; old bottom-left quadrant; result rows are taken from mm1, mm7, mm4, mm6
+    movq  [eax+  8], mm1            ; ...and is written into the top-right position
+    movq  [eax+ 24], mm7
+    movq  [eax+ 40], mm4
+    movq  [eax+ 56], mm6
+
+    ret
+
diff --git a/common/i386/dct-c.c b/common/i386/dct-c.c

index 5fc88f95c39870090d48bfaa5420a953126ad0de..3ca08b1384ba0cb10012c6af5fb2907003f35f7c 100644 (file)
--- a/common/i386/dct-c.c
+++ b/common/i386/dct-c.c
@@ -97,39 +97,40 @@ void x264_add16x16_idct8_sse2( uint8_t *p_dst, int i_dst, int16_t dct[4][8][8] )
  
  void x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
  void x264_pixel_add_8x8_mmx( uint8_t *pix, int i_pix, uint16_t *diff );
-void x264_xdct8_mmxext( int16_t dct[8][8] );
-void x264_xidct8_mmxext( int16_t dct[8][8] );
+void x264_transpose_8x8_mmx( int16_t src[8][8] );
  void x264_ydct8_mmx( int16_t dct[8][8] );
-void x264_yidct8_mmx( int16_t dct[8][8] );       // including >>6 at the end
+void x264_yidct8_mmx( int16_t dct[8][8] );
  
-inline void x264_sub8x8_dct8_mmxext( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+inline void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
  {
      x264_pixel_sub_8x8_mmx( (int16_t *)dct, pix1, i_pix1, pix2, i_pix2 );
      x264_ydct8_mmx( dct );
-    x264_xdct8_mmxext( dct );
+    x264_transpose_8x8_mmx( dct );
+    x264_ydct8_mmx( dct );
  }
  
-void x264_sub16x16_dct8_mmxext( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+void x264_sub16x16_dct8_mmx( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
  {
-    x264_sub8x8_dct8_mmxext( dct[0], pix1,            i_pix1, pix2,            i_pix2 );
-    x264_sub8x8_dct8_mmxext( dct[1], pix1+8,          i_pix1, pix2+8,          i_pix2 );
-    x264_sub8x8_dct8_mmxext( dct[2], pix1+8*i_pix1,   i_pix1, pix2+8*i_pix2,   i_pix2 );
-    x264_sub8x8_dct8_mmxext( dct[3], pix1+8*i_pix1+8, i_pix1, pix2+8*i_pix2+8, i_pix2 );
+    x264_sub8x8_dct8_mmx( dct[0], pix1,            i_pix1, pix2,            i_pix2 );
+    x264_sub8x8_dct8_mmx( dct[1], pix1+8,          i_pix1, pix2+8,          i_pix2 );
+    x264_sub8x8_dct8_mmx( dct[2], pix1+8*i_pix1,   i_pix1, pix2+8*i_pix2,   i_pix2 );
+    x264_sub8x8_dct8_mmx( dct[3], pix1+8*i_pix1+8, i_pix1, pix2+8*i_pix2+8, i_pix2 );
  }
  
-inline void x264_add8x8_idct8_mmxext( uint8_t *dst, int i_dst, int16_t dct[8][8] )
+inline void x264_add8x8_idct8_mmx( uint8_t *dst, int i_dst, int16_t dct[8][8] )
  {
      dct[0][0] += 32;
-    x264_xidct8_mmxext( dct );
      x264_yidct8_mmx( dct );
-    x264_pixel_add_8x8_mmx( dst, i_dst, (uint16_t *)dct );
+    x264_transpose_8x8_mmx( dct );
+    x264_yidct8_mmx( dct );
+    x264_pixel_add_8x8_mmx( dst, i_dst, (uint16_t *)dct ); // including >>6 at the end
  }
  
-void x264_add16x16_idct8_mmxext( uint8_t *dst, int i_dst, int16_t dct[4][8][8] )
+void x264_add16x16_idct8_mmx( uint8_t *dst, int i_dst, int16_t dct[4][8][8] )
  {
-    x264_add8x8_idct8_mmxext( dst,           i_dst, dct[0] );
-    x264_add8x8_idct8_mmxext( dst+8,         i_dst, dct[1] );
-    x264_add8x8_idct8_mmxext( dst+8*i_dst,   i_dst, dct[2] );
-    x264_add8x8_idct8_mmxext( dst+8*i_dst+8, i_dst, dct[3] );
+    x264_add8x8_idct8_mmx( dst,           i_dst, dct[0] );
+    x264_add8x8_idct8_mmx( dst+8,         i_dst, dct[1] );
+    x264_add8x8_idct8_mmx( dst+8*i_dst,   i_dst, dct[2] );
+    x264_add8x8_idct8_mmx( dst+8*i_dst+8, i_dst, dct[3] );
  }
  #endif
diff --git a/common/i386/dct.h b/common/i386/dct.h

index 74c78294f39d11d80a32e2dda8d536505a8a66d4..f89b23baad5c31931c804bbcbd3859a2ed2b1503 100644 (file)
--- a/common/i386/dct.h
+++ b/common/i386/dct.h
@@ -35,11 +35,11 @@ void x264_add16x16_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4]
  void x264_dct4x4dc_mmxext( int16_t d[4][4] );
  void x264_idct4x4dc_mmxext( int16_t d[4][4] );
  
-void x264_sub8x8_dct8_mmxext( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
-void x264_sub16x16_dct8_mmxext( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+void x264_sub16x16_dct8_mmx( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
  
-void x264_add8x8_idct8_mmxext( uint8_t *dst, int i_dst, int16_t dct[8][8] );
-void x264_add16x16_idct8_mmxext( uint8_t *dst, int i_dst, int16_t dct[4][8][8] );
+void x264_add8x8_idct8_mmx( uint8_t *dst, int i_dst, int16_t dct[8][8] );
+void x264_add16x16_idct8_mmx( uint8_t *dst, int i_dst, int16_t dct[4][8][8] );
  
  void x264_sub8x8_dct8_sse2( int16_t dct[8][8],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
  void x264_sub16x16_dct8_sse2( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
diff --git a/common/macroblock.h b/common/macroblock.h

index cab5e81cc67a8d271cc652809eb2c7a75d1e55cf..6ca6492b7a7d492dd67a55e5c37439c5833b5fbe 100644 (file)
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -161,17 +161,18 @@ static const int x264_mb_partition_pixel_table[17] =
      6, 4, 5, 3, 6, 4, 5, 3, 6, 4, 5, 3, 3, 3, 1, 2, 0
  };
  
+/* zigzags are transposed with respect to the tables in the standard */
  static const int x264_zigzag_scan4[16] =
  {
-    0,  1,  4,  8,  5,  2,  3,  6,  9, 12, 13, 10,  7, 11, 14, 15
+    0,  4,  1,  2,  5,  8, 12,  9,  6,  3,  7, 10, 13, 14, 11, 15
  };
  static const int x264_zigzag_scan8[64] =
  {
-    0,  1,  8, 16,  9,  2,  3, 10, 17, 24, 32, 25, 18, 11,  4,  5,
-   12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13,  6,  7, 14, 21, 28,
-   35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
-   58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63
-};  
+    0,  8,  1,  2,  9, 16, 24, 17, 10,  3,  4, 11, 18, 25, 32, 40,
+   33, 26, 19, 12,  5,  6, 13, 20, 27, 34, 41, 48, 56, 49, 42, 35,
+   28, 21, 14,  7, 15, 22, 29, 36, 43, 50, 57, 58, 51, 44, 37, 30,
+   23, 31, 38, 45, 52, 59, 60, 53, 46, 39, 47, 54, 61, 62, 55, 63
+};
  
  static const uint8_t block_idx_x[16] =
  {
diff --git a/encoder/macroblock.c b/encoder/macroblock.c

index bb6adea7cb3e43251084d2a92d936d645599bfb7..3cc6716afbf0af410201d3f5fb17bff8c994081e 100644 (file)
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -50,10 +50,8 @@ static const int def_quant4_mf[6][4][4] =
  /****************************************************************************
   * Scan and Quant functions
   ****************************************************************************/
-//static const int scan_zigzag_x[16]={0, 1, 0, 0, 1, 2, 3, 2, 1, 0, 1, 2, 3, 3, 2, 3};
-//static const int scan_zigzag_y[16]={0, 0, 1, 2, 1, 0, 0, 1, 2, 3, 3, 2, 1, 2, 3, 3};
  
-#define ZIG(i,y,x) level[i] = dct[y][x];
+#define ZIG(i,y,x) level[i] = dct[x][y];
  static inline void scan_zigzag_8x8full( int level[64], int16_t dct[8][8] )
  {
      ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
diff --git a/encoder/set.c b/encoder/set.c

index 26549c21122ef665a38cb85cebe3061e9c322aae..048e4722701cd37221b0f11302f003c1b7c0b9a5 100644 (file)
--- a/encoder/set.c
+++ b/encoder/set.c
@@ -30,6 +30,14 @@
  #include "config.h"
  #endif
  
+static void transpose( uint8_t *buf, int w ) /* transpose, in place, the w x w matrix stored contiguously in buf */
+{
+    int i, j;
+    for( i = 0; i < w; i++ )
+        for( j = 0; j < i; j++ ) /* j < i: each off-diagonal pair is visited once; the diagonal is left alone */
+            XCHG( uint8_t, buf[w*i+j], buf[w*j+i] ); /* XCHG is defined elsewhere; presumably swaps the two values -- confirm */
+}
+
  static void scaling_list_write( bs_t *s, x264_pps_t *pps, int idx )
  {
      const int len = idx<4 ? 16 : 64;
@@ -387,6 +395,13 @@ void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *
              pps->scaling_list[i] = x264_cqm_jvt[i];
          break;
      case X264_CQM_CUSTOM:
+        /* match the transposed DCT & zigzag */
+        transpose( param->cqm_4iy, 4 );
+        transpose( param->cqm_4ic, 4 );
+        transpose( param->cqm_4py, 4 );
+        transpose( param->cqm_4pc, 4 );
+        transpose( param->cqm_8iy, 8 );
+        transpose( param->cqm_8py, 8 );
          pps->scaling_list[CQM_4IY] = param->cqm_4iy;
          pps->scaling_list[CQM_4IC] = param->cqm_4ic;
          pps->scaling_list[CQM_4PY] = param->cqm_4py;
author	Loren Merritt <pengvado@videolan.org>
	Thu, 9 Mar 2006 05:30:08 +0000 (05:30 +0000)
committer	Loren Merritt <pengvado@videolan.org>
	Thu, 9 Mar 2006 05:30:08 +0000 (05:30 +0000)
common/amd64/dct-a.asm		patch \| blob \| history
common/dct.c		patch \| blob \| history
common/i386/dct-a.asm		patch \| blob \| history
common/i386/dct-c.c		patch \| blob \| history
common/i386/dct.h		patch \| blob \| history
common/macroblock.h		patch \| blob \| history
encoder/macroblock.c		patch \| blob \| history
encoder/set.c		patch \| blob \| history