x86 asm for some high-bit-depth coefficient functions

author Fiona Glaser <fiona@x264.com>

Sun, 31 Oct 2010 02:13:05 +0000 (19:13 -0700)

committer Fiona Glaser <fiona@x264.com>

Fri, 19 Nov 2010 17:47:33 +0000 (09:47 -0800)
author Fiona Glaser <fiona@x264.com>
Sun, 31 Oct 2010 02:13:05 +0000 (19:13 -0700)
committer Fiona Glaser <fiona@x264.com>
Fri, 19 Nov 2010 17:47:33 +0000 (09:47 -0800)
diff --git a/common/common.h b/common/common.h

index 82af6f59c57effe38c336fc71bc7c4abb249d0c7..c3bfaf5e3caa7e5edfbebe1da1c2ee4c0bde3a45 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -812,7 +812,7 @@ struct x264_t
      } stat;
  
      ALIGNED_16( uint32_t nr_residual_sum[2][64] );
-    ALIGNED_16( uint16_t nr_offset[2][64] );
+    ALIGNED_16( udctcoef nr_offset[2][64] );
      uint32_t        nr_count[2];
  
      /* Buffers that are allocated per-thread even in sliced threads. */
diff --git a/common/quant.c b/common/quant.c

index 7867f34e00c721707f804f91cf633f13fa07a9ef..a8e9287df4535f15091091a180549fbf9f9ec0c0 100644 (file)
--- a/common/quant.c
+++ b/common/quant.c
@@ -141,7 +141,7 @@ static void dequant_4x4_dc( dctcoef dct[16], int dequant_mf[6][16], int i_qp )
      }
  }
  
-static void x264_denoise_dct( dctcoef *dct, uint32_t *sum, uint16_t *offset, int size )
+static void x264_denoise_dct( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size )
  {
      for( int i = 1; i < size; i++ )
      {
@@ -296,6 +296,25 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
      {
          pf->quant_2x2_dc = x264_quant_2x2_dc_mmxext;
          pf->quant_4x4_dc = x264_quant_4x4_dc_mmxext;
+#if ARCH_X86
+        pf->denoise_dct = x264_denoise_dct_mmx;
+        pf->decimate_score15 = x264_decimate_score15_mmxext;
+        pf->decimate_score16 = x264_decimate_score16_mmxext;
+        if( cpu&X264_CPU_SLOW_CTZ )
+        {
+            pf->decimate_score15 = x264_decimate_score15_mmxext_slowctz;
+            pf->decimate_score16 = x264_decimate_score16_mmxext_slowctz;
+        }
+        pf->decimate_score64 = x264_decimate_score64_mmxext;
+        pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15_mmxext;
+        pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmxext;
+        pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmxext;
+        pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15_mmxext;
+        pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmxext;
+#endif
+        pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmxext;
+        if( cpu&X264_CPU_LZCNT )
+            pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmxext_lzcnt;
      }
      if( cpu&X264_CPU_SSE2 )
      {
@@ -303,6 +322,28 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
          pf->quant_8x8 = x264_quant_8x8_sse2;
          pf->quant_2x2_dc = x264_quant_2x2_dc_sse2;
          pf->quant_4x4_dc = x264_quant_4x4_dc_sse2;
+        pf->denoise_dct = x264_denoise_dct_sse2;
+        pf->decimate_score15 = x264_decimate_score15_sse2;
+        pf->decimate_score16 = x264_decimate_score16_sse2;
+        pf->decimate_score64 = x264_decimate_score64_sse2;
+        if( cpu&X264_CPU_SLOW_CTZ )
+        {
+            pf->decimate_score15 = x264_decimate_score15_sse2_slowctz;
+            pf->decimate_score16 = x264_decimate_score16_sse2_slowctz;
+        }
+        pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
+        pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
+        pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
+        pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
+        pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
+        if( cpu&X264_CPU_LZCNT )
+        {
+            pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
+            pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
+            pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
+            pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
+            pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
+        }
      }
      if( cpu&X264_CPU_SSSE3 )
      {
@@ -310,6 +351,15 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
          pf->quant_8x8 = x264_quant_8x8_ssse3;
          pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3;
          pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
+        pf->denoise_dct = x264_denoise_dct_ssse3;
+        pf->decimate_score15 = x264_decimate_score15_ssse3;
+        pf->decimate_score16 = x264_decimate_score16_ssse3;
+        if( cpu&X264_CPU_SLOW_CTZ )
+        {
+            pf->decimate_score15 = x264_decimate_score15_ssse3_slowctz;
+            pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz;
+        }
+        pf->decimate_score64 = x264_decimate_score64_ssse3;
      }
      if( cpu&X264_CPU_SSE4 )
      {
diff --git a/common/quant.h b/common/quant.h

index 8c31eaf3876705d0d33cfc5d060804449d871979..c8ef8d80e0e134869ed9d413707dcd454aab0a33 100644 (file)
--- a/common/quant.h
+++ b/common/quant.h
@@ -38,7 +38,7 @@ typedef struct
      void (*dequant_4x4)( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
      void (*dequant_4x4_dc)( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
  
-    void (*denoise_dct)( dctcoef *dct, uint32_t *sum, uint16_t *offset, int size );
+    void (*denoise_dct)( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
  
      int (*decimate_score15)( dctcoef *dct );
      int (*decimate_score16)( dctcoef *dct );
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm

index cd9bb10d963d5778e4ececeda652013ec295a462..9c2cd1e39b9e274dc835bb5697d86fd9b16613a5 100644 (file)
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -694,12 +694,63 @@ DEQUANT_DC mmxext
  INIT_XMM
  DEQUANT_DC sse2
  
+%ifdef X264_HIGH_BIT_DEPTH
+;-----------------------------------------------------------------------------
+; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
+;-----------------------------------------------------------------------------
+%macro DENOISE_DCT 1-2 0 ; %1 = function name suffix, %2 = third cglobal parameter (defaults to 0; presumably the XMM register count)
+cglobal denoise_dct_%1, 4,5,%2 ; assuming x86inc maps rN to the Nth C argument: r0 = dct, r1 = sum, r2 = offset, r3 = size
+    mov       r4d, [r0] ; backup DC coefficient
+    pxor      m6, m6 ; m6 = 0, the threshold for the pcmpgtd tests below
+.loop:
+    sub       r3, mmsize/2 ; step back by the mmsize/2 dwords handled per iteration; only visible integer op in the loop, so presumably what jg tests. NOTE(review): r3 is used at full register width although size is an int - confirm it is extended on x86_64
+    mova      m2, [r0+r3*4+0*mmsize] ; load two registers of dword coefficients
+    mova      m3, [r0+r3*4+1*mmsize]
+    PABSD     m0, m2 ; presumably m0 = |m2| (PABSD is %defined after this macro)
+    PABSD     m1, m3
+    mova      m4, m0 ; keep the PABSD results for the sum update
+    mova      m5, m1
+    psubd     m0, [r2+r3*4+0*mmsize] ; subtract the per-coefficient offset
+    psubd     m1, [r2+r3*4+1*mmsize]
+    mova      m7, m0
+    pcmpgtd   m7, m6 ; m7 = all-ones in lanes where the difference is > 0
+    pand      m0, m7 ; zero the lanes that are not positive
+    mova      m7, m1
+    pcmpgtd   m7, m6
+    pand      m1, m7
+    PSIGND    m0, m2 ; presumably reapplies the sign of the original coefficient
+    PSIGND    m1, m3
+    mova      [r0+r3*4+0*mmsize], m0 ; write the results back over dct
+    mova      [r0+r3*4+1*mmsize], m1
+    paddd     m4, [r1+r3*4+0*mmsize] ; add the saved PABSD results to sum
+    paddd     m5, [r1+r3*4+1*mmsize]
+    mova      [r1+r3*4+0*mmsize], m4
+    mova      [r1+r3*4+1*mmsize], m5
+    jg .loop
+    mov       [r0], r4d ; restore DC coefficient
+    RET
+%endmacro
+
+%define PABSD PABSD_MMX
+%define PSIGND PSIGND_MMX
+%ifndef ARCH_X86_64
+INIT_MMX
+DENOISE_DCT mmx
+%endif
+INIT_XMM
+DENOISE_DCT sse2, 8
+%define PABSD PABSD_SSSE3
+%define PSIGND PSIGND_SSSE3
+DENOISE_DCT ssse3, 8
+
+%else ; !X264_HIGH_BIT_DEPTH
+
  ;-----------------------------------------------------------------------------
  ; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
  ;-----------------------------------------------------------------------------
  %macro DENOISE_DCT 1-2 0
  cglobal denoise_dct_%1, 4,5,%2
-    movzx     r4d, word [r0] ; backup DC coefficient
+    movzx     r4d, word [r0] ; backup DC coefficient
      pxor      m6, m6
  .loop:
      sub       r3, mmsize
@@ -730,7 +781,7 @@ cglobal denoise_dct_%1, 4,5,%2
      mova      [r1+r3*4+2*mmsize], m5
      mova      [r1+r3*4+3*mmsize], m3
      jg .loop
-    mov       [r0], r4w ; restore DC coefficient
+    mov       [r0], r4w ; restore DC coefficient
      RET
  %endmacro
  
@@ -746,20 +797,33 @@ DENOISE_DCT sse2, 7
  %define PSIGNW PSIGNW_SSSE3
  DENOISE_DCT ssse3, 7
  
-
+%endif ; !X264_HIGH_BIT_DEPTH
  
  ;-----------------------------------------------------------------------------
-; int decimate_score( int16_t *dct )
+; int decimate_score( dctcoef *dct )
  ;-----------------------------------------------------------------------------
  
-%macro DECIMATE_MASK_SSE2 6
-%ifidn %5, ssse3
+%macro DECIMATE_MASK_SSE2 7
+%ifdef X264_HIGH_BIT_DEPTH
+    movdqa   xmm0, [%3+ 0]
+    movdqa   xmm1, [%3+32]
+    packssdw xmm0, [%3+16]
+    packssdw xmm1, [%3+48]
+%if %7
+    pabsw    xmm0, xmm0
+    pabsw    xmm1, xmm1
+%else
+    ABS2_MMX xmm0, xmm1, xmm3, xmm4
+%endif
+%else
+%if %7
      pabsw    xmm0, [%3+ 0]
      pabsw    xmm1, [%3+16]
  %else
      movdqa   xmm0, [%3+ 0]
      movdqa   xmm1, [%3+16]
      ABS2_MMX xmm0, xmm1, xmm3, xmm4
+%endif
  %endif
      packsswb xmm0, xmm1
      pxor     xmm2, xmm2
@@ -769,23 +833,34 @@ DENOISE_DCT ssse3, 7
      pmovmskb %2, xmm0
  %endmacro
  
-%macro DECIMATE_MASK_MMX 6
+%macro DECIMATE_MASK_MMX 7
+%ifdef X264_HIGH_BIT_DEPTH
+    movq      mm0, [%3+ 0]
+    movq      mm1, [%3+16]
+    movq      mm2, [%3+32]
+    movq      mm3, [%3+48]
+    packssdw  mm0, [%3+ 8]
+    packssdw  mm1, [%3+24]
+    packssdw  mm2, [%3+40]
+    packssdw  mm3, [%3+56]
+%else
      movq      mm0, [%3+ 0]
      movq      mm1, [%3+ 8]
      movq      mm2, [%3+16]
      movq      mm3, [%3+24]
-    ABS2_MMX  mm0, mm1, mm4, mm5
-    ABS2_MMX  mm2, mm3, mm4, mm5
+%endif
+    ABS2_MMX  mm0, mm1, mm6, mm7
+    ABS2_MMX  mm2, mm3, mm6, mm7
      packsswb  mm0, mm1
      packsswb  mm2, mm3
      pxor      mm4, mm4
-    pxor      mm5, mm5
+    pxor      mm6, mm6
      pcmpeqb   mm4, mm0
-    pcmpeqb   mm5, mm2
+    pcmpeqb   mm6, mm2
      pcmpgtb   mm0, %4
      pcmpgtb   mm2, %4
      pmovmskb   %6, mm4
-    pmovmskb   %1, mm5
+    pmovmskb   %1, mm6
      shl        %1, 8
      or         %1, %6
      pmovmskb   %6, mm0
@@ -797,7 +872,7 @@ DENOISE_DCT ssse3, 7
  cextern decimate_table4
  cextern decimate_table8
  
-%macro DECIMATE4x4 3
+%macro DECIMATE4x4 4
  
  ;A LUT is faster than bsf on AMD processors.
  ;This is not true for score64.
@@ -811,7 +886,7 @@ cglobal decimate_score%1_%2, 1,3
      %define table decimate_table4
      %define mask_table decimate_mask_table4
  %endif
-    DECIMATE_MASK edx, eax, r0, [pb_1], %2, ecx
+    DECIMATE_MASK edx, eax, r0, [pb_1], %2, ecx, %4
      xor   edx, 0xffff
      je   .ret
      test  eax, eax
@@ -850,22 +925,22 @@ cglobal decimate_score%1_%2, 1,3
  
  %ifndef ARCH_X86_64
  %define DECIMATE_MASK DECIMATE_MASK_MMX
-DECIMATE4x4 15, mmxext, 0
-DECIMATE4x4 16, mmxext, 0
-DECIMATE4x4 15, mmxext_slowctz, 1
-DECIMATE4x4 16, mmxext_slowctz, 1
+DECIMATE4x4 15, mmxext, 0, 0
+DECIMATE4x4 16, mmxext, 0, 0
+DECIMATE4x4 15, mmxext_slowctz, 1, 0
+DECIMATE4x4 16, mmxext_slowctz, 1, 0
  %endif
  %define DECIMATE_MASK DECIMATE_MASK_SSE2
-DECIMATE4x4 15, sse2, 0
-DECIMATE4x4 16, sse2, 0
-DECIMATE4x4 15, sse2_slowctz, 1
-DECIMATE4x4 16, sse2_slowctz, 1
-DECIMATE4x4 15, ssse3, 0
-DECIMATE4x4 16, ssse3, 0
-DECIMATE4x4 15, ssse3_slowctz, 1
-DECIMATE4x4 16, ssse3_slowctz, 1
+DECIMATE4x4 15, sse2, 0, 0
+DECIMATE4x4 16, sse2, 0, 0
+DECIMATE4x4 15, sse2_slowctz, 1, 0
+DECIMATE4x4 16, sse2_slowctz, 1, 0
+DECIMATE4x4 15, ssse3, 0, 1
+DECIMATE4x4 16, ssse3, 0, 1
+DECIMATE4x4 15, ssse3_slowctz, 1, 1
+DECIMATE4x4 16, ssse3_slowctz, 1, 1
  
-%macro DECIMATE8x8 1
+%macro DECIMATE8x8 2
  
  %ifdef ARCH_X86_64
  cglobal decimate_score64_%1, 1,4
@@ -876,17 +951,17 @@ cglobal decimate_score64_%1, 1,4
      %define table decimate_table8
  %endif
      mova  m5, [pb_1]
-    DECIMATE_MASK r1d, eax, r0, m5, %1, null
+    DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5, %1, null, %2
      test  eax, eax
      jne  .ret9
-    DECIMATE_MASK r2d, eax, r0+32, m5, %1, null
+    DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5, %1, null, %2
      shl   r2d, 16
      or    r1d, r2d
-    DECIMATE_MASK r2d, r3d, r0+64, m5, %1, null
+    DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5, %1, null, %2
      shl   r2, 32
      or    eax, r3d
      or    r1, r2
-    DECIMATE_MASK r2d, r3d, r0+96, m5, %1, null
+    DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5, %1, null, %2
      shl   r2, 48
      or    r1, r2
      xor   r1, -1
@@ -911,16 +986,16 @@ cglobal decimate_score64_%1, 1,6
  %else
  cglobal decimate_score64_%1, 1,5
  %endif
-    mova  m7, [pb_1]
-    DECIMATE_MASK r3, r2, r0, m7, %1, r5
+    mova  m5, [pb_1]
+    DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF* 0, m5, %1, r5, %2
      test  r2, r2
      jne  .ret9
-    DECIMATE_MASK r4, r2, r0+32, m7, %1, r5
+    DECIMATE_MASK r4, r2, r0+SIZEOF_DCTCOEF*16, m5, %1, r5, %2
      shl   r4, 16
      or    r3, r4
-    DECIMATE_MASK r4, r1, r0+64, m7, %1, r5
+    DECIMATE_MASK r4, r1, r0+SIZEOF_DCTCOEF*32, m5, %1, r5, %2
      or    r2, r1
-    DECIMATE_MASK r1, r0, r0+96, m7, %1, r5
+    DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5, %1, r5, %2
      shl   r1, 16
      or    r4, r1
      xor   r3, -1
@@ -968,17 +1043,71 @@ cglobal decimate_score64_%1, 1,5
  %ifndef ARCH_X86_64
  INIT_MMX
  %define DECIMATE_MASK DECIMATE_MASK_MMX
-DECIMATE8x8 mmxext
+DECIMATE8x8 mmxext, 0
  %endif
  INIT_XMM
  %define DECIMATE_MASK DECIMATE_MASK_SSE2
-DECIMATE8x8 sse2
-DECIMATE8x8 ssse3
+DECIMATE8x8 sse2, 0
+DECIMATE8x8 ssse3, 1
  
  ;-----------------------------------------------------------------------------
-; int coeff_last( int16_t *dct )
+; int coeff_last( dctcoef *dct )
  ;-----------------------------------------------------------------------------
  
+%macro LAST_X86 3 ; %1 = destination, %2 = source; %3 is accepted but not used by this variant
+    bsr %1, %2 ; bit scan reverse: %1 = index of the highest set bit of %2
+%endmacro
+
+%macro LAST_SSE4A 3 ; %1 = destination, %2 = source, %3 = constant XORed into the leading-zero count
+    lzcnt %1, %2 ; %1 = number of leading zero bits in %2
+    xor %1, %3 ; call sites in this file pass 0x1f / 0x3f, which turns the count into a highest-set-bit index for 32/64-bit operands
+%endmacro
+
+%ifdef X264_HIGH_BIT_DEPTH
+%macro LAST_MASK4_MMX 2-3 ; %1 = output mask register, %2 = address of 4 dword coefficients; optional %3 is not used
+    movq     mm0, [%2] ; dwords 0-1
+    packssdw mm0, [%2+8] ; append dwords 2-3, giving four signed-saturated words
+    packsswb mm0, mm0 ; words -> bytes; both halves of mm0 now hold the same four bytes
+    pcmpeqb  mm0, mm2 ; mm2 is expected to be zero: the LAST_MASK call sites visible in this file do pxor m2, m2 first
+    pmovmskb  %1, mm0 ; one mask bit per byte that compared equal to mm2
+%endmacro
+
+%macro LAST_MASK_SSE2 2-3 ; %1 = output mask register, %2 = address of 16 dword coefficients; optional %3 is not used
+    movdqa   xmm0, [%2+ 0] ; dwords 0-3
+    movdqa   xmm1, [%2+32] ; dwords 8-11
+    packssdw xmm0, [%2+16] ; xmm0 = dwords 0-7 as signed-saturated words
+    packssdw xmm1, [%2+48] ; xmm1 = dwords 8-15 as signed-saturated words
+    packsswb xmm0, xmm1 ; 16 bytes, one per coefficient, in memory order
+    pcmpeqb  xmm0, xmm2 ; xmm2 is expected to be zero: the call sites visible in this file do pxor m2, m2 first
+    pmovmskb   %1, xmm0 ; bit i is set when byte i compared equal to xmm2
+%endmacro
+
+%macro LAST_MASK_MMX 3 ; %1 = output mask register, %2 = address of 16 dword coefficients, %3 = scratch register (overwritten)
+    movq     mm0, [%2+ 0] ; dwords 0-1
+    movq     mm1, [%2+16] ; dwords 4-5
+    packssdw mm0, [%2+ 8] ; mm0 = dwords 0-3 as signed-saturated words
+    packssdw mm1, [%2+24] ; mm1 = dwords 4-7 as signed-saturated words
+    movq     mm3, [%2+32] ; dwords 8-9
+    movq     mm4, [%2+48] ; dwords 12-13
+    packssdw mm3, [%2+40] ; mm3 = dwords 8-11 as signed-saturated words
+    packssdw mm4, [%2+56] ; mm4 = dwords 12-15 as signed-saturated words
+    packsswb mm0, mm1 ; bytes for coefficients 0-7
+    packsswb mm3, mm4 ; bytes for coefficients 8-15
+    pcmpeqb  mm0, mm2 ; mm2 is expected to be zero: the call sites visible in this file do pxor m2, m2 first
+    pcmpeqb  mm3, mm2
+    pmovmskb  %1, mm0 ; mask bits for coefficients 0-7
+    pmovmskb  %3, mm3 ; mask bits for coefficients 8-15
+    shl       %3, 8 ; move the second byte of mask into bits 8-15
+    or        %1, %3 ; %1 = combined 16-bit mask
+%endmacro
+%else ; !X264_HIGH_BIT_DEPTH
+%macro LAST_MASK4_MMX 2-3 ; %1 = output mask register, %2 = address of 4 word coefficients; optional %3 is not used
+    movq     mm0, [%2] ; load four words
+    packsswb mm0, mm0 ; words -> bytes; both halves of mm0 now hold the same four bytes
+    pcmpeqb  mm0, mm2 ; mm2 is expected to be zero: the LAST_MASK call sites visible in this file do pxor m2, m2 first
+    pmovmskb  %1, mm0 ; one mask bit per byte that compared equal to mm2
+%endmacro
+
  %macro LAST_MASK_SSE2 2-3
      movdqa   xmm0, [%2+ 0]
      packsswb xmm0, [%2+16]
@@ -999,20 +1128,11 @@ DECIMATE8x8 ssse3
      or        %1, %3
  %endmacro
  
-%macro LAST_X86 3
-    bsr %1, %2
-%endmacro
-
-%macro LAST_SSE4A 3
-    lzcnt %1, %2
-    xor %1, %3
-%endmacro
-
  %macro COEFF_LAST4 1
  %ifdef ARCH_X86_64
  cglobal coeff_last4_%1, 1,1
      LAST rax, [r0], 0x3f
-    shr eax, 4
+    shr  eax, 4
      RET
  %else
  cglobal coeff_last4_%1, 0,3
@@ -1033,11 +1153,12 @@ cglobal coeff_last4_%1, 0,3
  COEFF_LAST4 mmxext
  %define LAST LAST_SSE4A
  COEFF_LAST4 mmxext_lzcnt
+%endif ; X264_HIGH_BIT_DEPTH
  
  %macro COEFF_LAST 1
  cglobal coeff_last15_%1, 1,3
      pxor m2, m2
-    LAST_MASK r1d, r0-2, r2d
+    LAST_MASK r1d, r0-SIZEOF_DCTCOEF, r2d
      xor r1d, 0xffff
      LAST eax, r1d, 0x1f
      dec eax
@@ -1053,14 +1174,14 @@ cglobal coeff_last16_%1, 1,3
  %ifndef ARCH_X86_64
  cglobal coeff_last64_%1, 1, 5-mmsize/16
      pxor m2, m2
-    LAST_MASK r2d, r0+64, r4d
-    LAST_MASK r3d, r0+96, r4d
+    LAST_MASK r2d, r0+SIZEOF_DCTCOEF* 32, r4d
+    LAST_MASK r3d, r0+SIZEOF_DCTCOEF* 48, r4d
      shl r3d, 16
      or  r2d, r3d
      xor r2d, -1
      jne .secondhalf
-    LAST_MASK r1d, r0, r4d
-    LAST_MASK r3d, r0+32, r4d
+    LAST_MASK r1d, r0+SIZEOF_DCTCOEF* 0, r4d
+    LAST_MASK r3d, r0+SIZEOF_DCTCOEF*16, r4d
      shl r3d, 16
      or  r1d, r3d
      not r1d
@@ -1073,10 +1194,10 @@ cglobal coeff_last64_%1, 1, 5-mmsize/16
  %else
  cglobal coeff_last64_%1, 1,4
      pxor m2, m2
-    LAST_MASK_SSE2 r1d, r0
-    LAST_MASK_SSE2 r2d, r0+32
-    LAST_MASK_SSE2 r3d, r0+64
-    LAST_MASK_SSE2 r0d, r0+96
+    LAST_MASK_SSE2 r1d, r0+SIZEOF_DCTCOEF* 0
+    LAST_MASK_SSE2 r2d, r0+SIZEOF_DCTCOEF*16
+    LAST_MASK_SSE2 r3d, r0+SIZEOF_DCTCOEF*32
+    LAST_MASK_SSE2 r0d, r0+SIZEOF_DCTCOEF*48
      shl r2d, 16
      shl r0d, 16
      or  r1d, r2d
@@ -1102,16 +1223,9 @@ COEFF_LAST sse2
  COEFF_LAST sse2_lzcnt
  
  ;-----------------------------------------------------------------------------
-; int coeff_level_run( int16_t *dct, run_level_t *runlevel )
+; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
  ;-----------------------------------------------------------------------------
  
-%macro LAST_MASK4_MMX 2-3
-    movq     mm0, [%2]
-    packsswb mm0, mm0
-    pcmpeqb  mm0, mm2
-    pmovmskb  %1, mm0
-%endmacro
-
  %macro LZCOUNT_X86 3
      bsr %1, %2
      xor %1, %3
@@ -1135,7 +1249,7 @@ cglobal coeff_level_run%2_%1,0,7
      movifnidn t0, r0mp
      movifnidn t1, r1mp
      pxor    m2, m2
-    LAST_MASK t5d, t0-(%2&1)*2, t4d
+    LAST_MASK t5d, t0-(%2&1)*SIZEOF_DCTCOEF, t4d
      not    t5d
      shl    t5d, 32-((%2+1)&~1)
      mov    t4d, %2-1
@@ -1147,9 +1261,15 @@ cglobal coeff_level_run%2_%1,0,7
      mov   [t1], t4d
  .loop:
      LZCOUNT t3d, t5d, 0x1f
+%ifdef X264_HIGH_BIT_DEPTH
+    mov    t2d, [t0+t4*4]
+    mov   [t1+t6  +4+16*4], t3b
+    mov   [t1+t6*4+ 4], t2d
+%else
      mov    t2w, [t0+t4*2]
-    mov   [t1+t6  +36], t3b
+    mov   [t1+t6  +4+16*2], t3b
      mov   [t1+t6*2+ 4], t2w
+%endif
      inc    t3d
      shl    t5d, t3b
      inc    t6d
diff --git a/common/x86/quant.h b/common/x86/quant.h

index ce22afbee63abbdacc9de2f1465a4f9b0bae6249..3a7e59b9763bdcc0075e782f3bdd81ad1e2e35c4 100644 (file)
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -54,42 +54,42 @@ void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_
  void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
  void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
  void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
-void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
-void x264_denoise_dct_sse2( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
-void x264_denoise_dct_ssse3( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
-int x264_decimate_score15_mmxext( int16_t *dct );
-int x264_decimate_score15_sse2  ( int16_t *dct );
-int x264_decimate_score15_ssse3 ( int16_t *dct );
-int x264_decimate_score16_mmxext( int16_t *dct );
-int x264_decimate_score16_sse2  ( int16_t *dct );
-int x264_decimate_score16_ssse3 ( int16_t *dct );
-int x264_decimate_score15_mmxext_slowctz( int16_t *dct );
-int x264_decimate_score15_sse2_slowctz  ( int16_t *dct );
-int x264_decimate_score15_ssse3_slowctz ( int16_t *dct );
-int x264_decimate_score16_mmxext_slowctz( int16_t *dct );
-int x264_decimate_score16_sse2_slowctz  ( int16_t *dct );
-int x264_decimate_score16_ssse3_slowctz ( int16_t *dct );
-int x264_decimate_score64_mmxext( int16_t *dct );
-int x264_decimate_score64_sse2  ( int16_t *dct );
-int x264_decimate_score64_ssse3 ( int16_t *dct );
-int x264_coeff_last4_mmxext( int16_t *dct );
-int x264_coeff_last15_mmxext( int16_t *dct );
-int x264_coeff_last16_mmxext( int16_t *dct );
-int x264_coeff_last64_mmxext( int16_t *dct );
-int x264_coeff_last15_sse2( int16_t *dct );
-int x264_coeff_last16_sse2( int16_t *dct );
-int x264_coeff_last64_sse2( int16_t *dct );
-int x264_coeff_last4_mmxext_lzcnt( int16_t *dct );
-int x264_coeff_last15_sse2_lzcnt( int16_t *dct );
-int x264_coeff_last16_sse2_lzcnt( int16_t *dct );
-int x264_coeff_last64_sse2_lzcnt( int16_t *dct );
-int x264_coeff_level_run16_mmxext( int16_t *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run16_sse2( int16_t *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run16_sse2_lzcnt( int16_t *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run15_mmxext( int16_t *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run15_sse2( int16_t *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run15_sse2_lzcnt( int16_t *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run4_mmxext( int16_t *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run4_mmxext_lzcnt( int16_t *dct, x264_run_level_t *runlevel );
+void x264_denoise_dct_mmx( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
+void x264_denoise_dct_sse2( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
+void x264_denoise_dct_ssse3( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
+int x264_decimate_score15_mmxext( dctcoef *dct );
+int x264_decimate_score15_sse2  ( dctcoef *dct );
+int x264_decimate_score15_ssse3 ( dctcoef *dct );
+int x264_decimate_score16_mmxext( dctcoef *dct );
+int x264_decimate_score16_sse2  ( dctcoef *dct );
+int x264_decimate_score16_ssse3 ( dctcoef *dct );
+int x264_decimate_score15_mmxext_slowctz( dctcoef *dct );
+int x264_decimate_score15_sse2_slowctz  ( dctcoef *dct );
+int x264_decimate_score15_ssse3_slowctz ( dctcoef *dct );
+int x264_decimate_score16_mmxext_slowctz( dctcoef *dct );
+int x264_decimate_score16_sse2_slowctz  ( dctcoef *dct );
+int x264_decimate_score16_ssse3_slowctz ( dctcoef *dct );
+int x264_decimate_score64_mmxext( dctcoef *dct );
+int x264_decimate_score64_sse2  ( dctcoef *dct );
+int x264_decimate_score64_ssse3 ( dctcoef *dct );
+int x264_coeff_last4_mmxext( dctcoef *dct );
+int x264_coeff_last15_mmxext( dctcoef *dct );
+int x264_coeff_last16_mmxext( dctcoef *dct );
+int x264_coeff_last64_mmxext( dctcoef *dct );
+int x264_coeff_last15_sse2( dctcoef *dct );
+int x264_coeff_last16_sse2( dctcoef *dct );
+int x264_coeff_last64_sse2( dctcoef *dct );
+int x264_coeff_last4_mmxext_lzcnt( dctcoef *dct );
+int x264_coeff_last15_sse2_lzcnt( dctcoef *dct );
+int x264_coeff_last16_sse2_lzcnt( dctcoef *dct );
+int x264_coeff_last64_sse2_lzcnt( dctcoef *dct );
+int x264_coeff_level_run16_mmxext( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run16_sse2( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run16_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_mmxext( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_sse2( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run4_mmxext( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run4_mmxext_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
  
  #endif
diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm

index 84f3ce8df72896c15389a36090a691aecc5bf3a6..cf827d236dfb926d0397ed16ff8f0f54b7ba691d 100644 (file)
--- a/common/x86/x86util.asm
+++ b/common/x86/x86util.asm
@@ -28,8 +28,10 @@
  %assign FDEC_STRIDE 32
  
  %assign SIZEOF_PIXEL 1
+%assign SIZEOF_DCTCOEF 2
  %ifdef X264_HIGH_BIT_DEPTH
      %assign SIZEOF_PIXEL 2
+    %assign SIZEOF_DCTCOEF 4
  %endif
  
  %assign PIXEL_MAX ((1 << BIT_DEPTH)-1)
@@ -161,6 +163,17 @@
      pminub  %2, %4
  %endmacro
  
+%macro ABSD2_MMX 4 ; replaces the packed dwords in %1 and %2 with their absolute values; %3 and %4 are scratch registers
+    pxor    %3, %3 ; %3 = 0
+    pxor    %4, %4 ; %4 = 0
+    pcmpgtd %3, %1 ; %3 = all-ones in lanes where 0 > %1 (negative), else 0
+    pcmpgtd %4, %2 ; same sign mask for %2
+    pxor    %1, %3 ; invert the bits of the negative lanes
+    pxor    %2, %4
+    psubd   %1, %3 ; subtracting the -1 mask adds 1, completing the two's-complement negate
+    psubd   %2, %4
+%endmacro
+
  %macro ABSB_SSSE3 2
      pabsb   %1, %1
  %endmacro
diff --git a/tools/checkasm.c b/tools/checkasm.c

index c8cb1b89e9adfa8f2bca1c7cde43985364973bac..b0a6b69db5d696d1b40323f9465874ab858c8fb4 100644 (file)
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1532,12 +1532,12 @@ static int check_quant( int cpu_ref, int cpu_new )
              memcpy( dct1, buf1, size*sizeof(dctcoef) );
              memcpy( dct2, buf1, size*sizeof(dctcoef) );
              memcpy( buf3+256, buf3, 256 );
-            call_c1( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (uint16_t*)buf2, size );
-            call_a1( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (uint16_t*)buf2, size );
+            call_c1( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (udctcoef*)buf2, size );
+            call_a1( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (udctcoef*)buf2, size );
              if( memcmp( dct1, dct2, size*sizeof(dctcoef) ) || memcmp( buf3+4, buf3+256+4, (size-1)*sizeof(uint32_t) ) )
                  ok = 0;
-            call_c2( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (uint16_t*)buf2, size );
-            call_a2( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (uint16_t*)buf2, size );
+            call_c2( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (udctcoef*)buf2, size );
+            call_a2( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (udctcoef*)buf2, size );
          }
      }
      report( "denoise dct :" );
@@ -1549,8 +1549,17 @@ static int check_quant( int cpu_ref, int cpu_new )
          used_asm = 1; \
          for( int i = 0; i < 100; i++ ) \
          { \
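+            static const int distrib[16] = {1,1,1,1,1,1,1,1,1,1,1,1,2,3,4}; /* NOTE(review): 15 initializers for 16 slots, so distrib[15] is implicitly 0 - confirm intended */\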
+            static const int zerorate_lut[4] = {3,7,15,31};\
+            int zero_rate = zerorate_lut[i&3];\
              for( int idx = 0; idx < w*w; idx++ ) \
-                dct1[idx] = !(rand()&3) + (!(rand()&15))*(rand()&3); \
+            { \
+                int sign = (rand()&1) ? -1 : 1; \
+                int abs_level = distrib[rand()&15]; \
+                if( abs_level == 4 ) abs_level = rand()&0x3fff; \
+                int zero = !(rand()&zero_rate); \
+                dct1[idx] = zero * abs_level * sign; \
+            } \
              if( ac ) \
                  dct1[0] = 0; \
              int result_c = call_c( qf_c.decname, dct1 ); \
author	Fiona Glaser <fiona@x264.com>
	Sun, 31 Oct 2010 02:13:05 +0000 (19:13 -0700)
committer	Fiona Glaser <fiona@x264.com>
	Fri, 19 Nov 2010 17:47:33 +0000 (09:47 -0800)
common/common.h		patch \| blob \| history
common/quant.c		patch \| blob \| history
common/quant.h		patch \| blob \| history
common/x86/quant-a.asm		patch \| blob \| history
common/x86/quant.h		patch \| blob \| history
common/x86/x86util.asm		patch \| blob \| history
tools/checkasm.c		patch \| blob \| history