mmx dequant. up to 3% speedup w/ RD.

author Loren Merritt <pengvado@videolan.org>

Sun, 6 Nov 2005 07:07:30 +0000 (07:07 +0000)

committer Loren Merritt <pengvado@videolan.org>

Sun, 6 Nov 2005 07:07:30 +0000 (07:07 +0000)
author Loren Merritt <pengvado@videolan.org>
Sun, 6 Nov 2005 07:07:30 +0000 (07:07 +0000)
committer Loren Merritt <pengvado@videolan.org>
Sun, 6 Nov 2005 07:07:30 +0000 (07:07 +0000)
diff --git a/common/amd64/quant-a.asm b/common/amd64/quant-a.asm

index ce2aad0b087a14b97004d3c8ba4f440f9ccae537..afdeee9e1684f097f9232cc296e0facc06b3c08a 100644 (file)
--- a/common/amd64/quant-a.asm
+++ b/common/amd64/quant-a.asm
@@ -35,7 +35,9 @@ BITS 64
  
  %include "amd64inc.asm"
  
-ALIGN 16
+SECTION .rodata
+pw_1:  times 4 dw 1    ; four 16-bit words, each 1 (one 64-bit MMX operand)
+pd_1:  times 2 dd 1    ; two 32-bit dwords, each 1 (one 64-bit MMX operand)
  
  SECTION .text
  
@@ -54,6 +56,9 @@ cglobal x264_quant_4x4_dc_core32_mmxext
  cglobal x264_quant_4x4_core32_mmxext
  cglobal x264_quant_8x8_core32_mmxext
  
+cglobal x264_dequant_4x4_mmx
+cglobal x264_dequant_8x8_mmx
+
  %macro MMX_QUANT_AC_START 0
  ;   mov         rdi, rdi        ; &dct[0][0]
  ;   mov         rsi, rsi        ; &quant_mf[0][0]
@@ -374,3 +379,139 @@ x264_quant_8x8_core32_mmxext:
  
      ret
  
+
+;=============================================================================
+; dequant
+;=============================================================================
+
+%macro DEQUANT16_L_1x4 3
+;;; Left-shift dequant of 4 coefficients in place: %1 = (%1 * mf) << mm5, computed in 16-bit lanes.
+;;; %1      dct[y][x]                  (4 words, read then written)
+;;; %2,%3   dequant_mf[i_mf][y][x]     (2 dwords each)
+;;; mm5     i_qbits                    (shift count); clobbers mm0, mm1, mm2
+    movq     mm1, %2            ; first two factors, as dwords
+    movq     mm2, %3            ; next two factors, as dwords
+    movq     mm0, %1            ; four word coefficients
+    packssdw mm1, mm2           ; narrow the four factors to words (signed saturation)
+    pmullw   mm0, mm1           ; low 16 bits of each coefficient * factor
+    psllw    mm0, mm5           ; << i_qbits in every word
+    movq     %1,  mm0           ; store the result over the input
+%endmacro
+
+%macro DEQUANT16_R_1x4 3
+;;; Right-shift dequant of 4 coefficients in place: %1 = (%1 * mf + mm6) >> mm5, computed in 16-bit lanes.
+;;; %1      dct[y][x]                  (4 words, read then written)
+;;; %2,%3   dequant_mf[i_mf][y][x]     (2 dwords each)
+;;; mm5     -i_qbits                   (arithmetic right-shift count)
+;;; mm6     f as words                 (rounding offset); clobbers mm0, mm1, mm2
+    movq     mm1, %2            ; first two factors, as dwords
+    movq     mm2, %3            ; next two factors, as dwords
+    movq     mm0, %1            ; four word coefficients
+    packssdw mm1, mm2           ; narrow the four factors to words (signed saturation)
+    pmullw   mm0, mm1           ; low 16 bits of each coefficient * factor
+    paddw    mm0, mm6           ; add the rounding offset
+    psraw    mm0, mm5           ; signed >> (-i_qbits) in every word
+    movq     %1,  mm0           ; store the result over the input
+%endmacro
+
+%macro DEQUANT32_R_1x4 3
+;;; Right-shift dequant of 4 coefficients in place through 32-bit intermediates, saturating back to words.
+;;; %1      dct[y][x]                  (4 words, read then written)
+;;; %2,%3   dequant_mf[i_mf][y][x]     (2 dwords each, used directly as memory operands)
+;;; mm5     -i_qbits                   (arithmetic right-shift count)
+;;; mm6     f as dwords                (rounding offset added before the shift)
+;;; mm7     0                          (listed as an input but not referenced in this macro); clobbers mm0-mm3
+    movq      mm0, %1           ; four word coefficients c0..c3
+    movq      mm1, mm0
+    punpcklwd mm0, mm0          ; mm0 = c0,c0,c1,c1 (each coefficient fills both words of a dword)
+    punpckhwd mm1, mm1          ; mm1 = c2,c2,c3,c3
+
+    movq      mm2, mm0
+    movq      mm3, mm1
+    pmulhw    mm0, %2           ; per word: high 16 bits of the signed product with the matching factor word
+    pmulhw    mm1, %3
+    pmullw    mm2, %2           ; per word: low 16 bits of the same products
+    pmullw    mm3, %3
+    pslld     mm0, 16           ; move the high half obtained from the factor's low word to bits 31..16
+    pslld     mm1, 16
+    paddd     mm0, mm2          ; combine halves into a 32-bit product; pmulhw treats the factor's low word as signed,
+    paddd     mm1, mm3          ; so this presumably relies on factors < 0x8000 -- TODO confirm against the tables
+
+    paddd     mm0, mm6          ; add the rounding offset
+    paddd     mm1, mm6
+    psrad     mm0, mm5          ; signed >> (-i_qbits) in every dword
+    psrad     mm1, mm5
+
+    packssdw  mm0, mm1          ; narrow the four dwords back to words (signed saturation)
+    movq      %1,  mm0          ; store the result over the input
+%endmacro
+
+%macro DEQUANT_WxH 3
+ALIGN 16
+;;; void %1( int16_t dct[], int dequant_mf[6][1<<%3], int i_qp ) -- instantiated as x264_dequant_4x4_mmx and x264_dequant_8x8_mmx
+%1:
+;   mov  rdi, rdi   ; dct
+;   mov  rsi, rsi   ; dequant_mf
+;   mov  edx, edx   ; i_qp
+;;; %2 = number of 4-coefficient groups processed; %3 = log2(ints per dequant table) and the bias in i_qbits = i_qp/6 - %3
+    imul eax, edx, 0x2b ; eax = i_qp * 43
+    shr  eax, 8     ; eax = i_qp / 6  (43/256 approximates 1/6; exact for 0 <= i_qp <= 130)
+    lea  ecx, [eax+eax*2] ; ecx = 3 * (i_qp / 6)
+    sub  edx, ecx
+    sub  edx, ecx   ; edx = i_qp - 6*(i_qp/6) = i_mf = i_qp % 6
+    shl  edx, %3+2  ; byte offset of table i_mf: (1<<%3) ints of 4 bytes each
+    movsxd rdx, edx ; widen the offset to 64 bits before adding it to the pointer
+    add  rsi, rdx   ; dequant_mf[i_mf]
+
+    sub  eax, %3    ; eax = i_qbits = i_qp/6 - %3
+    cmp  eax, -2
+    jle  .rshift32  ; i_qbits <= -2: dct * dequant overflows 16bit, use 32-bit products
+    cmp  eax, -1
+    jle  .rshift16  ; i_qbits == -1: negative qbits => rightshift, 16-bit products
+
+.lshift:
+    movd mm5, eax   ; mm5 = i_qbits (>= 0 on this path)
+
+%rep %2
+    DEQUANT16_L_1x4 [rdi], [rsi], [rsi+8]
+    add  rsi, byte 16 ; advance past 4 int factors
+    add  rdi, byte 8  ; advance past 4 int16 coefficients
+%endrep
+
+    ret             ; NOTE(review): no emms here; presumably the caller restores x87 state -- confirm
+
+.rshift16:
+    neg   eax       ; eax = -i_qbits (1 on this path)
+    movd  mm5, eax  ; mm5 = right-shift count
+    movq  mm6, [pw_1]
+    pxor  mm7, mm7  ; mm7 = 0 (not read by DEQUANT16_R_1x4)
+    psllw mm6, mm5
+    psrlw mm6, 1    ; mm6 = f = (1 << -i_qbits) >> 1 in every word
+
+%rep %2
+    DEQUANT16_R_1x4 [rdi], [rsi], [rsi+8]
+    add  rsi, byte 16 ; advance past 4 int factors
+    add  rdi, byte 8  ; advance past 4 int16 coefficients
+%endrep
+
+    ret
+
+.rshift32:
+    neg   eax       ; eax = -i_qbits (>= 2 on this path)
+    movd  mm5, eax  ; mm5 = right-shift count
+    movq  mm6, [pd_1]
+    pxor  mm7, mm7  ; mm7 = 0 (documented input of DEQUANT32_R_1x4, which does not read it)
+    pslld mm6, mm5
+    psrld mm6, 1    ; mm6 = f = (1 << -i_qbits) >> 1 in every dword
+
+%rep %2
+    DEQUANT32_R_1x4 [rdi], [rsi], [rsi+8]
+    add  rsi, byte 16 ; advance past 4 int factors
+    add  rdi, byte 8  ; advance past 4 int16 coefficients
+%endrep
+
+    ret
+%endmacro
+
+DEQUANT_WxH x264_dequant_4x4_mmx, 4, 4      ; 4 groups of 4 coefficients, 1<<4 factors per table, i_qbits = i_qp/6 - 4
+DEQUANT_WxH x264_dequant_8x8_mmx, 16, 6     ; 16 groups of 4 coefficients, 1<<6 factors per table, i_qbits = i_qp/6 - 6
diff --git a/common/i386/quant-a.asm b/common/i386/quant-a.asm

index b8813b365d5a56c07962795dff3fed81feee3074..ec2680c541ea8bc719a7b559abdba660f19ec0fa 100644 (file)
--- a/common/i386/quant-a.asm
+++ b/common/i386/quant-a.asm
@@ -42,7 +42,9 @@ BITS 32
      %endif
  %endmacro
  
-ALIGN 16
+SECTION .rodata
+pw_1:  times 4 dw 1    ; four 16-bit words, each 1 (one 64-bit MMX operand)
+pd_1:  times 2 dd 1    ; two 32-bit dwords, each 1 (one 64-bit MMX operand)
  
  SECTION .text
  
@@ -61,6 +63,9 @@ cglobal x264_quant_4x4_dc_core32_mmxext
  cglobal x264_quant_4x4_core32_mmxext
  cglobal x264_quant_8x8_core32_mmxext
  
+cglobal x264_dequant_4x4_mmx
+cglobal x264_dequant_8x8_mmx
+
  %macro MMX_QUANT_AC_START 0
      mov         eax, [esp+ 4]   ; &dct[0][0]
      mov         ecx, [esp+ 8]   ; &quant_mf[0][0]
@@ -381,3 +386,145 @@ x264_quant_8x8_core32_mmxext:
  
      ret
  
+
+;=============================================================================
+; dequant
+;=============================================================================
+
+%macro DEQUANT16_L_1x4 3
+;;; Left-shift dequant of 4 coefficients in place: %1 = (%1 * mf) << mm5, computed in 16-bit lanes.
+;;; %1      dct[y][x]                  (4 words, read then written)
+;;; %2,%3   dequant_mf[i_mf][y][x]     (2 dwords each)
+;;; mm5     i_qbits                    (shift count); clobbers mm0, mm1, mm2
+    movq     mm1, %2            ; first two factors, as dwords
+    movq     mm2, %3            ; next two factors, as dwords
+    movq     mm0, %1            ; four word coefficients
+    packssdw mm1, mm2           ; narrow the four factors to words (signed saturation)
+    pmullw   mm0, mm1           ; low 16 bits of each coefficient * factor
+    psllw    mm0, mm5           ; << i_qbits in every word
+    movq     %1,  mm0           ; store the result over the input
+%endmacro
+
+%macro DEQUANT16_R_1x4 3
+;;; Right-shift dequant of 4 coefficients in place: %1 = (%1 * mf + mm6) >> mm5, computed in 16-bit lanes.
+;;; %1      dct[y][x]                  (4 words, read then written)
+;;; %2,%3   dequant_mf[i_mf][y][x]     (2 dwords each)
+;;; mm5     -i_qbits                   (arithmetic right-shift count)
+;;; mm6     f as words                 (rounding offset); clobbers mm0, mm1, mm2
+    movq     mm1, %2            ; first two factors, as dwords
+    movq     mm2, %3            ; next two factors, as dwords
+    movq     mm0, %1            ; four word coefficients
+    packssdw mm1, mm2           ; narrow the four factors to words (signed saturation)
+    pmullw   mm0, mm1           ; low 16 bits of each coefficient * factor
+    paddw    mm0, mm6           ; add the rounding offset
+    psraw    mm0, mm5           ; signed >> (-i_qbits) in every word
+    movq     %1,  mm0           ; store the result over the input
+%endmacro
+
+%macro DEQUANT32_R_1x4 3
+;;; Right-shift dequant of 4 coefficients in place through 32-bit intermediates, saturating back to words.
+;;; %1      dct[y][x]                  (4 words, read then written)
+;;; %2,%3   dequant_mf[i_mf][y][x]     (2 dwords each, used directly as memory operands)
+;;; mm5     -i_qbits                   (arithmetic right-shift count)
+;;; mm6     f as dwords                (rounding offset added before the shift)
+;;; mm7     0                          (listed as an input but not referenced in this macro); clobbers mm0-mm3
+    movq      mm0, %1           ; four word coefficients c0..c3
+    movq      mm1, mm0
+    punpcklwd mm0, mm0          ; mm0 = c0,c0,c1,c1 (each coefficient fills both words of a dword)
+    punpckhwd mm1, mm1          ; mm1 = c2,c2,c3,c3
+
+    movq      mm2, mm0
+    movq      mm3, mm1
+    pmulhw    mm0, %2           ; per word: high 16 bits of the signed product with the matching factor word
+    pmulhw    mm1, %3
+    pmullw    mm2, %2           ; per word: low 16 bits of the same products
+    pmullw    mm3, %3
+    pslld     mm0, 16           ; move the high half obtained from the factor's low word to bits 31..16
+    pslld     mm1, 16
+    paddd     mm0, mm2          ; combine halves into a 32-bit product; pmulhw treats the factor's low word as signed,
+    paddd     mm1, mm3          ; so this presumably relies on factors < 0x8000 -- TODO confirm against the tables
+
+    paddd     mm0, mm6          ; add the rounding offset
+    paddd     mm1, mm6
+    psrad     mm0, mm5          ; signed >> (-i_qbits) in every dword
+    psrad     mm1, mm5
+
+    packssdw  mm0, mm1          ; narrow the four dwords back to words (signed saturation)
+    movq      %1,  mm0          ; store the result over the input
+%endmacro
+
+%macro DEQUANT_WxH 3
+ALIGN 16
+;;; void %1( int16_t dct[], int dequant_mf[6][1<<%3], int i_qp ) -- %2 = groups of 4 coefs (must be even), %3 = log2(ints per table) = i_qbits bias
+%1:
+    mov  edx, [esp+12] ; i_qp
+    imul eax, edx, 0x2b ; eax = i_qp * 43
+    shr  eax, 8       ; eax = i_qp / 6  (43/256 approximates 1/6; exact for 0 <= i_qp <= 130)
+    lea  ecx, [eax+eax*2] ; ecx = 3 * (i_qp / 6)
+    sub  edx, ecx
+    sub  edx, ecx     ; edx = i_qp - 6*(i_qp/6) = i_mf = i_qp % 6
+    shl  edx, %3+2    ; byte offset of table i_mf: (1<<%3) ints of 4 bytes each
+    add  edx, [esp+8] ; dequant_mf[i_mf]
+    mov  ecx, [esp+4] ; dct
+
+    sub  eax, %3      ; eax = i_qbits = i_qp/6 - %3; flags feed the jge below
+    jge  .lshift      ; i_qbits >= 0 => leftshift
+    cmp  eax, byte -1
+    je   .rshift16    ; i_qbits == -1: negative qbits => rightshift, 16-bit products
+    jmp  .rshift32    ; i_qbits <= -2: dct * dequant overflows 16bit, use 32-bit products
+
+.lshift:
+    movd mm5, eax     ; mm5 = i_qbits
+
+    mov  eax, 8*(%2-1) ; byte offset of the last group of 4 coefficients; walk backwards
+.loopl16:
+%rep 2
+    DEQUANT16_L_1x4 [ecx+eax], [edx+eax*2], [edx+eax*2+8]
+    sub  eax, byte 8  ; previous group (factors are ints, hence eax*2 above)
+%endrep
+    jge  .loopl16     ; continue while the offset has not gone negative
+
+    nop               ; NOTE(review): presumably keeps ret from directly following the branch -- confirm
+    ret               ; NOTE(review): no emms here; presumably the caller restores x87 state -- confirm
+
+.rshift16:
+    neg   eax         ; eax = -i_qbits (1 on this path)
+    movq  mm6, [pw_1]
+    movd  mm5, eax    ; mm5 = right-shift count
+    pxor  mm7, mm7    ; mm7 = 0 (not read by DEQUANT16_R_1x4)
+    psllw mm6, mm5
+    psrlw mm6, 1      ; mm6 = f = (1 << -i_qbits) >> 1 in every word
+
+    mov  eax, 8*(%2-1) ; byte offset of the last group of 4 coefficients; walk backwards
+.loopr16:
+%rep 2
+    DEQUANT16_R_1x4 [ecx+eax], [edx+eax*2], [edx+eax*2+8]
+    sub  eax, byte 8  ; previous group
+%endrep
+    jge  .loopr16     ; continue while the offset has not gone negative
+
+    nop
+    ret
+
+.rshift32:
+    neg   eax         ; eax = -i_qbits (>= 2 on this path)
+    movq  mm6, [pd_1]
+    movd  mm5, eax    ; mm5 = right-shift count
+    pxor  mm7, mm7    ; mm7 = 0 (documented input of DEQUANT32_R_1x4, which does not read it)
+    pslld mm6, mm5
+    psrld mm6, 1      ; mm6 = f = (1 << -i_qbits) >> 1 in every dword
+
+    mov  eax, 8*(%2-1) ; byte offset of the last group of 4 coefficients; walk backwards
+.loopr32:
+%rep 2
+    DEQUANT32_R_1x4 [ecx+eax], [edx+eax*2], [edx+eax*2+8]
+    sub  eax, byte 8  ; previous group
+%endrep
+    jge  .loopr32     ; continue while the offset has not gone negative
+
+    nop
+    ret
+%endmacro
+
+DEQUANT_WxH x264_dequant_4x4_mmx, 4, 4      ; 4 groups of 4 coefficients, 1<<4 factors per table, i_qbits = i_qp/6 - 4
+DEQUANT_WxH x264_dequant_8x8_mmx, 16, 6     ; 16 groups of 4 coefficients, 1<<6 factors per table, i_qbits = i_qp/6 - 6
diff --git a/common/i386/quant.h b/common/i386/quant.h

index 87fabbd4a6affc9ec79e9bdde0deb17182bbcdf4..ec42f4e18a2a8779813b10ed76d0c1ad76936d84 100644 (file)
--- a/common/i386/quant.h
+++ b/common/i386/quant.h
@@ -50,4 +50,7 @@ void x264_quant_4x4_dc_core32_mmxext( int16_t dct[4][4],
  void x264_quant_2x2_dc_core32_mmxext( int16_t dct[2][2],
      int const i_qmf, int const i_qbits, int const f );
  
+void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
+void x264_dequant_8x8_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
+
  #endif
diff --git a/common/macroblock.c b/common/macroblock.c

index 0d7859199cf7ceb52c2a08546bbc9e88b74c9437..450def84039ae33fdd0c8909352ee1c948d2e583 100644 (file)
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -84,132 +84,6 @@ int x264_mb_transform_8x8_allowed( x264_t *h )
      return 1;
  }
  
-/****************************************************************************
- * Scan and Quant functions
- ****************************************************************************/
-void x264_mb_dequant_2x2_dc( int16_t dct[2][2], int dequant_mf[6][4][4], int i_qscale )
-{
-    const int i_qbits = i_qscale/6 - 5;
-
-    if( i_qbits >= 0 )
-    {
-        const int i_dmf = dequant_mf[i_qscale%6][0][0] << i_qbits;
-
-        dct[0][0] *= i_dmf;
-        dct[0][1] *= i_dmf;
-        dct[1][0] *= i_dmf;
-        dct[1][1] *= i_dmf;
-    }
-    else
-    {
-        const int i_dmf = dequant_mf[i_qscale%6][0][0];
-        // chroma DC is truncated, not rounded
-
-        dct[0][0] = ( dct[0][0] * i_dmf ) >> (-i_qbits);
-        dct[0][1] = ( dct[0][1] * i_dmf ) >> (-i_qbits);
-        dct[1][0] = ( dct[1][0] * i_dmf ) >> (-i_qbits);
-        dct[1][1] = ( dct[1][1] * i_dmf ) >> (-i_qbits);
-    }
-}
-
-void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qscale )
-{
-    const int i_qbits = i_qscale/6 - 6;
-    int y;
-
-    if( i_qbits >= 0 )
-    {
-        const int i_dmf = dequant_mf[i_qscale%6][0][0] << i_qbits;
-
-        for( y = 0; y < 4; y++ )
-        {
-            dct[y][0] *= i_dmf;
-            dct[y][1] *= i_dmf;
-            dct[y][2] *= i_dmf;
-            dct[y][3] *= i_dmf;
-        }
-    }
-    else
-    {
-        const int i_dmf = dequant_mf[i_qscale%6][0][0];
-        const int f = 1 << (-i_qbits-1);
-
-        for( y = 0; y < 4; y++ )
-        {
-            dct[y][0] = ( dct[y][0] * i_dmf + f ) >> (-i_qbits);
-            dct[y][1] = ( dct[y][1] * i_dmf + f ) >> (-i_qbits);
-            dct[y][2] = ( dct[y][2] * i_dmf + f ) >> (-i_qbits);
-            dct[y][3] = ( dct[y][3] * i_dmf + f ) >> (-i_qbits);
-        }
-    }
-}
-
-void x264_mb_dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qscale )
-{
-    const int i_mf = i_qscale%6;
-    const int i_qbits = i_qscale/6 - 4;
-    int y;
-
-    if( i_qbits >= 0 )
-    {
-        for( y = 0; y < 4; y++ )
-        {
-            dct[y][0] = ( dct[y][0] * dequant_mf[i_mf][y][0] ) << i_qbits;
-            dct[y][1] = ( dct[y][1] * dequant_mf[i_mf][y][1] ) << i_qbits;
-            dct[y][2] = ( dct[y][2] * dequant_mf[i_mf][y][2] ) << i_qbits;
-            dct[y][3] = ( dct[y][3] * dequant_mf[i_mf][y][3] ) << i_qbits;
-        }
-    }
-    else
-    {
-        const int f = 1 << (-i_qbits-1);
-        for( y = 0; y < 4; y++ )
-        {
-            dct[y][0] = ( dct[y][0] * dequant_mf[i_mf][y][0] + f ) >> (-i_qbits);
-            dct[y][1] = ( dct[y][1] * dequant_mf[i_mf][y][1] + f ) >> (-i_qbits);
-            dct[y][2] = ( dct[y][2] * dequant_mf[i_mf][y][2] + f ) >> (-i_qbits);
-            dct[y][3] = ( dct[y][3] * dequant_mf[i_mf][y][3] + f ) >> (-i_qbits);
-        }
-    }
-}
-
-void x264_mb_dequant_8x8( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qscale )
-{
-    const int i_mf = i_qscale%6;
-    const int i_qbits = i_qscale/6 - 6;
-    int y;
-
-    if( i_qbits >= 0 )
-    {
-        for( y = 0; y < 8; y++ )
-        {
-            dct[y][0] = ( dct[y][0] * dequant_mf[i_mf][y][0] ) << i_qbits;
-            dct[y][1] = ( dct[y][1] * dequant_mf[i_mf][y][1] ) << i_qbits;
-            dct[y][2] = ( dct[y][2] * dequant_mf[i_mf][y][2] ) << i_qbits;
-            dct[y][3] = ( dct[y][3] * dequant_mf[i_mf][y][3] ) << i_qbits;
-            dct[y][4] = ( dct[y][4] * dequant_mf[i_mf][y][4] ) << i_qbits;
-            dct[y][5] = ( dct[y][5] * dequant_mf[i_mf][y][5] ) << i_qbits;
-            dct[y][6] = ( dct[y][6] * dequant_mf[i_mf][y][6] ) << i_qbits;
-            dct[y][7] = ( dct[y][7] * dequant_mf[i_mf][y][7] ) << i_qbits;
-        }
-    }
-    else
-    {
-        const int f = 1 << (-i_qbits-1);
-        for( y = 0; y < 8; y++ )
-        {
-            dct[y][0] = ( dct[y][0] * dequant_mf[i_mf][y][0] + f ) >> (-i_qbits);
-            dct[y][1] = ( dct[y][1] * dequant_mf[i_mf][y][1] + f ) >> (-i_qbits);
-            dct[y][2] = ( dct[y][2] * dequant_mf[i_mf][y][2] + f ) >> (-i_qbits);
-            dct[y][3] = ( dct[y][3] * dequant_mf[i_mf][y][3] + f ) >> (-i_qbits);
-            dct[y][4] = ( dct[y][4] * dequant_mf[i_mf][y][4] + f ) >> (-i_qbits);
-            dct[y][5] = ( dct[y][5] * dequant_mf[i_mf][y][5] + f ) >> (-i_qbits);
-            dct[y][6] = ( dct[y][6] * dequant_mf[i_mf][y][6] + f ) >> (-i_qbits);
-            dct[y][7] = ( dct[y][7] * dequant_mf[i_mf][y][7] + f ) >> (-i_qbits);
-        }
-    }
-}
-
  void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int mvp[2] )
  {
      const int i8 = x264_scan8[idx];
diff --git a/common/macroblock.h b/common/macroblock.h

index 255eeded8b9172c0fcf605224c8d39812b7857ad..d41c09a63a8a4ad0dba6dfeaec7fb33c5cb70055 100644 (file)
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -218,11 +218,6 @@ void x264_macroblock_cache_end( x264_t *h );
  
  void x264_macroblock_bipred_init( x264_t *h );
  
-void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qscale );
-void x264_mb_dequant_2x2_dc( int16_t dct[2][2], int dequant_mf[6][4][4], int i_qscale );
-void x264_mb_dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qscale );
-void x264_mb_dequant_8x8( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qscale );
-
  /* x264_mb_predict_mv_16x16:
   *      set mvp with predicted mv for D_16x16 block
   *      h->mb. need only valid values from other blocks */
diff --git a/common/quant.c b/common/quant.c

index fc32cfd2fa33f02dc1981f7cb5f32c84ad143070..72c18cd9401685c9d458e5f0f990bb254765207e 100644 (file)
--- a/common/quant.c
+++ b/common/quant.c
@@ -63,6 +63,132 @@ static void quant_2x2_dc_core( int16_t dct[2][2], int i_quant_mf, int i_qbits, i
      QUANT_ONE( dct[0][3], i_quant_mf );
  }
  
+#define DEQUANT_SHL( x ) /* scale dct[y][x] by its table factor, then shift left; uses dct, dequant_mf, i_mf, i_qbits, y from the expansion site */ \
+    dct[y][x] = ( dct[y][x] * dequant_mf[i_mf][y][x] ) << i_qbits
+
+#define DEQUANT_SHR( x ) /* scale dct[y][x], add rounding offset f, then shift right by -i_qbits; also uses f from the expansion site */ \
+    dct[y][x] = ( dct[y][x] * dequant_mf[i_mf][y][x] + f ) >> (-i_qbits)
+
+static void dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+{   /* Dequantize a 4x4 coefficient block in place with table i_qp%6 and a net shift of i_qp/6 - 4. */
+    const int i_mf = i_qp%6;            /* selects one of the 6 factor tables */
+    const int i_qbits = i_qp/6 - 4;     /* >= 0: shift left; < 0: round and shift right */
+    int y;
+
+    if( i_qbits >= 0 )
+    {
+        for( y = 0; y < 4; y++ )        /* one row per iteration, columns unrolled */
+        {
+            DEQUANT_SHL( 0 );
+            DEQUANT_SHL( 1 );
+            DEQUANT_SHL( 2 );
+            DEQUANT_SHL( 3 );
+        }
+    }
+    else
+    {
+        const int f = 1 << (-i_qbits-1);    /* rounding offset: half of 1 << -i_qbits */
+        for( y = 0; y < 4; y++ )        /* one row per iteration, columns unrolled */
+        {
+            DEQUANT_SHR( 0 );
+            DEQUANT_SHR( 1 );
+            DEQUANT_SHR( 2 );
+            DEQUANT_SHR( 3 );
+        }
+    }
+}
+
+static void dequant_8x8( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp )
+{   /* Dequantize an 8x8 coefficient block in place with table i_qp%6 and a net shift of i_qp/6 - 6. */
+    const int i_mf = i_qp%6;            /* selects one of the 6 factor tables */
+    const int i_qbits = i_qp/6 - 6;     /* >= 0: shift left; < 0: round and shift right */
+    int y;
+
+    if( i_qbits >= 0 )
+    {
+        for( y = 0; y < 8; y++ )        /* one row per iteration, columns unrolled */
+        {
+            DEQUANT_SHL( 0 );
+            DEQUANT_SHL( 1 );
+            DEQUANT_SHL( 2 );
+            DEQUANT_SHL( 3 );
+            DEQUANT_SHL( 4 );
+            DEQUANT_SHL( 5 );
+            DEQUANT_SHL( 6 );
+            DEQUANT_SHL( 7 );
+        }
+    }
+    else
+    {
+        const int f = 1 << (-i_qbits-1);    /* rounding offset: half of 1 << -i_qbits */
+        for( y = 0; y < 8; y++ )        /* one row per iteration, columns unrolled */
+        {
+            DEQUANT_SHR( 0 );
+            DEQUANT_SHR( 1 );
+            DEQUANT_SHR( 2 );
+            DEQUANT_SHR( 3 );
+            DEQUANT_SHR( 4 );
+            DEQUANT_SHR( 5 );
+            DEQUANT_SHR( 6 );
+            DEQUANT_SHR( 7 );
+        }
+    }
+}
+
+void x264_mb_dequant_2x2_dc( int16_t dct[2][2], int dequant_mf[6][4][4], int i_qp )
+{   /* Dequantize the four values of a 2x2 DC block in place using the single factor dequant_mf[i_qp%6][0][0]. */
+    const int i_qbits = i_qp/6 - 5;     /* >= 0: shift left; < 0: shift right without rounding */
+
+    if( i_qbits >= 0 )
+    {
+        const int i_dmf = dequant_mf[i_qp%6][0][0] << i_qbits;   /* fold the left shift into the factor once */
+        dct[0][0] *= i_dmf;
+        dct[0][1] *= i_dmf;
+        dct[1][0] *= i_dmf;
+        dct[1][1] *= i_dmf;
+    }
+    else
+    {
+        const int i_dmf = dequant_mf[i_qp%6][0][0];
+        // chroma DC is truncated, not rounded (no offset is added before the shift)
+        dct[0][0] = ( dct[0][0] * i_dmf ) >> (-i_qbits);
+        dct[0][1] = ( dct[0][1] * i_dmf ) >> (-i_qbits);
+        dct[1][0] = ( dct[1][0] * i_dmf ) >> (-i_qbits);
+        dct[1][1] = ( dct[1][1] * i_dmf ) >> (-i_qbits);
+    }
+}
+
+void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+{   /* Dequantize a 4x4 DC block in place using the single factor dequant_mf[i_qp%6][0][0] for all 16 values. */
+    const int i_qbits = i_qp/6 - 6;     /* >= 0: shift left; < 0: round and shift right */
+    int y;
+
+    if( i_qbits >= 0 )
+    {
+        const int i_dmf = dequant_mf[i_qp%6][0][0] << i_qbits;   /* fold the left shift into the factor once */
+
+        for( y = 0; y < 4; y++ )
+        {
+            dct[y][0] *= i_dmf;
+            dct[y][1] *= i_dmf;
+            dct[y][2] *= i_dmf;
+            dct[y][3] *= i_dmf;
+        }
+    }
+    else
+    {
+        const int i_dmf = dequant_mf[i_qp%6][0][0];
+        const int f = 1 << (-i_qbits-1);    /* rounding offset: half of 1 << -i_qbits */
+
+        for( y = 0; y < 4; y++ )
+        {
+            dct[y][0] = ( dct[y][0] * i_dmf + f ) >> (-i_qbits);
+            dct[y][1] = ( dct[y][1] * i_dmf + f ) >> (-i_qbits);
+            dct[y][2] = ( dct[y][2] * i_dmf + f ) >> (-i_qbits);
+            dct[y][3] = ( dct[y][3] * i_dmf + f ) >> (-i_qbits);
+        }
+    }
+}
  
  void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
  {
@@ -73,6 +199,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
      pf->quant_4x4_dc_core = quant_4x4_dc_core;
      pf->quant_2x2_dc_core = quant_2x2_dc_core;
  
+    pf->dequant_4x4 = dequant_4x4;
+    pf->dequant_8x8 = dequant_8x8;
+
  #ifdef HAVE_MMXEXT
  
      /* determine the biggest coeffient in all quant8_mf tables */
@@ -133,5 +262,13 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
          pf->quant_2x2_dc_core = x264_quant_2x2_dc_core32_mmxext;
      }
  
+    if( cpu&X264_CPU_MMXEXT )
+    {
+        /* dequant is not subject to the above CQM-dependent overflow issues,
+         * as long as the inputs are in the range generable by dct+quant.
+         * that is not guaranteed by the standard, but is true within x264 */
+        pf->dequant_4x4 = x264_dequant_4x4_mmx;
+        pf->dequant_8x8 = x264_dequant_8x8_mmx;
+    }
  #endif  /* HAVE_MMXEXT */
  }
diff --git a/common/quant.h b/common/quant.h

index ca08b342163262151ac3418ba418afe2f5ecd961..3294df592f3d013c465715a481de3a10d4a0233b 100644 (file)
--- a/common/quant.h
+++ b/common/quant.h
@@ -29,8 +29,14 @@ typedef struct
      void (*quant_4x4_core)( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f );
      void (*quant_4x4_dc_core)( int16_t dct[4][4], int i_quant_mf, int i_qbits, int f );
      void (*quant_2x2_dc_core)( int16_t dct[2][2], int i_quant_mf, int i_qbits, int f );
+
+    void (*dequant_4x4)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
+    void (*dequant_8x8)( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
  } x264_quant_function_t;
  
  void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf );
  
+void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qscale );
+void x264_mb_dequant_2x2_dc( int16_t dct[2][2], int dequant_mf[6][4][4], int i_qscale );
+
  #endif
diff --git a/encoder/macroblock.c b/encoder/macroblock.c

index 55d3093cb7aa5b88b0de90e8eb7ec997c18fa7fd..50b9d778f24998d4cbff6d4e095b97bc062b85c2 100644 (file)
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -213,7 +213,7 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale )
          quant_4x4( h, dct4x4, h->quant4_mf[CQM_4IY], i_qscale, 1 );
  
      scan_zigzag_4x4full( h->dct.block[idx].luma4x4, dct4x4 );
-    x264_mb_dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qscale );
+    h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qscale );
  
      /* output samples to fdec */
      h->dctf.add4x4_idct( p_dst, i_stride, dct4x4 );
@@ -235,7 +235,7 @@ void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qscale )
          quant_8x8( h, dct8x8, h->quant8_mf[CQM_8IY], i_qscale, 1 );
  
      scan_zigzag_8x8full( h->dct.luma8x8[idx], dct8x8 );
-    x264_mb_dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qscale );
+    h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qscale );
      h->dctf.add8x8_idct8( p_dst, i_stride, dct8x8 );
  }
  
@@ -275,7 +275,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
              quant_4x4( h, dct4x4[1+i], h->quant4_mf[CQM_4IY], i_qscale, 1 );
  
          scan_zigzag_4x4( h->dct.block[i].residual_ac, dct4x4[1+i] );
-        x264_mb_dequant_4x4( dct4x4[1+i], h->dequant4_mf[CQM_4IY], i_qscale );
+        h->quantf.dequant_4x4( dct4x4[1+i], h->dequant4_mf[CQM_4IY], i_qscale );
      }
  
      h->dctf.dct4x4dc( dct4x4[0] );
@@ -332,7 +332,7 @@ static void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
              /* no trellis; it doesn't seem to help chroma noticeably */
              quant_4x4( h, dct4x4[i], h->quant4_mf[CQM_4IC + b_inter], i_qscale, !b_inter );
              scan_zigzag_4x4( h->dct.block[16+i+ch*4].residual_ac, dct4x4[i] );
-            x264_mb_dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qscale );
+            h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qscale );
  
              if( b_inter )
              {
@@ -502,7 +502,7 @@ void x264_macroblock_encode( x264_t *h )
                      quant_8x8( h, dct8x8[idx], h->quant8_mf[CQM_8PY], i_qp, 0 );
  
                  scan_zigzag_8x8full( h->dct.luma8x8[idx], dct8x8[idx] );
-                x264_mb_dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
+                h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
  
                  if( !h->mb.b_trellis )
                  {
@@ -544,7 +544,7 @@ void x264_macroblock_encode( x264_t *h )
                          quant_4x4( h, dct4x4[idx], h->quant4_mf[CQM_4PY], i_qp, 0 );
  
                      scan_zigzag_4x4full( h->dct.block[idx].luma4x4, dct4x4[idx] );
-                    x264_mb_dequant_4x4( dct4x4[idx], h->dequant4_mf[CQM_4PY], i_qp );
+                    h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[CQM_4PY], i_qp );
  
                      i_decimate_8x8 += x264_mb_decimate_score( h->dct.block[idx].luma4x4, 16 );
                  }
diff --git a/tools/checkasm.c b/tools/checkasm.c

index a91086a002016b0c930c0d6f9ad9a0a5ab2efe12..42f2eea80caf0ba69c25dd3f21a0693f91d5f28c 100644 (file)
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -365,7 +365,8 @@ static int check_quant( int cpu_ref, int cpu_new )
      x264_quant_function_t qf_a;
      int16_t dct1[64], dct2[64];
      uint8_t cqm_buf[64];
-    int ret = 0, ok = 1, used_asm = 0;
+    int ret = 0, ok, used_asm;
+    int oks[2] = {1,1}, used_asms[2] = {0,0};
      int i, i_cqm;
      x264_t h_buf;
      x264_t *h = &h_buf;
@@ -400,14 +401,14 @@ static int check_quant( int cpu_ref, int cpu_new )
  #define TEST_QUANT( name, cqm ) \
          if( qf_a.name != qf_ref.name ) \
          { \
-            used_asm = 1; \
+            used_asms[0] = 1; \
              for( i = 0; i < 64; i++ ) \
-                dct1[i] = dct2[i] = rand() & 0xfff; \
+                dct1[i] = dct2[i] = (rand() & 0x1fff) - 0xfff; \
              qf_c.name( (void*)dct1, cqm, 20, (1<<20)/6 ); \
              qf_a.name( (void*)dct2, cqm, 20, (1<<20)/6 ); \
              if( memcmp( dct1, dct2, 64*2 ) )       \
              { \
-                ok = 0; \
+                oks[0] = 0; \
                  fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \
              } \
          }
@@ -418,10 +419,39 @@ static int check_quant( int cpu_ref, int cpu_new )
          TEST_QUANT( quant_4x4_core, *h->quant4_mf[CQM_4PY] );
          TEST_QUANT( quant_4x4_dc_core, ***h->quant4_mf[CQM_4IY] );
          TEST_QUANT( quant_2x2_dc_core, ***h->quant4_mf[CQM_4IC] );
+
+#define TEST_DEQUANT( name, quant, dqm, cqm, shift ) /* compare qf_c.name against qf_a.name on quantized random data */ \
+        if( qf_a.name != qf_ref.name ) /* skip when qf_a and qf_ref hold the same function pointer */ \
+        { \
+            int qp; \
+            used_asms[1] = 1; \
+            for( qp = 51; qp >= 0; qp-- ) /* every qp from 51 down to and including 0 */ \
+            { \
+                for( i = 0; i < 64; i++ ) \
+                    dct1[i] = dct2[i] = (rand() & 0x1fff) - 0xfff; /* random values in [-4095, 4096] */ \
+                qf_c.quant( (void*)dct1, cqm[qp%6], shift+qp/6, 0 ); /* quantize first so dequant sees quantizer output */ \
+                memcpy( dct2, dct1, sizeof(dct2) ); /* both implementations get identical input */ \
+                qf_c.name( (void*)dct1, dqm, qp ); \
+                qf_a.name( (void*)dct2, dqm, qp ); \
+                if( memcmp( dct1, dct2, 64*2 ) ) \
+                { \
+                    oks[1] = 0; \
+                    fprintf( stderr, #name "(qp=%d, cqm=%d): [FAILED]\n", qp, i_cqm ); \
+                    break; /* report only the first failing qp */ \
+                } \
+            } \
+        }
+
+        TEST_DEQUANT( dequant_8x8, quant_8x8_core, h->dequant8_mf[CQM_8PY], h->quant8_mf[CQM_8PY], 16 );
+        TEST_DEQUANT( dequant_4x4, quant_4x4_core, h->dequant4_mf[CQM_4PY], h->quant4_mf[CQM_4PY], 15 );
      }
  
+    ok = oks[0]; used_asm = used_asms[0];
      report( "quant :" );
  
+    ok = oks[1]; used_asm = used_asms[1];
+    report( "dequant :" );
+
      return ret;
  }
author	Loren Merritt <pengvado@videolan.org>
	Sun, 6 Nov 2005 07:07:30 +0000 (07:07 +0000)
committer	Loren Merritt <pengvado@videolan.org>
	Sun, 6 Nov 2005 07:07:30 +0000 (07:07 +0000)
common/amd64/quant-a.asm		patch \| blob \| history
common/i386/quant-a.asm		patch \| blob \| history
common/i386/quant.h		patch \| blob \| history
common/macroblock.c		patch \| blob \| history
common/macroblock.h		patch \| blob \| history
common/quant.c		patch \| blob \| history
common/quant.h		patch \| blob \| history
encoder/macroblock.c		patch \| blob \| history
tools/checkasm.c		patch \| blob \| history