From: Loren Merritt <pengvado@videolan.org>
Date: Sat, 24 Sep 2005 18:22:02 +0000 (+0000)
Subject: faster mmx quant 15bit, and add 16bit version. total speedup: ~0.3%
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=cfebeac1a475f4a2ee57e5dd3cd1ff0c560f38db;p=libx264

faster mmx quant 15bit, and add 16bit version. total speedup: ~0.3%
patch by Christian Heine.


git-svn-id: svn://svn.videolan.org/x264/trunk@298 df754926-b1dd-0310-bc7b-ec298dee348c
---

diff --git a/common/amd64/quant-a.asm b/common/amd64/quant-a.asm
index e3699708..91d9347b 100644
--- a/common/amd64/quant-a.asm
+++ b/common/amd64/quant-a.asm
@@ -21,6 +21,16 @@
 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
 ;*****************************************************************************
 
+;*****************************************************************************
+;*                                                                           *
+;*  Revision history:                                                        *
+;*                                                                           *
+;*  2005.07.26  quant 4x4 & 8x8 MMX functions (AI)                           *
+;*  2005.09.04  quant MMXEXT (added precision) and DC (CH)                   *
+;*  2005.09.21  faster MMX and added MMXEXT16 (CH)                           *
+;*                                                                           *
+;*****************************************************************************
+
 BITS 64
 
 %macro cglobal 1
@@ -36,184 +46,338 @@ ALIGN 16
 
 SECTION .text
 
-cglobal x264_quant_8x8_core16_mmx
-cglobal x264_quant_4x4_core16_mmx
-cglobal x264_quant_8x8_core32_mmx
-cglobal x264_quant_4x4_core32_mmx
-cglobal x264_quant_4x4_dc_core32_mmx
-cglobal x264_quant_2x2_dc_core32_mmx
-
-%macro QUANT_AC_START 0
-;   mov       rdi, rdi  ; dct
-;   mov       rsi, rsi  ; quant_mf
-    movd      mm6, edx  ; i_qbits
-    movd      mm7, ecx  ; f
-    punpckldq mm7, mm7
-%endmacro
-
-%macro QUANT_DC_START 0
-;   mov       rdi, rdi  ; dct
-    movd      mm5, rsi  ; i_quant_mf
-    movd      mm6, edx  ; i_qbits
-    movd      mm7, ecx  ; f
-    punpckldq mm5, mm5
-    punpckldq mm7, mm7
+cglobal x264_quant_2x2_dc_core15_mmx
+cglobal x264_quant_4x4_dc_core15_mmx
+cglobal x264_quant_4x4_core15_mmx
+cglobal x264_quant_8x8_core15_mmx
+
+cglobal x264_quant_2x2_dc_core16_mmxext
+cglobal x264_quant_4x4_dc_core16_mmxext
+cglobal x264_quant_4x4_core16_mmxext
+cglobal x264_quant_8x8_core16_mmxext
+
+cglobal x264_quant_2x2_dc_core32_mmxext
+cglobal x264_quant_4x4_dc_core32_mmxext
+cglobal x264_quant_4x4_core32_mmxext
+cglobal x264_quant_8x8_core32_mmxext
+
+%macro MMX_QUANT_AC_START 0
+;   mov         rdi, rdi        ; &dct[0][0]
+;   mov         rsi, rsi        ; &quant_mf[0][0]
+    movd        mm6, edx        ; i_qbits
+    movd        mm7, ecx        ; f
+    punpckldq   mm7, mm7        ; f in each dword
 %endmacro
 
-%macro QUANT16_1x4 5
-;;; %1      dct[y][x]
-;;; %2,%3   quant_mf[i_mf][y][x], entries must fit in int16
-;;; %4      i_qbits
-;;; %5      f as doublewords
-;;; trashes mm0-mm5
-    movq     mm0, %1
-    movq     mm1, %2
-    movq     mm2, %3
-    packssdw mm1, mm2
-
-    movq     mm4, mm0
-    pxor     mm5, mm5
-    pcmpgtw  mm4, mm5
-
-    movq     mm2, mm0
-    pmullw   mm0, mm1
-    pmulhw   mm2, mm1
-
-    movq      mm1, mm0
-    punpcklwd mm0, mm2
-    punpckhwd mm1, mm2
-
-    movq     mm2, %5
-    movq     mm3, %5
-    psubd    mm2, mm0
-    psubd    mm3, mm1
-    paddd    mm0, %5
-    paddd    mm1, %5
-
-    psrad    mm0, %4
-    psrad    mm1, %4
-    psrad    mm2, %4
-    psrad    mm3, %4
-
-    packssdw mm0, mm1
-    packssdw mm2, mm3
-    pxor     mm5, mm5
-    psubw    mm5, mm2
-
-    pand     mm0, mm4
-    pandn    mm4, mm5
-
-    por      mm0, mm4
-    movq     %1,  mm0
+%macro MMX_QUANT15_DC_START 0
+;   mov         rdi, rdi        ; &dct[0][0]
+    movd        mm5, rsi        ; i_qmf
+    movd        mm6, edx        ; i_qbits
+    movd        mm7, ecx        ; f
+    punpcklwd   mm5, mm5
+    punpcklwd   mm5, mm5        ; i_qmf in each word
+    punpckldq   mm7, mm7        ; f in each dword
 %endmacro
 
-%macro QUANT32_1x4 5
-;;; %1      dct[y][x]
-;;; %2,%3   quant_mf[i_mf][y][x]
-;;; %4      i_qbits
-;;; %5      f as doublewords
-;;; trashes mm0-mm4
-    movq        mm0, %1
+%macro MMX_QUANT15_1x4 4
+;;; %1      (m64)       dct[y][x]
+;;; %2      (m64/mmx)   quant_mf[y][x] or quant_mf[0][0] (as int16_t)
+;;; %3      (mmx)       i_qbits in the low doubleword
+;;; %4      (mmx)       f as doublewords
+;;; trashes mm0-mm2,mm4
+    movq        mm0, %1     ; load dct coeffs
     pxor        mm4, mm4
-    pcmpgtw     mm4, mm0        ; mm4 = sign(mm0)
+    pcmpgtw     mm4, mm0    ; sign(coeff)
     pxor        mm0, mm4
-    psubw       mm0, mm4        ; mm0 = abs(mm0)
+    psubw       mm0, mm4    ; abs(coeff)
+
+    movq        mm2, mm0
+    pmullw      mm0, %2
+    pmulhw      mm2, %2
+
     movq        mm1, mm0
-    punpcklwd   mm0, mm0        ; duplicate the words for the upcomming
-    punpckhwd   mm1, mm1        ; 32 bit multiplication
+    punpcklwd   mm0, mm2
+    punpckhwd   mm1, mm2
+
+    paddd       mm0, %4     ; round with f
+    paddd       mm1, %4
+    psrad       mm0, %3
+    psrad       mm1, %3
+
+    packssdw    mm0, mm1    ; pack
+    pxor        mm0, mm4    ; restore sign
+    psubw       mm0, mm4
+    movq        %1, mm0     ; store
+%endmacro
 
-    movq        mm2, mm0        ; like in school ...
-    movq        mm3, mm1
-    pmulhuw     mm0, %2         ; ... multiply the parts ...
-    pmulhuw     mm1, %3
-    pmullw      mm2, %2
-    pmullw      mm3, %3
-    pslld       mm0, 16         ; ... shift ...
-    pslld       mm1, 16
-    paddd       mm0, mm2        ; ... and add them
-    paddd       mm1, mm3
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2],
+;       int const i_qmf, int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_2x2_dc_core15_mmx:
+    MMX_QUANT15_DC_START
+    MMX_QUANT15_1x4 [rdi], mm5, mm6, mm7
+    ret
 
-    paddd       mm0, %5         ; round with f
-    paddd       mm1, %5
-    psrad       mm0, %4
-    psrad       mm1, %4
-    packssdw    mm0, mm1        ; pack & store
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_quant_4x4_dc_core15_mmx( int16_t dct[4][4],
+;       int const i_qmf, int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_4x4_dc_core15_mmx:
+    MMX_QUANT15_DC_START
+
+%rep 4
+    MMX_QUANT15_1x4 [rdi], mm5, mm6, mm7
+    add         rdi, byte 8
+%endrep
+
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_quant_4x4_core15_mmx( int16_t dct[4][4],
+;       int const quant_mf[4][4], int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_4x4_core15_mmx:
+    MMX_QUANT_AC_START
+
+%rep 4
+    movq        mm5, [rsi]
+    packssdw    mm5, [rsi+8]
+    MMX_QUANT15_1x4 [rdi], mm5, mm6, mm7
+    add         rsi, byte 16
+    add         rdi, byte 8
+%endrep
+
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_quant_8x8_core15_mmx( int16_t dct[8][8],
+;       int const quant_mf[8][8], int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_8x8_core15_mmx:
+    MMX_QUANT_AC_START
+
+%rep 16
+    movq        mm5, [rsi]
+    packssdw    mm5, [rsi+8]
+    MMX_QUANT15_1x4 [rdi], mm5, mm6, mm7
+    add         rsi, byte 16
+    add         rdi, byte 8
+%endrep
+
+    ret
+
+; ============================================================================
+
+%macro MMXEXT_QUANT16_DC_START 0
+;   mov         rdi, rdi        ; &dct[0][0]
+    movd        mm5, rsi        ; i_qmf
+    movd        mm6, edx        ; i_qbits
+    movd        mm7, ecx        ; f
+    pshufw      mm5, mm5, 0     ; i_qmf in each word
+    punpckldq   mm7, mm7        ; f in each dword
+%endmacro
+
+%macro MMXEXT_QUANT16_1x4 4
+;;; %1      (m64)       dct[y][x]
+;;; %2      (m64/mmx)   quant_mf[y][x] or quant_mf[0][0] (as uint16_t)
+;;; %3      (mmx)       i_qbits in the low doubleword
+;;; %4      (mmx)       f as doublewords
+;;; trashes mm0-mm2,mm4
+    movq        mm0, %1     ; load dct coeffs
+    pxor        mm4, mm4    ; mm4 = 0
+    pcmpgtw     mm4, mm0    ; sign(coeff): all ones in each word where coeff < 0
     pxor        mm0, mm4
-    psubw       mm0, mm4        ; restore sign
-    movq        %1, mm0
+    psubw       mm0, mm4    ; abs(coeff) = (coeff ^ sign) - sign
+
+    movq        mm2, mm0    ; second copy of abs(coeff) for the high halves
+    pmullw      mm0, %2     ; low 16 bits of abs(coeff) * qmf
+    pmulhuw     mm2, %2     ; high 16 bits of the unsigned product
+
+    movq        mm1, mm0
+    punpcklwd   mm0, mm2    ; 32-bit products for coeffs 0 and 1
+    punpckhwd   mm1, mm2    ; 32-bit products for coeffs 2 and 3
+
+    paddd       mm0, %4     ; round with f
+    paddd       mm1, %4
+    psrad       mm0, %3     ; >> i_qbits
+    psrad       mm1, %3
+
+    packssdw    mm0, mm1    ; pack to words with signed saturation
+    pxor        mm0, mm4    ; restore sign
+    psubw       mm0, mm4
+    movq        %1, mm0     ; store
 %endmacro
 
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_quant_2x2_dc_core16_mmxext( int16_t dct[2][2],
+;       int const i_qmf, int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_2x2_dc_core16_mmxext:
+    MMXEXT_QUANT16_DC_START
+    MMXEXT_QUANT16_1x4 [rdi], mm5, mm6, mm7
+    ret
 
 ALIGN 16
-;;; void x264_quant_8x8_core16_mmx( int16_t dct[8][8], int quant_mf[8][8], int i_qbits, int f )
-x264_quant_8x8_core16_mmx:
-    QUANT_AC_START
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_quant_4x4_dc_core16_mmxext( int16_t dct[4][4],
+;       int const i_qmf, int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_4x4_dc_core16_mmxext:
+    MMXEXT_QUANT16_DC_START
 
-%rep 16
-    QUANT16_1x4 [rdi], [rsi], [rsi+8], mm6, mm7
-    add  rdi, 8
-    add  rsi, 16
+%rep 4
+    MMXEXT_QUANT16_1x4 [rdi], mm5, mm6, mm7
+    add         rdi, byte 8
 %endrep
 
     ret
 
 ALIGN 16
-;;; void x264_quant_4x4_core16_mmx( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f )
-x264_quant_4x4_core16_mmx:
-    QUANT_AC_START
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_quant_4x4_core16_mmxext( int16_t dct[4][4],
+;       int const quant_mf[4][4], int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_4x4_core16_mmxext:
+    MMX_QUANT_AC_START
 
 %rep 4
-    QUANT16_1x4 [rdi], [rsi], [rsi+8], mm6, mm7
-    add  rdi, 8
-    add  rsi, 16
+    pshufw      mm5, [rsi], 10110001b   ; swap the words in each dword: entries 0,1 -> words 1,3
+    paddw       mm5, [rsi+8]            ; entries 2,3 -> words 0,2 (exact only if all entries fit in 16 bits)
+    pshufw      mm5, mm5, 10001101b     ; reorder the words to entries 0,1,2,3
+    MMXEXT_QUANT16_1x4 [rdi], mm5, mm6, mm7
+    add         rsi, byte 16            ; next 4 quant_mf entries (4 bytes each)
+    add         rdi, byte 8             ; next 4 coefficients (2 bytes each)
 %endrep
 
     ret
 
 ALIGN 16
-;;; void x264_quant_8x8_core32_mmx( int16_t dct[8][8], int quant_mf[8][8], int i_qbits, int f )
-x264_quant_8x8_core32_mmx:
-    QUANT_AC_START
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_quant_8x8_core16_mmxext( int16_t dct[8][8],
+;       int const quant_mf[8][8], int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_8x8_core16_mmxext:
+    MMX_QUANT_AC_START
 
 %rep 16
-    QUANT32_1x4 [rdi], [rsi], [rsi+8], mm6, mm7
-    add  rdi, 8
-    add  rsi, 16
+    pshufw      mm5, [rsi], 10110001b   ; swap the words in each dword: entries 0,1 -> words 1,3
+    paddw       mm5, [rsi+8]            ; entries 2,3 -> words 0,2 (exact only if all entries fit in 16 bits)
+    pshufw      mm5, mm5, 10001101b     ; reorder the words to entries 0,1,2,3
+    MMXEXT_QUANT16_1x4 [rdi], mm5, mm6, mm7
+    add         rsi, byte 16            ; next 4 quant_mf entries (4 bytes each)
+    add         rdi, byte 8             ; next 4 coefficients (2 bytes each)
 %endrep
 
     ret
 
+
+
+%macro MMX_QUANT32_DC_START 0
+;   mov         rdi, rdi        ; &dct[0][0]
+    movd        mm5, rsi        ; i_qmf
+    movd        mm6, edx        ; i_qbits
+    movd        mm7, ecx        ; f
+    punpckldq   mm5, mm5        ; i_qmf in each dword
+    punpckldq   mm7, mm7        ; f in each dword
+%endmacro
+
+%macro MMXEXT_QUANT32_1x4 5
+;;; %1      (m64)       dct[y][x]
+;;; %2,%3   (m64/mmx)   quant_mf[y][x] or quant_mf[0][0] (as 32-bit ints, two per operand)
+;;; %4      (mmx)       i_qbits in the low doubleword
+;;; %5      (mmx)       f as doublewords
+;;; trashes mm0-mm4
+    movq        mm0, %1     ; load dct coeffs
+    pxor        mm4, mm4    ; mm4 = 0
+    pcmpgtw     mm4, mm0    ; sign(mm0): all ones in each word where coeff < 0
+    pxor        mm0, mm4
+    psubw       mm0, mm4    ; abs(mm0)
+    movq        mm1, mm0
+    punpcklwd   mm0, mm0    ; duplicate the words for the upcoming
+    punpckhwd   mm1, mm1    ; 32 bit multiplication
+
+    movq        mm2, mm0    ; like in school ...
+    movq        mm3, mm1
+    pmulhuw     mm0, %2     ; ... multiply the parts ...
+    pmulhuw     mm1, %3
+    pmullw      mm2, %2
+    pmullw      mm3, %3
+    pslld       mm0, 16     ; ... shift ...
+    pslld       mm1, 16
+    paddd       mm0, mm2    ; ... and add them: abs(coeff) * qmf modulo 2^32
+    paddd       mm1, mm3
+
+    paddd       mm0, %5     ; round with f
+    paddd       mm1, %5
+    psrad       mm0, %4     ; >> i_qbits
+    psrad       mm1, %4
+
+    packssdw    mm0, mm1    ; pack to int16_t
+    pxor        mm0, mm4    ; restore sign
+    psubw       mm0, mm4
+    movq        %1, mm0     ; store
+%endmacro
+
 ALIGN 16
-;;; void x264_quant_4x4_core32_mmx( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f )
-x264_quant_4x4_core32_mmx:
-    QUANT_AC_START
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_quant_2x2_dc_core32_mmxext( int16_t dct[2][2],
+;       int const i_qmf, int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_2x2_dc_core32_mmxext:
+    MMX_QUANT32_DC_START
+    MMXEXT_QUANT32_1x4 [rdi], mm5, mm5, mm6, mm7
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_quant_4x4_dc_core32_mmxext( int16_t dct[4][4],
+;       int const i_qmf, int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_4x4_dc_core32_mmxext:
+    MMX_QUANT32_DC_START
 
 %rep 4
-    QUANT32_1x4 [rdi], [rsi], [rsi+8], mm6, mm7
-    add  rdi, 8
-    add  rsi, 16
+    MMXEXT_QUANT32_1x4 [rdi], mm5, mm5, mm6, mm7
+    add         rdi, byte 8
 %endrep
 
     ret
 
 ALIGN 16
-;;; void x264_quant_4x4_dc_core32_mmx( int16_t dct[4][4], int i_quant_mf, int i_qbits, int f )
-x264_quant_4x4_dc_core32_mmx:
-    QUANT_DC_START
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_quant_4x4_core32_mmxext( int16_t dct[4][4],
+;       int const quant_mf[4][4], int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_4x4_core32_mmxext:
+    MMX_QUANT_AC_START
 
 %rep 4
-    QUANT32_1x4 [rdi], mm5, mm5, mm6, mm7
-    add  rdi, 8
+    MMXEXT_QUANT32_1x4 [rdi], [rsi], [rsi+8], mm6, mm7
+    add         rdi, byte 8
+    add         rsi, byte 16
 %endrep
 
     ret
 
 ALIGN 16
-;;; void x264_quant_2x2_dc_core32_mmx( int16_t dct[2][2], int i_quant_mf, int i_qbits, int f )
-x264_quant_2x2_dc_core32_mmx:
-    QUANT_DC_START
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_quant_8x8_core32_mmxext( int16_t dct[8][8],
+;       int const quant_mf[8][8], int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_8x8_core32_mmxext:
+    MMX_QUANT_AC_START
 
-    QUANT32_1x4 [rdi], mm5, mm5, mm6, mm7
+%rep 16
+    MMXEXT_QUANT32_1x4 [rdi], [rsi], [rsi+8], mm6, mm7
+    add         rdi, byte 8
+    add         rsi, byte 16
+%endrep
 
     ret
 
diff --git a/common/i386/quant-a.asm b/common/i386/quant-a.asm
index 7806736f..b8813b36 100644
--- a/common/i386/quant-a.asm
+++ b/common/i386/quant-a.asm
@@ -21,6 +21,16 @@
 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
 ;*****************************************************************************
 
+;*****************************************************************************
+;*                                                                           *
+;*  Revision history:                                                        *
+;*                                                                           *
+;*  2005.07.26  quant 4x4 & 8x8 MMX functions (AI)                           *
+;*  2005.09.04  quant MMXEXT (added precision) and DC (CH)                   *
+;*  2005.09.21  faster MMX and added MMXEXT16 (CH)                           *
+;*                                                                           *
+;*****************************************************************************
+
 BITS 32
 
 %macro cglobal 1
@@ -36,184 +46,338 @@ ALIGN 16
 
 SECTION .text
 
-cglobal x264_quant_8x8_core16_mmx
-cglobal x264_quant_4x4_core16_mmx
-cglobal x264_quant_8x8_core32_mmx
-cglobal x264_quant_4x4_core32_mmx
-cglobal x264_quant_4x4_dc_core32_mmx
-cglobal x264_quant_2x2_dc_core32_mmx
-
-%macro QUANT_AC_START 0
-    mov       eax, [esp+ 4]   ; dct
-    mov       ecx, [esp+ 8]   ; quant_mf
-    movd      mm6, [esp+12]   ; i_qbits
-    movd      mm7, [esp+16]   ; f
-    punpckldq mm7, mm7
-%endmacro
-
-%macro QUANT_DC_START 0
-    mov       eax, [esp+ 4]   ; dct
-    movd      mm5, [esp+ 8]   ; i_quant_mf
-    movd      mm6, [esp+12]   ; i_qbits
-    movd      mm7, [esp+16]   ; f
-    punpckldq mm5, mm5
-    punpckldq mm7, mm7
+cglobal x264_quant_2x2_dc_core15_mmx
+cglobal x264_quant_4x4_dc_core15_mmx
+cglobal x264_quant_4x4_core15_mmx
+cglobal x264_quant_8x8_core15_mmx
+
+cglobal x264_quant_2x2_dc_core16_mmxext
+cglobal x264_quant_4x4_dc_core16_mmxext
+cglobal x264_quant_4x4_core16_mmxext
+cglobal x264_quant_8x8_core16_mmxext
+
+cglobal x264_quant_2x2_dc_core32_mmxext
+cglobal x264_quant_4x4_dc_core32_mmxext
+cglobal x264_quant_4x4_core32_mmxext
+cglobal x264_quant_8x8_core32_mmxext
+
+%macro MMX_QUANT_AC_START 0
+    mov         eax, [esp+ 4]   ; &dct[0][0]
+    mov         ecx, [esp+ 8]   ; &quant_mf[0][0]
+    movd        mm6, [esp+12]   ; i_qbits
+    movd        mm7, [esp+16]   ; f
+    punpckldq   mm7, mm7        ; f in each dword
 %endmacro
 
-%macro QUANT16_1x4 5
-;;; %1      dct[y][x]
-;;; %2,%3   quant_mf[i_mf][y][x], entries must fit in int16
-;;; %4      i_qbits
-;;; %5      f as doublewords
-;;; trashes mm0-mm5
-    movq     mm0, %1
-    movq     mm1, %2
-    movq     mm2, %3
-    packssdw mm1, mm2
-
-    movq     mm4, mm0
-    pxor     mm5, mm5
-    pcmpgtw  mm4, mm5
-
-    movq     mm2, mm0
-    pmullw   mm0, mm1
-    pmulhw   mm2, mm1
-
-    movq      mm1, mm0
-    punpcklwd mm0, mm2
-    punpckhwd mm1, mm2
-
-    movq     mm2, %5
-    movq     mm3, %5
-    psubd    mm2, mm0
-    psubd    mm3, mm1
-    paddd    mm0, %5
-    paddd    mm1, %5
-
-    psrad    mm0, %4
-    psrad    mm1, %4
-    psrad    mm2, %4
-    psrad    mm3, %4
-
-    packssdw mm0, mm1
-    packssdw mm2, mm3
-    pxor     mm5, mm5
-    psubw    mm5, mm2
-
-    pand     mm0, mm4
-    pandn    mm4, mm5
-
-    por      mm0, mm4
-    movq     %1,  mm0
+%macro MMX_QUANT15_DC_START 0
+    mov         eax, [esp+ 4]   ; &dct[0][0]
+    movd        mm5, [esp+ 8]   ; i_qmf
+    movd        mm6, [esp+12]   ; i_qbits
+    movd        mm7, [esp+16]   ; f
+    punpcklwd   mm5, mm5
+    punpcklwd   mm5, mm5        ; i_qmf in each word
+    punpckldq   mm7, mm7        ; f in each dword
 %endmacro
 
-%macro QUANT32_1x4 5
-;;; %1      dct[y][x]
-;;; %2,%3   quant_mf[i_mf][y][x]
-;;; %4      i_qbits
-;;; %5      f as doublewords
-;;; trashes mm0-mm4
-    movq        mm0, %1
+%macro MMX_QUANT15_1x4 4
+;;; %1      (m64)       dct[y][x]
+;;; %2      (m64/mmx)   quant_mf[y][x] or quant_mf[0][0] (as int16_t)
+;;; %3      (mmx)       i_qbits in the low doubleword
+;;; %4      (mmx)       f as doublewords
+;;; trashes mm0-mm2,mm4
+    movq        mm0, %1     ; load dct coeffs
     pxor        mm4, mm4
-    pcmpgtw     mm4, mm0        ; mm4 = sign(mm0)
+    pcmpgtw     mm4, mm0    ; sign(coeff)
     pxor        mm0, mm4
-    psubw       mm0, mm4        ; mm0 = abs(mm0)
+    psubw       mm0, mm4    ; abs(coeff)
+
+    movq        mm2, mm0
+    pmullw      mm0, %2
+    pmulhw      mm2, %2
+
     movq        mm1, mm0
-    punpcklwd   mm0, mm0        ; duplicate the words for the upcomming
-    punpckhwd   mm1, mm1        ; 32 bit multiplication
+    punpcklwd   mm0, mm2
+    punpckhwd   mm1, mm2
+
+    paddd       mm0, %4     ; round with f
+    paddd       mm1, %4
+    psrad       mm0, %3
+    psrad       mm1, %3
+
+    packssdw    mm0, mm1    ; pack
+    pxor        mm0, mm4    ; restore sign
+    psubw       mm0, mm4
+    movq        %1, mm0     ; store
+%endmacro
 
-    movq        mm2, mm0        ; like in school ...
-    movq        mm3, mm1
-    pmulhuw     mm0, %2         ; ... multiply the parts ...
-    pmulhuw     mm1, %3
-    pmullw      mm2, %2
-    pmullw      mm3, %3
-    pslld       mm0, 16         ; ... shift ...
-    pslld       mm1, 16
-    paddd       mm0, mm2        ; ... and add them
-    paddd       mm1, mm3
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2],
+;       int const i_qmf, int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_2x2_dc_core15_mmx:
+    MMX_QUANT15_DC_START
+    MMX_QUANT15_1x4 [eax], mm5, mm6, mm7
+    ret
 
-    paddd       mm0, %5         ; round with f
-    paddd       mm1, %5
-    psrad       mm0, %4
-    psrad       mm1, %4
-    packssdw    mm0, mm1        ; pack & store
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_quant_4x4_dc_core15_mmx( int16_t dct[4][4],
+;       int const i_qmf, int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_4x4_dc_core15_mmx:
+    MMX_QUANT15_DC_START
+
+%rep 4
+    MMX_QUANT15_1x4 [eax], mm5, mm6, mm7
+    add         eax, byte 8
+%endrep
+
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_quant_4x4_core15_mmx( int16_t dct[4][4],
+;       int const quant_mf[4][4], int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_4x4_core15_mmx:
+    MMX_QUANT_AC_START
+
+%rep 4
+    movq        mm5, [ecx]
+    packssdw    mm5, [ecx+8]
+    MMX_QUANT15_1x4 [eax], mm5, mm6, mm7
+    add         ecx, byte 16
+    add         eax, byte 8
+%endrep
+
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_quant_8x8_core15_mmx( int16_t dct[8][8],
+;       int const quant_mf[8][8], int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_8x8_core15_mmx:
+    MMX_QUANT_AC_START
+
+%rep 16
+    movq        mm5, [ecx]
+    packssdw    mm5, [ecx+8]
+    MMX_QUANT15_1x4 [eax], mm5, mm6, mm7
+    add         ecx, byte 16
+    add         eax, byte 8
+%endrep
+
+    ret
+
+; ============================================================================
+
+%macro MMXEXT_QUANT16_DC_START 0
+    mov         eax, [esp+ 4]   ; &dct[0][0]
+    movd        mm5, [esp+ 8]   ; i_qmf
+    movd        mm6, [esp+12]   ; i_qbits
+    movd        mm7, [esp+16]   ; f
+    pshufw      mm5, mm5, 0     ; i_qmf in each word
+    punpckldq   mm7, mm7        ; f in each dword
+%endmacro
+
+%macro MMXEXT_QUANT16_1x4 4
+;;; %1      (m64)       dct[y][x]
+;;; %2      (m64/mmx)   quant_mf[y][x] or quant_mf[0][0] (as uint16_t)
+;;; %3      (mmx)       i_qbits in the low doubleword
+;;; %4      (mmx)       f as doublewords
+;;; trashes mm0-mm2,mm4
+    movq        mm0, %1     ; load dct coeffs
+    pxor        mm4, mm4    ; mm4 = 0
+    pcmpgtw     mm4, mm0    ; sign(coeff): all ones in each word where coeff < 0
     pxor        mm0, mm4
-    psubw       mm0, mm4        ; restore sign
-    movq        %1, mm0
+    psubw       mm0, mm4    ; abs(coeff) = (coeff ^ sign) - sign
+
+    movq        mm2, mm0    ; second copy of abs(coeff) for the high halves
+    pmullw      mm0, %2     ; low 16 bits of abs(coeff) * qmf
+    pmulhuw     mm2, %2     ; high 16 bits of the unsigned product
+
+    movq        mm1, mm0
+    punpcklwd   mm0, mm2    ; 32-bit products for coeffs 0 and 1
+    punpckhwd   mm1, mm2    ; 32-bit products for coeffs 2 and 3
+
+    paddd       mm0, %4     ; round with f
+    paddd       mm1, %4
+    psrad       mm0, %3     ; >> i_qbits
+    psrad       mm1, %3
+
+    packssdw    mm0, mm1    ; pack to words with signed saturation
+    pxor        mm0, mm4    ; restore sign
+    psubw       mm0, mm4
+    movq        %1, mm0     ; store
 %endmacro
 
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_quant_2x2_dc_core16_mmxext( int16_t dct[2][2],
+;       int const i_qmf, int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_2x2_dc_core16_mmxext:
+    MMXEXT_QUANT16_DC_START
+    MMXEXT_QUANT16_1x4 [eax], mm5, mm6, mm7
+    ret
 
 ALIGN 16
-;;; void x264_quant_8x8_core16_mmx( int16_t dct[8][8], int quant_mf[8][8], int i_qbits, int f )
-x264_quant_8x8_core16_mmx:
-    QUANT_AC_START
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_quant_4x4_dc_core16_mmxext( int16_t dct[4][4],
+;       int const i_qmf, int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_4x4_dc_core16_mmxext:
+    MMXEXT_QUANT16_DC_START
 
-%rep 16
-    QUANT16_1x4 [eax], [ecx], [ecx+8], mm6, mm7
-    add  eax, 8
-    add  ecx, 16
+%rep 4
+    MMXEXT_QUANT16_1x4 [eax], mm5, mm6, mm7
+    add         eax, byte 8
 %endrep
 
     ret
 
 ALIGN 16
-;;; void x264_quant_4x4_core16_mmx( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f )
-x264_quant_4x4_core16_mmx:
-    QUANT_AC_START
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_quant_4x4_core16_mmxext( int16_t dct[4][4],
+;       int const quant_mf[4][4], int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_4x4_core16_mmxext:
+    MMX_QUANT_AC_START
 
 %rep 4
-    QUANT16_1x4 [eax], [ecx], [ecx+8], mm6, mm7
-    add  eax, 8
-    add  ecx, 16
+    pshufw      mm5, [ecx], 10110001b   ; swap the words in each dword: entries 0,1 -> words 1,3
+    paddw       mm5, [ecx+8]            ; entries 2,3 -> words 0,2 (exact only if all entries fit in 16 bits)
+    pshufw      mm5, mm5, 10001101b     ; reorder the words to entries 0,1,2,3
+    MMXEXT_QUANT16_1x4 [eax], mm5, mm6, mm7
+    add         ecx, byte 16            ; next 4 quant_mf entries (4 bytes each)
+    add         eax, byte 8             ; next 4 coefficients (2 bytes each)
 %endrep
 
     ret
 
 ALIGN 16
-;;; void x264_quant_8x8_core32_mmx( int16_t dct[8][8], int quant_mf[8][8], int i_qbits, int f )
-x264_quant_8x8_core32_mmx:
-    QUANT_AC_START
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_quant_8x8_core16_mmxext( int16_t dct[8][8],
+;       int const quant_mf[8][8], int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_8x8_core16_mmxext:
+    MMX_QUANT_AC_START
 
 %rep 16
-    QUANT32_1x4 [eax], [ecx], [ecx+8], mm6, mm7
-    add  eax, 8
-    add  ecx, 16
+    pshufw      mm5, [ecx], 10110001b   ; swap the words in each dword: entries 0,1 -> words 1,3
+    paddw       mm5, [ecx+8]            ; entries 2,3 -> words 0,2 (exact only if all entries fit in 16 bits)
+    pshufw      mm5, mm5, 10001101b     ; reorder the words to entries 0,1,2,3
+    MMXEXT_QUANT16_1x4 [eax], mm5, mm6, mm7
+    add         ecx, byte 16            ; next 4 quant_mf entries (4 bytes each)
+    add         eax, byte 8             ; next 4 coefficients (2 bytes each)
 %endrep
 
     ret
 
+
+
+%macro MMX_QUANT32_DC_START 0
+    mov         eax, [esp+ 4]   ; &dct[0][0]
+    movd        mm5, [esp+ 8]   ; i_qmf
+    movd        mm6, [esp+12]   ; i_qbits
+    movd        mm7, [esp+16]   ; f
+    punpckldq   mm5, mm5        ; i_qmf in each dword
+    punpckldq   mm7, mm7        ; f in each dword
+%endmacro
+
+%macro MMXEXT_QUANT32_1x4 5
+;;; %1      (m64)       dct[y][x]
+;;; %2,%3   (m64/mmx)   quant_mf[y][x] or quant_mf[0][0] (as 32-bit ints, two per operand)
+;;; %4      (mmx)       i_qbits in the low doubleword
+;;; %5      (mmx)       f as doublewords
+;;; trashes mm0-mm4
+    movq        mm0, %1     ; load dct coeffs
+    pxor        mm4, mm4    ; mm4 = 0
+    pcmpgtw     mm4, mm0    ; sign(mm0): all ones in each word where coeff < 0
+    pxor        mm0, mm4
+    psubw       mm0, mm4    ; abs(mm0)
+    movq        mm1, mm0
+    punpcklwd   mm0, mm0    ; duplicate the words for the upcoming
+    punpckhwd   mm1, mm1    ; 32 bit multiplication
+
+    movq        mm2, mm0    ; like in school ...
+    movq        mm3, mm1
+    pmulhuw     mm0, %2     ; ... multiply the parts ...
+    pmulhuw     mm1, %3
+    pmullw      mm2, %2
+    pmullw      mm3, %3
+    pslld       mm0, 16     ; ... shift ...
+    pslld       mm1, 16
+    paddd       mm0, mm2    ; ... and add them: abs(coeff) * qmf modulo 2^32
+    paddd       mm1, mm3
+
+    paddd       mm0, %5     ; round with f
+    paddd       mm1, %5
+    psrad       mm0, %4     ; >> i_qbits
+    psrad       mm1, %4
+
+    packssdw    mm0, mm1    ; pack to int16_t
+    pxor        mm0, mm4    ; restore sign
+    psubw       mm0, mm4
+    movq        %1, mm0     ; store
+%endmacro
+
 ALIGN 16
-;;; void x264_quant_4x4_core32_mmx( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f )
-x264_quant_4x4_core32_mmx:
-    QUANT_AC_START
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_quant_2x2_dc_core32_mmxext( int16_t dct[2][2],
+;       int const i_qmf, int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_2x2_dc_core32_mmxext:
+    MMX_QUANT32_DC_START
+    MMXEXT_QUANT32_1x4 [eax], mm5, mm5, mm6, mm7
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_quant_4x4_dc_core32_mmxext( int16_t dct[4][4],
+;       int const i_qmf, int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_4x4_dc_core32_mmxext:
+    MMX_QUANT32_DC_START
 
 %rep 4
-    QUANT32_1x4 [eax], [ecx], [ecx+8], mm6, mm7
-    add  eax, 8
-    add  ecx, 16
+    MMXEXT_QUANT32_1x4 [eax], mm5, mm5, mm6, mm7
+    add         eax, byte 8
 %endrep
 
     ret
 
 ALIGN 16
-;;; void x264_quant_4x4_dc_core32_mmx( int16_t dct[4][4], int i_quant_mf, int i_qbits, int f )
-x264_quant_4x4_dc_core32_mmx:
-    QUANT_DC_START
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_quant_4x4_core32_mmxext( int16_t dct[4][4],
+;       int const quant_mf[4][4], int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_4x4_core32_mmxext:
+    MMX_QUANT_AC_START
 
 %rep 4
-    QUANT32_1x4 [eax], mm5, mm5, mm6, mm7
-    add  eax, 8
+    MMXEXT_QUANT32_1x4 [eax], [ecx], [ecx+8], mm6, mm7
+    add         eax, byte 8
+    add         ecx, byte 16
 %endrep
 
     ret
 
 ALIGN 16
-;;; void x264_quant_2x2_dc_core32_mmx( int16_t dct[2][2], int i_quant_mf, int i_qbits, int f )
-x264_quant_2x2_dc_core32_mmx:
-    QUANT_DC_START
+;-----------------------------------------------------------------------------
+;   void __cdecl x264_quant_8x8_core32_mmxext( int16_t dct[8][8],
+;       int const quant_mf[8][8], int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_8x8_core32_mmxext:
+    MMX_QUANT_AC_START
 
-    QUANT32_1x4 [eax], mm5, mm5, mm6, mm7
+%rep 16
+    MMXEXT_QUANT32_1x4 [eax], [ecx], [ecx+8], mm6, mm7
+    add         eax, byte 8
+    add         ecx, byte 16
+%endrep
 
     ret
 
diff --git a/common/i386/quant.h b/common/i386/quant.h
new file mode 100644
index 00000000..87fabbd4
--- /dev/null
+++ b/common/i386/quant.h
@@ -0,0 +1,53 @@
+/*****************************************************************************
+ * quant.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2005 x264 project
+ *
+ * Authors: Christian Heine <sennindemokrit@gmx.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef _I386_QUANT_H
+#define _I386_QUANT_H 1
+
+void x264_quant_8x8_core15_mmx( int16_t dct[8][8],
+    int quant_mf[8][8], int const i_qbits, int const f );
+void x264_quant_4x4_core15_mmx( int16_t dct[4][4],
+    int quant_mf[4][4], int const i_qbits, int const f );
+void x264_quant_4x4_dc_core15_mmx( int16_t dct[4][4],
+    int const i_qmf, int const i_qbits, int const f );
+void x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2],
+    int const i_qmf, int const i_qbits, int const f );
+
+void x264_quant_8x8_core16_mmxext( int16_t dct[8][8],
+    int quant_mf[8][8], int const i_qbits, int const f );
+void x264_quant_4x4_core16_mmxext( int16_t dct[4][4],
+    int quant_mf[4][4], int const i_qbits, int const f );
+void x264_quant_4x4_dc_core16_mmxext( int16_t dct[4][4],
+    int const i_qmf, int const i_qbits, int const f );
+void x264_quant_2x2_dc_core16_mmxext( int16_t dct[2][2],
+    int const i_qmf, int const i_qbits, int const f );
+
+void x264_quant_8x8_core32_mmxext( int16_t dct[8][8],
+    int quant_mf[8][8], int const i_qbits, int const f );
+void x264_quant_4x4_core32_mmxext( int16_t dct[4][4],
+    int quant_mf[4][4], int const i_qbits, int const f );
+void x264_quant_4x4_dc_core32_mmxext( int16_t dct[4][4],
+    int const i_qmf, int const i_qbits, int const f );
+void x264_quant_2x2_dc_core32_mmxext( int16_t dct[2][2],
+    int const i_qmf, int const i_qbits, int const f );
+
+#endif
diff --git a/common/quant.c b/common/quant.c
index 437a135d..fc32cfd2 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -22,13 +22,9 @@
 
 #include "common.h"
 
-void x264_quant_8x8_core16_mmx( int16_t dct[8][8], int quant_mf[8][8], int i_qbits, int f );
-void x264_quant_4x4_core16_mmx( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f );
-void x264_quant_8x8_core32_mmx( int16_t dct[8][8], int quant_mf[8][8], int i_qbits, int f );
-void x264_quant_4x4_core32_mmx( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f );
-void x264_quant_4x4_dc_core32_mmx( int16_t dct[4][4], int i_quant_mf, int i_qbits, int f );
-void x264_quant_2x2_dc_core32_mmx( int16_t dct[2][2], int i_quant_mf, int i_qbits, int f );
-
+#ifdef HAVE_MMXEXT
+#include "i386/quant.h"
+#endif
 
 #define QUANT_ONE( coef, mf ) \
 { \
@@ -70,7 +66,7 @@ static void quant_2x2_dc_core( int16_t dct[2][2], int i_quant_mf, int i_qbits, i
 
 void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
 {
-    const char *name[4] = { "C", "C", "C", "C" };
+    int i, maxQ8=0, maxQ4=0, maxQdc=0;
 
     pf->quant_8x8_core = quant_8x8_core;
     pf->quant_4x4_core = quant_4x4_core;
@@ -78,34 +74,64 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
     pf->quant_2x2_dc_core = quant_2x2_dc_core;
 
 #ifdef HAVE_MMXEXT
-    if( cpu&X264_CPU_MMX )
+
+    /* determine the biggest coefficient in all quant8_mf tables */
+    for( i = 0; i < 2*6*8*8; i++ )
     {
-        int i;
-
-        pf->quant_8x8_core = x264_quant_8x8_core16_mmx;
-        pf->quant_4x4_core = x264_quant_4x4_core16_mmx;
-        pf->quant_4x4_dc_core = x264_quant_4x4_dc_core32_mmx;
-        pf->quant_2x2_dc_core = x264_quant_2x2_dc_core32_mmx;
-
-        name[0] = name[1] = "16MMX";
-        name[2] = name[3] = "32MMX";
-
-        for( i = 0; i < 2*6*8*8; i++ )
-            if( (***h->quant8_mf)[i] >= 0x8000 )
-            {
-                pf->quant_8x8_core = x264_quant_8x8_core32_mmx;
-                name[0] = "32MMX";
-            }
-
-        for( i = 0; i < 4*6*4*4; i++ )
-            if( (***h->quant4_mf)[i] >= 0x8000 )
-            {
-                pf->quant_4x4_core = x264_quant_4x4_core32_mmx;
-                name[1] = "32MMX";
-            }
+        int q = h->quant8_mf[0][0][0][i];
+        if( maxQ8 < q )
+            maxQ8 = q;
+    }
+
+    /* determine the biggest coefficient in all quant4_mf tables ( maxQ4 )
+       and the biggest DC coefficient in all quant4_mf tables ( maxQdc ) */
+    for( i = 0; i < 4*6*4*4; i++ )
+    {
+        int q = h->quant4_mf[0][0][0][i];
+        if( maxQ4 < q )
+            maxQ4 = q;
+        if( maxQdc < q && i%16 == 0 )
+            maxQdc = q;
+    }
+
+    /* select quant_8x8 based on CPU and maxQ8 */
+    if( maxQ8 < (1<<15) && cpu&X264_CPU_MMX )
+        pf->quant_8x8_core = x264_quant_8x8_core15_mmx;
+    else
+    if( maxQ8 < (1<<16) && cpu&X264_CPU_MMXEXT )
+        pf->quant_8x8_core = x264_quant_8x8_core16_mmxext;
+    else
+    if( cpu&X264_CPU_MMXEXT )
+        pf->quant_8x8_core = x264_quant_8x8_core32_mmxext;
+
+    /* select quant_4x4 based on CPU and maxQ4 */
+    if( maxQ4 < (1<<15) && cpu&X264_CPU_MMX )
+        pf->quant_4x4_core = x264_quant_4x4_core15_mmx;
+    else
+    if( maxQ4 < (1<<16) && cpu&X264_CPU_MMXEXT )
+        pf->quant_4x4_core = x264_quant_4x4_core16_mmxext;
+    else
+    if( cpu&X264_CPU_MMXEXT )
+        pf->quant_4x4_core = x264_quant_4x4_core32_mmxext;
+
+    /* select quant_XxX_dc based on CPU and maxQdc */
+    if( maxQdc < (1<<16) && cpu&X264_CPU_MMXEXT )
+    {
+        pf->quant_4x4_dc_core = x264_quant_4x4_dc_core16_mmxext;
+        pf->quant_2x2_dc_core = x264_quant_2x2_dc_core16_mmxext;
+    }
+    else
+    if( maxQdc < (1<<15) && cpu&X264_CPU_MMX )
+    {
+        pf->quant_4x4_dc_core = x264_quant_4x4_dc_core15_mmx;
+        pf->quant_2x2_dc_core = x264_quant_2x2_dc_core15_mmx;
+    }
+    else
+    if( cpu&X264_CPU_MMXEXT )
+    {
+        pf->quant_4x4_dc_core = x264_quant_4x4_dc_core32_mmxext;
+        pf->quant_2x2_dc_core = x264_quant_2x2_dc_core32_mmxext;
     }
-#endif
 
-    x264_log( h, X264_LOG_DEBUG, "using quant functions 8x8=%s 4x4=%s dc4x4=%s dc2x2=%s\n",
-              name[0], name[1], name[2], name[3] );
+#endif  /* HAVE_MMXEXT */
 }