From: Loren Merritt <pengvado@videolan.org>
Date: Wed, 4 Apr 2007 18:45:25 +0000 (+0000)
Subject: 2x faster quant. 2% overall.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=10265a0c2a0b29e6252ad3be6fad1569e7a04339;p=libx264

2x faster quant. 2% overall.
Side effects:
Not bit-identical to the previous algorithm.
While the new algorithm covers a wider range of CQMs than the previous one did,
I couldn't find a good way to fall back to a general version for the extreme
CQMs, so it now refuses to encode extreme CQMs instead of just being slower.
Lays a framework for custom deadzone matrices, though I didn't add an API.


git-svn-id: svn://svn.videolan.org/x264/trunk@642 df754926-b1dd-0310-bc7b-ec298dee348c
---

diff --git a/common/amd64/quant-a.asm b/common/amd64/quant-a.asm
index 32ff0cda..6fdc3198 100644
--- a/common/amd64/quant-a.asm
+++ b/common/amd64/quant-a.asm
@@ -3,8 +3,7 @@
 ;*****************************************************************************
 ;* Copyright (C) 2005 x264 project
 ;*
-;* Authors: Alex Izvorski <aizvorksi@gmail.com>
-;*          Christian Heine <sennindemokrit@gmx.net>
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
@@ -21,16 +20,6 @@
 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
 ;*****************************************************************************
 
-;*****************************************************************************
-;*                                                                           *
-;*  Revision history:                                                        *
-;*                                                                           *
-;*  2005.07.26  quant 4x4 & 8x8 MMX functions (AI)                           *
-;*  2005.09.04  quant MMXEXT (added precision) and DC (CH)                   *
-;*  2005.09.21  faster MMX and added MMXEXT16 (CH)                           *
-;*                                                                           *
-;*****************************************************************************
-
 BITS 64
 
 %include "amd64inc.asm"
@@ -40,394 +29,106 @@ pd_1:  times 2 dd 1
 
 SECTION .text
 
-%macro MMX_QUANT_AC_START 0
-;   mov         rdi, rdi        ; &dct[0][0]
-;   mov         rsi, rsi        ; &quant_mf[0][0]
-    movd        mm6, parm3d     ; i_qbits
-    movd        mm7, parm4d     ; f
-    punpckldq   mm7, mm7        ; f in each dword
+%macro MMX_QUANT_DC_START 0
+    movd       mm6, parm2d     ; mf
+    movd       mm7, parm3d     ; bias
+    pshufw     mm6, mm6, 0     ; replicate low word of mf into all 4 words
+    pshufw     mm7, mm7, 0     ; replicate low word of bias into all 4 words
 %endmacro
 
-%macro MMX_QUANT15_DC_START 0
-;   mov         rdi, rdi        ; &dct[0][0]
-    movd        mm5, parm2d     ; i_qmf
-    movd        mm6, parm3d     ; i_qbits
-    movd        mm7, parm4d     ; f
-    punpcklwd   mm5, mm5
-    punpcklwd   mm5, mm5        ; i_qmf in each word
-    punpckldq   mm7, mm7        ; f in each dword
+%macro SSE2_QUANT_DC_START 0
+    movd       xmm6, parm2d     ; mf
+    movd       xmm7, parm3d     ; bias
+    pshuflw    xmm6, xmm6, 0    ; replicate low word of mf across the low qword
+    pshuflw    xmm7, xmm7, 0    ; replicate low word of bias across the low qword
+    punpcklqdq xmm6, xmm6       ; copy low qword to high: mf in all 8 words
+    punpcklqdq xmm7, xmm7       ; copy low qword to high: bias in all 8 words
 %endmacro
 
-%macro SSE2_QUANT_AC_START 0
-    movd       xmm6, parm3d     ; i_qbits
-    movd       xmm7, parm4d     ; f
-    pshufd     xmm7, xmm7, 0    ; f in each dword
+%macro QUANT_ONE 5
+;;; %1,%2   mov suffix (q / dqa) and register prefix (m / xm), passed by the wrapper macros
+;;; %3      (mem)       dct coefficients, quantized in place
+;;; %4,%5   (mem/reg)   mf and bias (as uint16_t); trashes %2m0 and %2m1
+
+    mov%1      %2m0, %3     ; load dct coeffs
+    pxor       %2m1, %2m1   ; zero the sign-mask register
+    pcmpgtw    %2m1, %2m0   ; sign(coeff): all-ones in each word where coeff < 0
+    pxor       %2m0, %2m1
+    psubw      %2m0, %2m1   ; abs(coeff)
+    paddusw    %2m0, %5     ; round: add bias with unsigned saturation
+    pmulhuw    %2m0, %4     ; divide: keep high 16 bits of the unsigned product with mf
+    pxor       %2m0, %2m1   ; restore sign (xor with mask, then subtract mask)
+    psubw      %2m0, %2m1
+    mov%1        %3, %2m0   ; store
 %endmacro
-
-%macro SSE2_QUANT15_DC_START 0
-    movd       xmm5, parm2d     ; i_qmf
-    movd       xmm6, parm3d     ; i_qbits
-    movd       xmm7, parm4d     ; f
-    pshuflw    xmm5, xmm5, 0
-    punpcklqdq xmm5, xmm5       ; i_qmf in each word
-    pshufd     xmm7, xmm7, 0    ; f in each dword
+%macro MMX_QUANT_1x4 3
+    QUANT_ONE q, m, %1, %2, %3  ; movq + mm0/mm1; %1=dct, %2=mf, %3=bias
 %endmacro
-
-%macro MMX_QUANT15_1x4 4
-;;; %1      (m64)       dct[y][x]
-;;; %2      (m64/mmx)   quant_mf[y][x] or quant_mf[0][0] (as int16_t)
-;;; %3      (mmx)       i_qbits in the low doubleword
-;;; %4      (mmx)       f as doublewords
-;;; trashes mm0-mm2,mm4
-    movq        mm0, %1     ; load dct coeffs
-    pxor        mm4, mm4
-    pcmpgtw     mm4, mm0    ; sign(coeff)
-    pxor        mm0, mm4
-    psubw       mm0, mm4    ; abs(coeff)
-
-    movq        mm2, mm0
-    pmullw      mm0, %2
-    pmulhw      mm2, %2
-
-    movq        mm1, mm0
-    punpcklwd   mm0, mm2
-    punpckhwd   mm1, mm2
-
-    paddd       mm0, %4     ; round with f
-    paddd       mm1, %4
-    psrad       mm0, %3
-    psrad       mm1, %3
-
-    packssdw    mm0, mm1    ; pack
-    pxor        mm0, mm4    ; restore sign
-    psubw       mm0, mm4
-    movq         %1, mm0    ; store
+%macro SSE2_QUANT_1x8 3
+    QUANT_ONE dqa, xm, %1, %2, %3  ; movdqa + xmm0/xmm1; %1=dct, %2=mf, %3=bias
 %endmacro
 
-%macro SSSE3_QUANT15_1x8 4
+%macro SSSE3_QUANT_1x8 3
     movdqa     xmm0, %1     ; load dct coeffs
-    movdqa     xmm4, xmm0   ; save sign
+    movdqa     xmm1, xmm0   ; save sign
     pabsw      xmm0, xmm0
-
-    movdqa     xmm2, xmm0
-    pmullw     xmm0, %2
-    pmulhw     xmm2, %2
-
-    movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm2
-    punpckhwd  xmm1, xmm2
-
-    paddd      xmm0, %4     ; round with f
-    paddd      xmm1, %4
-    psrad      xmm0, %3
-    psrad      xmm1, %3
-
-    packssdw   xmm0, xmm1   ; pack
-    psignw     xmm0, xmm4   ; restore sign
+    paddusw    xmm0, %3     ; round
+    pmulhuw    xmm0, %2     ; divide
+    psignw     xmm0, xmm1   ; restore sign
     movdqa       %1, xmm0   ; store
 %endmacro
 
 ;-----------------------------------------------------------------------------
-;   void x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2],
-;       int const i_qmf, int const i_qbits, int const f );
+; void x264_quant_2x2_dc_mmxext( int16_t dct[4], int mf, int bias )
 ;-----------------------------------------------------------------------------
-cglobal x264_quant_2x2_dc_core15_mmx
-    MMX_QUANT15_DC_START
-    MMX_QUANT15_1x4 [parm1q], mm5, mm6, mm7
-    ret
-
-;-----------------------------------------------------------------------------
-;   void x264_quant_4x4_dc_core15_mmx( int16_t dct[4][4],
-;       int const i_qmf, int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_dc_core15_mmx
-    MMX_QUANT15_DC_START
-
-%rep 4
-    MMX_QUANT15_1x4 [parm1q], mm5, mm6, mm7
-    add         parm1q, byte 8
-%endrep
-
-    ret
-
-;-----------------------------------------------------------------------------
-;   void x264_quant_4x4_core15_mmx( int16_t dct[4][4],
-;       int const quant_mf[4][4], int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_core15_mmx
-    MMX_QUANT_AC_START
-
-%rep 4
-    movq        mm5, [parm2q]
-    packssdw    mm5, [parm2q+8]
-    MMX_QUANT15_1x4 [parm1q], mm5, mm6, mm7
-    add         parm2q, byte 16
-    add         parm1q, byte 8
-%endrep
-
+cglobal x264_quant_2x2_dc_mmxext
+    MMX_QUANT_DC_START
+    MMX_QUANT_1x4 [parm1q], mm6, mm7
     ret
 
+%macro QUANT_SSE 1
 ;-----------------------------------------------------------------------------
-;   void x264_quant_8x8_core15_mmx( int16_t dct[8][8],
-;       int const quant_mf[8][8], int const i_qbits, int const f );
+; void x264_quant_4x4_dc_sse2( int16_t dct[16], int mf, int bias )
 ;-----------------------------------------------------------------------------
-cglobal x264_quant_8x8_core15_mmx
-    MMX_QUANT_AC_START
-
-%rep 16
-    movq        mm5, [parm2q]
-    packssdw    mm5, [parm2q+8]
-    MMX_QUANT15_1x4 [parm1q], mm5, mm6, mm7
-    add         parm2q, byte 16
-    add         parm1q, byte 8
+cglobal x264_quant_4x4_dc_%1
+    SSE2_QUANT_DC_START
+%assign x 0
+%rep 2
+    QUANT_1x8 [parm1q+x], xmm6, xmm7
+%assign x (x+16)
 %endrep
-
-    ret
-
-%ifdef HAVE_SSE3
-;-----------------------------------------------------------------------------
-;   void x264_quant_4x4_dc_core15_ssse3( int16_t dct[4][4],
-;       int const i_qmf, int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_dc_core15_ssse3
-    SSE2_QUANT15_DC_START
-    SSSE3_QUANT15_1x8 [parm1q], xmm5, xmm6, xmm7
-    SSSE3_QUANT15_1x8 [parm1q+16], xmm5, xmm6, xmm7
     ret
 
 ;-----------------------------------------------------------------------------
-;   void x264_quant_4x4_core15_ssse3( int16_t dct[4][4],
-;       int const quant_mf[4][4], int const i_qbits, int const f );
+; void x264_quant_4x4_sse2( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
 ;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_core15_ssse3
-    SSE2_QUANT_AC_START
+cglobal x264_quant_4x4_%1
 %assign x 0
 %rep 2
-    movdqa      xmm5, [parm2q+32*x]
-    packssdw    xmm5, [parm2q+32*x+16]
-    SSSE3_QUANT15_1x8 [parm1q+16*x], xmm5, xmm6, xmm7
-    %assign x x+1
+    QUANT_1x8 [parm1q+x], [parm2q+x], [parm3q+x]
+%assign x (x+16)
 %endrep
     ret
 
 ;-----------------------------------------------------------------------------
-;   void x264_quant_8x8_core15_ssse3( int16_t dct[8][8],
-;       int const quant_mf[8][8], int const i_qbits, int const f );
+; void x264_quant_8x8_sse2( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
 ;-----------------------------------------------------------------------------
-cglobal x264_quant_8x8_core15_ssse3
-    SSE2_QUANT_AC_START
+cglobal x264_quant_8x8_%1
 %assign x 0
 %rep 8
-    movdqa      xmm5, [parm2q+32*x]
-    packssdw    xmm5, [parm2q+32*x+16]
-    SSSE3_QUANT15_1x8 [parm1q+16*x], xmm5, xmm6, xmm7
-    %assign x x+1
-%endrep
-    ret
-%endif ; HAVE_SSE3
-
-
-; ============================================================================
-
-%macro MMXEXT_QUANT16_DC_START 0
-;   mov         rdi, rdi        ; &dct[0][0]
-    movd        mm5, parm2d     ; i_qmf
-    movd        mm6, parm3d     ; i_qbits
-    movd        mm7, parm4d     ; f
-    pshufw      mm5, mm5, 0     ; i_qmf in each word
-    punpckldq   mm7, mm7        ; f in each dword
-%endmacro
-
-%macro MMXEXT_QUANT16_1x4 4
-;;; %1      (m64)       dct[y][x]
-;;; %2      (m64/mmx)   quant_mf[y][x] or quant_mf[0][0] (as uint16_t)
-;;; %3      (mmx)       i_qbits in the low doubleword
-;;; %4      (mmx)       f as doublewords
-;;; trashes mm0-mm2,mm4
-    movq        mm0, %1     ; load dct coeffs
-    pxor        mm4, mm4
-    pcmpgtw     mm4, mm0    ; sign(coeff)
-    pxor        mm0, mm4
-    psubw       mm0, mm4    ; abs(coeff)
-
-    movq        mm2, mm0
-    pmullw      mm0, %2
-    pmulhuw     mm2, %2
-
-    movq        mm1, mm0
-    punpcklwd   mm0, mm2
-    punpckhwd   mm1, mm2
-
-    paddd       mm0, %4     ; round with f
-    paddd       mm1, %4
-    psrad       mm0, %3
-    psrad       mm1, %3
-
-    packssdw    mm0, mm1    ; pack
-    pxor        mm0, mm4    ; restore sign
-    psubw       mm0, mm4
-    movq        %1, mm0     ; store
-%endmacro
-
-;-----------------------------------------------------------------------------
-;   void x264_quant_2x2_dc_core16_mmxext( int16_t dct[2][2],
-;       int const i_qmf, int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_2x2_dc_core16_mmxext
-    MMXEXT_QUANT16_DC_START
-    MMXEXT_QUANT16_1x4 [parm1q], mm5, mm6, mm7
-    ret
-
-;-----------------------------------------------------------------------------
-;   void x264_quant_4x4_dc_core16_mmxext( int16_t dct[4][4],
-;       int const i_qmf, int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_dc_core16_mmxext
-    MMXEXT_QUANT16_DC_START
-
-%rep 4
-    MMXEXT_QUANT16_1x4 [parm1q], mm5, mm6, mm7
-    add         parm1q, byte 8
-%endrep
-
-    ret
-
-;-----------------------------------------------------------------------------
-;   void x264_quant_4x4_core16_mmxext( int16_t dct[4][4],
-;       int const quant_mf[4][4], int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_core16_mmxext
-    MMX_QUANT_AC_START
-
-%rep 4
-    pshufw      mm5, [parm2q], 10110001b
-    paddw       mm5, [parm2q+8]
-    pshufw      mm5, mm5, 10001101b
-    MMXEXT_QUANT16_1x4 [parm1q], mm5, mm6, mm7
-    add         parm2q, byte 16
-    add         parm1q, byte 8
+    QUANT_1x8 [parm1q+x], [parm2q+x], [parm3q+x]
+%assign x (x+16)
 %endrep
-
     ret
-
-;-----------------------------------------------------------------------------
-;   void x264_quant_8x8_core16_mmxext( int16_t dct[8][8],
-;       int const quant_mf[8][8], int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_8x8_core16_mmxext
-    MMX_QUANT_AC_START
-
-%rep 16
-    pshufw      mm5, [parm2q], 10110001b
-    paddw       mm5, [parm2q+8]
-    pshufw      mm5, mm5, 10001101b
-    MMXEXT_QUANT16_1x4 [parm1q], mm5, mm6, mm7
-    add         parm2q, byte 16
-    add         parm1q, byte 8
-%endrep
-
-    ret
-
-
-
-%macro MMX_QUANT32_DC_START 0
-;   mov         rdi, rdi        ; &dct[0][0]
-    movd        mm5, parm2d     ; i_qmf
-    movd        mm6, parm3d     ; i_qbits
-    movd        mm7, parm4d     ; f
-    punpckldq   mm5, mm5        ; i_qmf in each dword
-    punpckldq   mm7, mm7        ; f in each dword
 %endmacro
 
-%macro MMXEXT_QUANT32_1x4 5
-;;; %1      (m64)       dct[y][x]
-;;; %2,%3   (m64/mmx)   quant_mf[y][x] or quant_mf[0][0] (as int16_t)
-;;; %4      (mmx)       i_qbits in the low quadword
-;;; %5      (mmx)       f as doublewords
-;;; trashes mm0-mm4
-    movq        mm0, %1     ; load dct coeffs
-    pxor        mm4, mm4
-    pcmpgtw     mm4, mm0    ; sign(mm0)
-    pxor        mm0, mm4
-    psubw       mm0, mm4    ; abs(mm0)
-    movq        mm1, mm0
-    punpcklwd   mm0, mm0    ; duplicate the words for the upcomming
-    punpckhwd   mm1, mm1    ; 32 bit multiplication
-
-    movq        mm2, mm0    ; like in school ...
-    movq        mm3, mm1
-    pmulhuw     mm0, %2     ; ... multiply the parts ...
-    pmulhuw     mm1, %3
-    pmullw      mm2, %2
-    pmullw      mm3, %3
-    pslld       mm0, 16     ; ... shift ...
-    pslld       mm1, 16
-    paddd       mm0, mm2    ; ... and add them
-    paddd       mm1, mm3
-
-    paddd       mm0, %5     ; round with f
-    paddd       mm1, %5
-    psrad       mm0, %4
-    psrad       mm1, %4
-
-    packssdw    mm0, mm1    ; pack to int16_t
-    pxor        mm0, mm4    ; restore sign
-    psubw       mm0, mm4
-    movq        %1, mm0     ; store
-%endmacro
-
-;-----------------------------------------------------------------------------
-;   void x264_quant_2x2_dc_core32_mmxext( int16_t dct[2][2],
-;       int const i_qmf, int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_2x2_dc_core32_mmxext
-    MMX_QUANT32_DC_START
-    MMXEXT_QUANT32_1x4 [parm1q], mm5, mm5, mm6, mm7
-    ret
-
-;-----------------------------------------------------------------------------
-;   void x264_quant_4x4_dc_core32_mmxext( int16_t dct[4][4],
-;       int const i_qmf, int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_dc_core32_mmxext
-    MMX_QUANT32_DC_START
-
-%rep 4
-    MMXEXT_QUANT32_1x4 [parm1q], mm5, mm5, mm6, mm7
-    add         parm1q, byte 8
-%endrep
-
-    ret
-
-;-----------------------------------------------------------------------------
-;   void x264_quant_4x4_core32_mmxext( int16_t dct[4][4],
-;       int const quant_mf[4][4], int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_core32_mmxext
-    MMX_QUANT_AC_START
-
-%rep 4
-    MMXEXT_QUANT32_1x4 [parm1q], [parm2q], [parm2q+8], mm6, mm7
-    add         parm1q, byte 8
-    add         parm2q, byte 16
-%endrep
-
-    ret
-
-;-----------------------------------------------------------------------------
-;   void x264_quant_8x8_core32_mmxext( int16_t dct[8][8],
-;       int const quant_mf[8][8], int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_8x8_core32_mmxext
-    MMX_QUANT_AC_START
-
-%rep 16
-    MMXEXT_QUANT32_1x4 [parm1q], [parm2q], [parm2q+8], mm6, mm7
-    add         parm1q, byte 8
-    add         parm2q, byte 16
-%endrep
+%define QUANT_1x8 SSE2_QUANT_1x8
+QUANT_SSE sse2
+%ifdef HAVE_SSE3
+%define QUANT_1x8 SSSE3_QUANT_1x8
+QUANT_SSE ssse3
+%endif
 
-    ret
 
 
 ;=============================================================================
diff --git a/common/common.h b/common/common.h
index dd055f5e..74f8c108 100644
--- a/common/common.h
+++ b/common/common.h
@@ -341,10 +341,12 @@ struct x264_t
 
     int             (*dequant4_mf[4])[4][4]; /* [4][6][4][4] */
     int             (*dequant8_mf[2])[8][8]; /* [2][6][8][8] */
-    int             (*quant4_mf[4])[4][4];   /* [4][6][4][4] */
-    int             (*quant8_mf[2])[8][8];   /* [2][6][8][8] */
     int             (*unquant4_mf[4])[16];   /* [4][52][16] */
     int             (*unquant8_mf[2])[64];   /* [2][52][64] */
+    uint16_t        (*quant4_mf[4])[16];     /* [4][52][16] */
+    uint16_t        (*quant8_mf[2])[64];     /* [2][52][64] */
+    uint16_t        (*quant4_bias[4])[16];   /* [4][52][16] */
+    uint16_t        (*quant8_bias[2])[64];   /* [2][52][64] */
 
     uint32_t        nr_residual_sum[2][64];
     uint32_t        nr_offset[2][64];
@@ -436,9 +438,6 @@ struct x264_t
 
         int     b_interlaced;
 
-        /* Inverted luma quantization deadzone */
-        int     i_luma_deadzone[2]; // {inter, intra}
-
         /* Allowed qpel MV range to stay within the picture + emulated edge pixels */
         int     mv_min[2];
         int     mv_max[2];
diff --git a/common/i386/quant-a.asm b/common/i386/quant-a.asm
index b8860557..13e794de 100644
--- a/common/i386/quant-a.asm
+++ b/common/i386/quant-a.asm
@@ -3,8 +3,7 @@
 ;*****************************************************************************
 ;* Copyright (C) 2005 x264 project
 ;*
-;* Authors: Alex Izvorski <aizvorksi@gmail.com>
-;*          Christian Heine <sennindemokrit@gmx.net>
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
@@ -21,16 +20,6 @@
 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
 ;*****************************************************************************
 
-;*****************************************************************************
-;*                                                                           *
-;*  Revision history:                                                        *
-;*                                                                           *
-;*  2005.07.26  quant 4x4 & 8x8 MMX functions (AI)                           *
-;*  2005.09.04  quant MMXEXT (added precision) and DC (CH)                   *
-;*  2005.09.21  faster MMX and added MMXEXT16 (CH)                           *
-;*                                                                           *
-;*****************************************************************************
-
 BITS 32
 
 %include "i386inc.asm"
@@ -40,313 +29,151 @@ pd_1:  times 2 dd 1
 
 SECTION .text
 
-%macro MMX_QUANT_AC_START 0
-    mov         eax, [esp+ 4]   ; &dct[0][0]
-    mov         ecx, [esp+ 8]   ; &quant_mf[0][0]
-    movd        mm6, [esp+12]   ; i_qbits
-    movd        mm7, [esp+16]   ; f
-    punpckldq   mm7, mm7        ; f in each dword
+%macro QUANT_AC_START 0
+    mov         eax, [esp+ 4]   ; dct
+    mov         ecx, [esp+ 8]   ; mf
+    mov         edx, [esp+12]   ; bias
 %endmacro
 
-%macro MMX_QUANT15_DC_START 0
-    mov         eax, [esp+ 4]   ; &dct[0][0]
-    movd        mm5, [esp+ 8]   ; i_qmf
-    movd        mm6, [esp+12]   ; i_qbits
-    movd        mm7, [esp+16]   ; f
-    punpcklwd   mm5, mm5
-    punpcklwd   mm5, mm5        ; i_qmf in each word
-    punpckldq   mm7, mm7        ; f in each dword
+%macro MMX_QUANT_DC_START 0
+    mov         eax, [esp+ 4]   ; dct
+    movd        mm6, [esp+ 8]   ; mf
+    movd        mm7, [esp+12]   ; bias
+    pshufw      mm6, mm6, 0     ; replicate low word of mf into all 4 words
+    pshufw      mm7, mm7, 0     ; replicate low word of bias into all 4 words
 %endmacro
 
-%macro MMX_QUANT15_1x4 4
-;;; %1      (m64)       dct[y][x]
-;;; %2      (m64/mmx)   quant_mf[y][x] or quant_mf[0][0] (as int16_t)
-;;; %3      (mmx)       i_qbits in the low doubleword
-;;; %4      (mmx)       f as doublewords
-;;; trashes mm0-mm2,mm4
-    movq        mm0, %1     ; load dct coeffs
-    pxor        mm4, mm4
-    pcmpgtw     mm4, mm0    ; sign(coeff)
-    pxor        mm0, mm4
-    psubw       mm0, mm4    ; abs(coeff)
-
-    movq        mm2, mm0
-    pmullw      mm0, %2
-    pmulhw      mm2, %2
-
-    movq        mm1, mm0
-    punpcklwd   mm0, mm2
-    punpckhwd   mm1, mm2
-
-    paddd       mm0, %4     ; round with f
-    paddd       mm1, %4
-    psrad       mm0, %3
-    psrad       mm1, %3
-
-    packssdw    mm0, mm1    ; pack
-    pxor        mm0, mm4    ; restore sign
-    psubw       mm0, mm4
-    movq        %1, mm0     ; store
+%macro SSE2_QUANT_DC_START 0
+    mov         eax, [esp+ 4]   ; dct
+    movd       xmm6, [esp+ 8]   ; mf
+    movd       xmm7, [esp+12]   ; bias
+    pshuflw    xmm6, xmm6, 0    ; replicate low word of mf across the low qword
+    pshuflw    xmm7, xmm7, 0    ; replicate low word of bias across the low qword
+    punpcklqdq xmm6, xmm6       ; copy low qword to high: mf in all 8 words
+    punpcklqdq xmm7, xmm7       ; copy low qword to high: bias in all 8 words
 %endmacro
 
-;-----------------------------------------------------------------------------
-;   void __cdecl x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2],
-;       int const i_qmf, int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_2x2_dc_core15_mmx
-    MMX_QUANT15_DC_START
-    MMX_QUANT15_1x4 [eax], mm5, mm6, mm7
-    ret
-
-;-----------------------------------------------------------------------------
-;   void __cdecl x264_quant_4x4_dc_core15_mmx( int16_t dct[4][4],
-;       int const i_qmf, int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_dc_core15_mmx
-    MMX_QUANT15_DC_START
-
-%rep 4
-    MMX_QUANT15_1x4 [eax], mm5, mm6, mm7
-    add         eax, byte 8
-%endrep
-
-    ret
-
-;-----------------------------------------------------------------------------
-;   void __cdecl x264_quant_4x4_core15_mmx( int16_t dct[4][4],
-;       int const quant_mf[4][4], int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_core15_mmx
-    MMX_QUANT_AC_START
-
-%rep 4
-    movq        mm5, [ecx]
-    packssdw    mm5, [ecx+8]
-    MMX_QUANT15_1x4 [eax], mm5, mm6, mm7
-    add         ecx, byte 16
-    add         eax, byte 8
-%endrep
-
-    ret
-
-;-----------------------------------------------------------------------------
-;   void __cdecl x264_quant_8x8_core15_mmx( int16_t dct[8][8],
-;       int const quant_mf[8][8], int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_8x8_core15_mmx
-    MMX_QUANT_AC_START
-
-%rep 16
-    movq        mm5, [ecx]
-    packssdw    mm5, [ecx+8]
-    MMX_QUANT15_1x4 [eax], mm5, mm6, mm7
-    add         ecx, byte 16
-    add         eax, byte 8
-%endrep
-
-    ret
-
-; ============================================================================
-
-%macro MMXEXT_QUANT16_DC_START 0
-    mov         eax, [esp+ 4]   ; &dct[0][0]
-    movd        mm5, [esp+ 8]   ; i_qmf
-    movd        mm6, [esp+12]   ; i_qbits
-    movd        mm7, [esp+16]   ; f
-    pshufw      mm5, mm5, 0     ; i_qmf in each word
-    punpckldq   mm7, mm7        ; f in each dword
+%macro QUANT_ONE 5
+;;; %1,%2   mov suffix (q / dqa) and register prefix (m / xm), passed by the wrapper macros
+;;; %3      (mem)       dct coefficients, quantized in place
+;;; %4,%5   (mem/reg)   mf and bias (as uint16_t); trashes %2m0 and %2m1
+
+    mov%1      %2m0, %3     ; load dct coeffs
+    pxor       %2m1, %2m1   ; zero the sign-mask register
+    pcmpgtw    %2m1, %2m0   ; sign(coeff): all-ones in each word where coeff < 0
+    pxor       %2m0, %2m1
+    psubw      %2m0, %2m1   ; abs(coeff)
+    paddusw    %2m0, %5     ; round: add bias with unsigned saturation
+    pmulhuw    %2m0, %4     ; divide: keep high 16 bits of the unsigned product with mf
+    pxor       %2m0, %2m1   ; restore sign (xor with mask, then subtract mask)
+    psubw      %2m0, %2m1
+    mov%1        %3, %2m0   ; store
+%endmacro
+%macro MMX_QUANT_1x4 3
+    QUANT_ONE q, m, %1, %2, %3  ; movq + mm0/mm1; %1=dct, %2=mf, %3=bias
+%endmacro
+%macro SSE2_QUANT_1x8 3
+    QUANT_ONE dqa, xm, %1, %2, %3  ; movdqa + xmm0/xmm1; %1=dct, %2=mf, %3=bias
 %endmacro
 
-%macro MMXEXT_QUANT16_1x4 4
-;;; %1      (m64)       dct[y][x]
-;;; %2      (m64/mmx)   quant_mf[y][x] or quant_mf[0][0] (as uint16_t)
-;;; %3      (mmx)       i_qbits in the low doubleword
-;;; %4      (mmx)       f as doublewords
-;;; trashes mm0-mm2,mm4
-    movq        mm0, %1     ; load dct coeffs
-    pxor        mm4, mm4
-    pcmpgtw     mm4, mm0    ; sign(coeff)
-    pxor        mm0, mm4
-    psubw       mm0, mm4    ; abs(coeff)
-
-    movq        mm2, mm0
-    pmullw      mm0, %2
-    pmulhuw     mm2, %2
-
-    movq        mm1, mm0
-    punpcklwd   mm0, mm2
-    punpckhwd   mm1, mm2
-
-    paddd       mm0, %4     ; round with f
-    paddd       mm1, %4
-    psrad       mm0, %3
-    psrad       mm1, %3
-
-    packssdw    mm0, mm1    ; pack
-    pxor        mm0, mm4    ; restore sign
-    psubw       mm0, mm4
-    movq        %1, mm0     ; store
+%macro SSSE3_QUANT_1x8 3
+    movdqa     xmm0, %1     ; load dct coeffs (%1)
+    movdqa     xmm1, xmm0   ; save signed copy for psignw below
+    pabsw      xmm0, xmm0   ; abs(coeff)
+    paddusw    xmm0, %3     ; round: add bias (%3) with unsigned saturation
+    pmulhuw    xmm0, %2     ; divide: keep high 16 bits of the unsigned product with mf (%2)
+    psignw     xmm0, xmm1   ; restore sign
+    movdqa       %1, xmm0   ; store
 %endmacro
 
 ;-----------------------------------------------------------------------------
-;   void __cdecl x264_quant_2x2_dc_core16_mmxext( int16_t dct[2][2],
-;       int const i_qmf, int const i_qbits, int const f );
+; void x264_quant_2x2_dc_mmxext( int16_t dct[4], int mf, int bias )
 ;-----------------------------------------------------------------------------
-cglobal x264_quant_2x2_dc_core16_mmxext
-    MMXEXT_QUANT16_DC_START
-    MMXEXT_QUANT16_1x4 [eax], mm5, mm6, mm7
+cglobal x264_quant_2x2_dc_mmxext
+    MMX_QUANT_DC_START
+    MMX_QUANT_1x4 [eax], mm6, mm7
     ret
 
 ;-----------------------------------------------------------------------------
-;   void __cdecl x264_quant_4x4_dc_core16_mmxext( int16_t dct[4][4],
-;       int const i_qmf, int const i_qbits, int const f );
+; void x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias )
 ;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_dc_core16_mmxext
-    MMXEXT_QUANT16_DC_START
-
+cglobal x264_quant_4x4_dc_mmxext
+    MMX_QUANT_DC_START
+%assign x 0
 %rep 4
-    MMXEXT_QUANT16_1x4 [eax], mm5, mm6, mm7
-    add         eax, byte 8
+    MMX_QUANT_1x4 [eax+x], mm6, mm7
+%assign x (x+8)
 %endrep
-
     ret
 
 ;-----------------------------------------------------------------------------
-;   void __cdecl x264_quant_4x4_core16_mmxext( int16_t dct[4][4],
-;       int const quant_mf[4][4], int const i_qbits, int const f );
+; void x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
 ;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_core16_mmxext
-    MMX_QUANT_AC_START
-
+cglobal x264_quant_4x4_mmx
+    QUANT_AC_START
+%assign x 0
 %rep 4
-    pshufw      mm5, [ecx], 10110001b
-    paddw       mm5, [ecx+8]
-    pshufw      mm5, mm5, 10001101b
-    MMXEXT_QUANT16_1x4 [eax], mm5, mm6, mm7
-    add         ecx, byte 16
-    add         eax, byte 8
+    MMX_QUANT_1x4 [eax+x], [ecx+x], [edx+x]
+%assign x (x+8)
 %endrep
-
     ret
 
 ;-----------------------------------------------------------------------------
-;   void __cdecl x264_quant_8x8_core16_mmxext( int16_t dct[8][8],
-;       int const quant_mf[8][8], int const i_qbits, int const f );
+; void x264_quant_8x8_mmx( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
 ;-----------------------------------------------------------------------------
-cglobal x264_quant_8x8_core16_mmxext
-    MMX_QUANT_AC_START
-
+cglobal x264_quant_8x8_mmx
+    QUANT_AC_START
+%assign x 0
 %rep 16
-    pshufw      mm5, [ecx], 10110001b
-    paddw       mm5, [ecx+8]
-    pshufw      mm5, mm5, 10001101b
-    MMXEXT_QUANT16_1x4 [eax], mm5, mm6, mm7
-    add         ecx, byte 16
-    add         eax, byte 8
+    MMX_QUANT_1x4 [eax+x], [ecx+x], [edx+x]
+%assign x (x+8)
 %endrep
-
     ret
 
-
-
-%macro MMX_QUANT32_DC_START 0
-    mov         eax, [esp+ 4]   ; &dct[0][0]
-    movd        mm5, [esp+ 8]   ; i_qmf
-    movd        mm6, [esp+12]   ; i_qbits
-    movd        mm7, [esp+16]   ; f
-    punpckldq   mm5, mm5        ; i_qmf in each dword
-    punpckldq   mm7, mm7        ; f in each dword
-%endmacro
-
-%macro MMXEXT_QUANT32_1x4 5
-;;; %1      (m64)       dct[y][x]
-;;; %2,%3   (m64/mmx)   quant_mf[y][x] or quant_mf[0][0] (as int16_t)
-;;; %4      (mmx)       i_qbits in the low quadword
-;;; %5      (mmx)       f as doublewords
-;;; trashes mm0-mm4
-    movq        mm0, %1     ; load dct coeffs
-    pxor        mm4, mm4
-    pcmpgtw     mm4, mm0    ; sign(mm0)
-    pxor        mm0, mm4
-    psubw       mm0, mm4    ; abs(mm0)
-    movq        mm1, mm0
-    punpcklwd   mm0, mm0    ; duplicate the words for the upcomming
-    punpckhwd   mm1, mm1    ; 32 bit multiplication
-
-    movq        mm2, mm0    ; like in school ...
-    movq        mm3, mm1
-    pmulhuw     mm0, %2     ; ... multiply the parts ...
-    pmulhuw     mm1, %3
-    pmullw      mm2, %2
-    pmullw      mm3, %3
-    pslld       mm0, 16     ; ... shift ...
-    pslld       mm1, 16
-    paddd       mm0, mm2    ; ... and add them
-    paddd       mm1, mm3
-
-    paddd       mm0, %5     ; round with f
-    paddd       mm1, %5
-    psrad       mm0, %4
-    psrad       mm1, %4
-
-    packssdw    mm0, mm1    ; pack to int16_t
-    pxor        mm0, mm4    ; restore sign
-    psubw       mm0, mm4
-    movq        %1, mm0     ; store
-%endmacro
-
+%macro QUANT_SSE 1
 ;-----------------------------------------------------------------------------
-;   void __cdecl x264_quant_2x2_dc_core32_mmxext( int16_t dct[2][2],
-;       int const i_qmf, int const i_qbits, int const f );
+; void x264_quant_4x4_dc_sse2( int16_t dct[16], int mf, int bias )
 ;-----------------------------------------------------------------------------
-cglobal x264_quant_2x2_dc_core32_mmxext
-    MMX_QUANT32_DC_START
-    MMXEXT_QUANT32_1x4 [eax], mm5, mm5, mm6, mm7
-    ret
-
-;-----------------------------------------------------------------------------
-;   void __cdecl x264_quant_4x4_dc_core32_mmxext( int16_t dct[4][4],
-;       int const i_qmf, int const i_qbits, int const f );
-;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_dc_core32_mmxext
-    MMX_QUANT32_DC_START
-
-%rep 4
-    MMXEXT_QUANT32_1x4 [eax], mm5, mm5, mm6, mm7
-    add         eax, byte 8
+cglobal x264_quant_4x4_dc_%1
+    SSE2_QUANT_DC_START
+%assign x 0
+%rep 2
+    QUANT_1x8 [eax+x], xmm6, xmm7
+%assign x (x+16)
 %endrep
-
     ret
 
 ;-----------------------------------------------------------------------------
-;   void __cdecl x264_quant_4x4_core32_mmxext( int16_t dct[4][4],
-;       int const quant_mf[4][4], int const i_qbits, int const f );
+; void x264_quant_4x4_sse2( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
 ;-----------------------------------------------------------------------------
-cglobal x264_quant_4x4_core32_mmxext
-    MMX_QUANT_AC_START
-
-%rep 4
-    MMXEXT_QUANT32_1x4 [eax], [ecx], [ecx+8], mm6, mm7
-    add         eax, byte 8
-    add         ecx, byte 16
+cglobal x264_quant_4x4_%1
+    QUANT_AC_START
+%assign x 0
+%rep 2
+    QUANT_1x8 [eax+x], [ecx+x], [edx+x]
+%assign x (x+16)
 %endrep
-
     ret
 
 ;-----------------------------------------------------------------------------
-;   void __cdecl x264_quant_8x8_core32_mmxext( int16_t dct[8][8],
-;       int const quant_mf[8][8], int const i_qbits, int const f );
+; void x264_quant_8x8_sse2( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
 ;-----------------------------------------------------------------------------
-cglobal x264_quant_8x8_core32_mmxext
-    MMX_QUANT_AC_START
-
-%rep 16
-    MMXEXT_QUANT32_1x4 [eax], [ecx], [ecx+8], mm6, mm7
-    add         eax, byte 8
-    add         ecx, byte 16
+cglobal x264_quant_8x8_%1
+    QUANT_AC_START
+%assign x 0
+%rep 8
+    QUANT_1x8 [eax+x], [ecx+x], [edx+x]
+%assign x (x+16)
 %endrep
-
     ret
+%endmacro
+
+%define QUANT_1x8 SSE2_QUANT_1x8
+QUANT_SSE sse2
+%ifdef HAVE_SSE3
+%define QUANT_1x8 SSSE3_QUANT_1x8
+QUANT_SSE ssse3
+%endif
 
 
 ;=============================================================================
diff --git a/common/i386/quant.h b/common/i386/quant.h
index 1d4b51d9..8532fde9 100644
--- a/common/i386/quant.h
+++ b/common/i386/quant.h
@@ -23,40 +23,16 @@
 #ifndef _I386_QUANT_H
 #define _I386_QUANT_H 1
 
-void x264_quant_8x8_core15_mmx( int16_t dct[8][8],
-    int quant_mf[8][8], int const i_qbits, int const f );
-void x264_quant_4x4_core15_mmx( int16_t dct[4][4],
-    int quant_mf[4][4], int const i_qbits, int const f );
-void x264_quant_4x4_dc_core15_mmx( int16_t dct[4][4],
-    int const i_qmf, int const i_qbits, int const f );
-void x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2],
-    int const i_qmf, int const i_qbits, int const f );
-
-void x264_quant_8x8_core15_ssse3( int16_t dct[8][8],
-    int quant_mf[8][8], int const i_qbits, int const f );
-void x264_quant_4x4_core15_ssse3( int16_t dct[4][4],
-    int quant_mf[4][4], int const i_qbits, int const f );
-void x264_quant_4x4_dc_core15_ssse3( int16_t dct[4][4],
-    int const i_qmf, int const i_qbits, int const f );
-
-void x264_quant_8x8_core16_mmxext( int16_t dct[8][8],
-    int quant_mf[8][8], int const i_qbits, int const f );
-void x264_quant_4x4_core16_mmxext( int16_t dct[4][4],
-    int quant_mf[4][4], int const i_qbits, int const f );
-void x264_quant_4x4_dc_core16_mmxext( int16_t dct[4][4],
-    int const i_qmf, int const i_qbits, int const f );
-void x264_quant_2x2_dc_core16_mmxext( int16_t dct[2][2],
-    int const i_qmf, int const i_qbits, int const f );
-
-void x264_quant_8x8_core32_mmxext( int16_t dct[8][8],
-    int quant_mf[8][8], int const i_qbits, int const f );
-void x264_quant_4x4_core32_mmxext( int16_t dct[4][4],
-    int quant_mf[4][4], int const i_qbits, int const f );
-void x264_quant_4x4_dc_core32_mmxext( int16_t dct[4][4],
-    int const i_qmf, int const i_qbits, int const f );
-void x264_quant_2x2_dc_core32_mmxext( int16_t dct[2][2],
-    int const i_qmf, int const i_qbits, int const f );
-
+void x264_quant_2x2_dc_mmxext( int16_t dct[2][2], int mf, int bias ); /* DC variants: one scalar mf/bias for every coefficient */
+void x264_quant_4x4_dc_mmxext( int16_t dct[4][4], int mf, int bias );
+void x264_quant_4x4_mmx( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] ); /* AC variants: per-coefficient mf/bias tables */
+void x264_quant_8x8_mmx( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
+void x264_quant_4x4_dc_sse2( int16_t dct[4][4], int mf, int bias );
+void x264_quant_4x4_sse2( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
+void x264_quant_8x8_sse2( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
+void x264_quant_4x4_dc_ssse3( int16_t dct[4][4], int mf, int bias ); /* ssse3 symbols are only assembled when HAVE_SSE3 is defined */
+void x264_quant_4x4_ssse3( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
+void x264_quant_8x8_ssse3( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
 void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
 void x264_dequant_8x8_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
 
diff --git a/common/quant.c b/common/quant.c
index 9d0ffee4..1e990cb5 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -29,41 +29,41 @@
 #   include "ppc/quant.h"
 #endif
 
-#define QUANT_ONE( coef, mf ) \
+#define QUANT_ONE( coef, mf, f ) /* in place: coef = sign(coef) * (((f) + |coef|) * (mf) >> 16); every argument is parenthesized */ \
 { \
     if( (coef) > 0 ) \
-        (coef) = ( f + (coef) * (mf) ) >> i_qbits; \
+        (coef) = ((f) + (coef)) * (mf) >> 16; \
     else \
-        (coef) = - ( ( f - (coef) * (mf) ) >> i_qbits ); \
+        (coef) = - (((f) - (coef)) * (mf) >> 16); \
 }
 
-static void quant_8x8_core( int16_t dct[8][8], int quant_mf[8][8], int i_qbits, int f )
+static void quant_8x8( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] ) /* C reference: quantize 64 coefficients in place */
 {
     int i;
     for( i = 0; i < 64; i++ )
-        QUANT_ONE( dct[0][i], quant_mf[0][i] );
+        QUANT_ONE( dct[0][i], mf[i], bias[i] ); /* dct[0][i] walks the 8x8 block as one flat array of 64 */
 }
 
-static void quant_4x4_core( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f )
+static void quant_4x4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] ) /* C reference: quantize 16 coefficients in place */
 {
     int i;
     for( i = 0; i < 16; i++ )
-        QUANT_ONE( dct[0][i], quant_mf[0][i] );
+        QUANT_ONE( dct[0][i], mf[i], bias[i] ); /* per-coefficient multiplier and rounding bias */
 }
 
-static void quant_4x4_dc_core( int16_t dct[4][4], int i_quant_mf, int i_qbits, int f )
+static void quant_4x4_dc( int16_t dct[4][4], int mf, int bias ) /* C reference: 16 DC coefficients, one shared mf/bias */
 {
     int i;
     for( i = 0; i < 16; i++ )
-        QUANT_ONE( dct[0][i], i_quant_mf );
+        QUANT_ONE( dct[0][i], mf, bias ); /* same scalar mf and bias applied to every coefficient */
 }
 
-static void quant_2x2_dc_core( int16_t dct[2][2], int i_quant_mf, int i_qbits, int f )
+static void quant_2x2_dc( int16_t dct[2][2], int mf, int bias ) /* C reference: 4 DC coefficients, one shared mf/bias */
 {
-    QUANT_ONE( dct[0][0], i_quant_mf );
-    QUANT_ONE( dct[0][1], i_quant_mf );
-    QUANT_ONE( dct[0][2], i_quant_mf );
-    QUANT_ONE( dct[0][3], i_quant_mf );
+    QUANT_ONE( dct[0][0], mf, bias ); /* the 2x2 block is addressed as dct[0][0..3], i.e. as a flat array of 4 */
+    QUANT_ONE( dct[0][1], mf, bias );
+    QUANT_ONE( dct[0][2], mf, bias );
+    QUANT_ONE( dct[0][3], mf, bias );
 }
 
 #define DEQUANT_SHL( x ) \
@@ -195,117 +195,47 @@ void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_q
 
 void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
 {
-    int i, j, maxQ8=0, maxQ4=0, maxQdc=0;
-
-    pf->quant_8x8_core = quant_8x8_core;
-    pf->quant_4x4_core = quant_4x4_core;
-    pf->quant_4x4_dc_core = quant_4x4_dc_core;
-    pf->quant_2x2_dc_core = quant_2x2_dc_core;
+    pf->quant_8x8 = quant_8x8; /* start from the C versions; the CPU checks below overwrite them in order */
+    pf->quant_4x4 = quant_4x4;
+    pf->quant_4x4_dc = quant_4x4_dc;
+    pf->quant_2x2_dc = quant_2x2_dc;
 
     pf->dequant_4x4 = dequant_4x4;
     pf->dequant_8x8 = dequant_8x8;
 
-    /* determine the biggest coefficient in all quant8_mf tables */
-    for( j = 0; j < 2; j++ )
-        for( i = 0; i < 6*8*8; i++ )
-        {
-            int q = h->quant8_mf[j][0][0][i];
-            if( maxQ8 < q )
-                maxQ8 = q;
-        }
-
-    /* determine the biggest coefficient in all quant4_mf tables ( maxQ4 )
-       and the biggest DC coefficient if all quant4_mf tables ( maxQdc ) */
-    for( j = 0; j < 4; j++ )
-        for( i = 0; i < 6*4*4; i++ )
-        {
-            int q = h->quant4_mf[j][0][0][i];
-            if( maxQ4 < q )
-                maxQ4 = q;
-            if( maxQdc < q && i%16 == 0 )
-                maxQdc = q;
-        }
-
 #ifdef HAVE_MMX
-
-    /* select quant_8x8 based on CPU and maxQ8 */
-#if defined(ARCH_X86_64) && defined(HAVE_SSE3)
-    if( maxQ8 < (1<<15) && cpu&X264_CPU_SSSE3 )
-        pf->quant_8x8_core = x264_quant_8x8_core15_ssse3;
-    else
+    if( cpu&X264_CPU_MMX ) /* selection no longer depends on the magnitude of the CQM, only on CPU flags */
+    {
+#ifdef ARCH_X86 /* the MMX 4x4/8x8 quant functions are only assigned when ARCH_X86 is defined */
+        pf->quant_4x4 = x264_quant_4x4_mmx;
+        pf->quant_8x8 = x264_quant_8x8_mmx;
 #endif
-    if( maxQ8 < (1<<15) && cpu&X264_CPU_MMX )
-        pf->quant_8x8_core = x264_quant_8x8_core15_mmx;
-    else
-    if( maxQ8 < (1<<16) && cpu&X264_CPU_MMXEXT )
-        pf->quant_8x8_core = x264_quant_8x8_core16_mmxext;
-    else
-    if( cpu&X264_CPU_MMXEXT )
-        pf->quant_8x8_core = x264_quant_8x8_core32_mmxext;
+        pf->dequant_4x4 = x264_dequant_4x4_mmx;
+        pf->dequant_8x8 = x264_dequant_8x8_mmx;
+    }
 
-    /* select quant_4x4 based on CPU and maxQ4 */
-#if defined(ARCH_X86_64) && defined(HAVE_SSE3)
-    if( maxQ4 < (1<<15) && cpu&X264_CPU_SSSE3 )
-        pf->quant_4x4_core = x264_quant_4x4_core15_ssse3;
-    else
-#endif
-    if( maxQ4 < (1<<15) && cpu&X264_CPU_MMX )
-        pf->quant_4x4_core = x264_quant_4x4_core15_mmx;
-    else
-    if( maxQ4 < (1<<16) && cpu&X264_CPU_MMXEXT )
-        pf->quant_4x4_core = x264_quant_4x4_core16_mmxext;
-    else
     if( cpu&X264_CPU_MMXEXT )
-        pf->quant_4x4_core = x264_quant_4x4_core32_mmxext;
-
-    /* select quant_XxX_dc based on CPU and maxQdc */
-    if( maxQdc < (1<<16) && cpu&X264_CPU_MMXEXT )
-    {
-        pf->quant_4x4_dc_core = x264_quant_4x4_dc_core16_mmxext;
-        pf->quant_2x2_dc_core = x264_quant_2x2_dc_core16_mmxext;
-    }
-    else
-    if( maxQdc < (1<<15) && cpu&X264_CPU_MMX )
     {
-        pf->quant_4x4_dc_core = x264_quant_4x4_dc_core15_mmx;
-        pf->quant_2x2_dc_core = x264_quant_2x2_dc_core15_mmx;
+        pf->quant_2x2_dc = x264_quant_2x2_dc_mmxext; /* the only 2x2 DC asm version; never overridden below */
+#ifdef ARCH_X86
+        pf->quant_4x4_dc = x264_quant_4x4_dc_mmxext;
+#endif
     }
-    else
-    if( cpu&X264_CPU_MMXEXT )
+
+    if( cpu&X264_CPU_SSE2 ) /* checked after MMX/MMXEXT, so the SSE2 versions take precedence */
     {
-        pf->quant_4x4_dc_core = x264_quant_4x4_dc_core32_mmxext;
-        pf->quant_2x2_dc_core = x264_quant_2x2_dc_core32_mmxext;
+        pf->quant_4x4_dc = x264_quant_4x4_dc_sse2;
+        pf->quant_4x4 = x264_quant_4x4_sse2;
+        pf->quant_8x8 = x264_quant_8x8_sse2;
     }
-
-#if defined(ARCH_X86_64) && defined(HAVE_SSE3)
-    if( maxQdc < (1<<15) && cpu&X264_CPU_SSSE3 )
-        pf->quant_4x4_dc_core = x264_quant_4x4_dc_core15_ssse3;
 #endif
 
-    if( cpu&X264_CPU_MMX )
+#ifdef HAVE_SSE3 /* same build switch that gates the ssse3 functions in the asm */
+    if( cpu&X264_CPU_SSSE3 ) /* checked last: SSSE3 overrides SSE2 when both flags are set */
     {
-        /* dequant is not subject to the above CQM-dependent overflow issues,
-         * as long as the inputs are in the range generable by dct+quant.
-         * that is not guaranteed by the standard, but is true within x264 */
-        pf->dequant_4x4 = x264_dequant_4x4_mmx;
-        pf->dequant_8x8 = x264_dequant_8x8_mmx;
-    }
-#endif  /* HAVE_MMX */
-    
-#ifdef ARCH_PPC
-    if( cpu&X264_CPU_ALTIVEC ) {
-        if( maxQ8 < (1<<16) )
-        {
-            pf->quant_8x8_core = x264_quant_8x8_altivec;
-        }
-        if( maxQ4 < (1<<16) )
-        {
-            pf->quant_4x4_core = x264_quant_4x4_altivec;
-        }
-        if( maxQdc < (1<<16) )
-        {
-           pf->quant_4x4_dc_core = x264_quant_4x4_dc_altivec;
-        }
+        pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
+        pf->quant_4x4 = x264_quant_4x4_ssse3;
+        pf->quant_8x8 = x264_quant_8x8_ssse3;
     }
-#endif /* ARCH_PPC */
+#endif
 }
diff --git a/common/quant.h b/common/quant.h
index 3294df59..0fe7d0c9 100644
--- a/common/quant.h
+++ b/common/quant.h
@@ -25,10 +25,10 @@
 
 typedef struct
 {
-    void (*quant_8x8_core)( int16_t dct[8][8], int quant_mf[8][8], int i_qbits, int f );
-    void (*quant_4x4_core)( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f );
-    void (*quant_4x4_dc_core)( int16_t dct[4][4], int i_quant_mf, int i_qbits, int f );
-    void (*quant_2x2_dc_core)( int16_t dct[2][2], int i_quant_mf, int i_qbits, int f );
+    void (*quant_8x8)( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
+    void (*quant_4x4)( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
+    void (*quant_4x4_dc)( int16_t dct[4][4], int mf, int bias );
+    void (*quant_2x2_dc)( int16_t dct[2][2], int mf, int bias );
 
     void (*dequant_4x4)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
     void (*dequant_8x8)( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
diff --git a/common/set.c b/common/set.c
index f6dc7c3d..9ae8aa5d 100644
--- a/common/set.c
+++ b/common/set.c
@@ -24,6 +24,9 @@
 #include <stdio.h>
 #include <string.h>
 
+#define SHIFT(x,s) ((s)<0 ? (x)<<-(s) : (s)==0 ? (x) : ((x)+(1<<((s)-1)))>>(s)) /* x scaled by 2^-s: left shift when s<0, right shift with round-to-nearest when s>0 */
+#define DIV(n,d) (((n) + ((d)>>1)) / (d)) /* n/d rounded to nearest instead of truncated (for non-negative n and positive d) */
+
 static const int dequant4_scale[6][3] =
 {
     { 10, 13, 16 },
@@ -66,13 +69,19 @@ static const int quant8_scale[6][6] =
     {  7282,  6428, 11570,  6830,  9118,  8640 }
 };
 
-void x264_cqm_init( x264_t *h )
+int x264_cqm_init( x264_t *h ) /* returns 0 on success, -1 if the CQM overflows 16-bit mf within the allowed QP range */
 {
     int def_quant4[6][16];
     int def_quant8[6][64];
     int def_dequant4[6][16];
     int def_dequant8[6][64];
+    int quant4_mf[4][6][4][4]; /* full-precision per-(q%6) multipliers; h->quant4_mf now holds the per-QP 16-bit tables */
+    int quant8_mf[2][6][8][8];
     int q, i, j, i_list;
+    int deadzone[4] = { 32 - h->param.analyse.i_luma_deadzone[1], /* indexed by list (& 3): [0],[1] come from the luma deadzone params */
+                        32 - h->param.analyse.i_luma_deadzone[0], 
+                        32 - 11, 32 - 21 }; /* [2],[3] are fixed; NOTE(review): presumably the chroma intra/inter lists -- confirm against CQM_4IC/CQM_4PC */
+    int max_qp_err = -1; /* highest QP at which some mf exceeded 0xffff; -1 means none did */
 
     for( i = 0; i < 6; i++ )
     {
@@ -88,10 +97,19 @@ void x264_cqm_init( x264_t *h )
         }
         else
         {
-            h->  quant4_mf[i] = x264_malloc( 6*size*sizeof(int) );
+            h->  quant4_mf[i] = x264_malloc(52*size*sizeof(uint16_t) ); /* one 16-bit table per QP (52) instead of one int table per q%6 */
             h->dequant4_mf[i] = x264_malloc( 6*size*sizeof(int) );
             h->unquant4_mf[i] = x264_malloc(52*size*sizeof(int) );
         }
+
+        for( j = (i<4 ? 0 : 4); j < i; j++ ) /* search earlier lists of the same group (0..3 or 4..5) for one to share with */
+            if( deadzone[j&3] == deadzone[i&3] && /* bias depends on the deadzone too, so both must match */
+                !memcmp( h->pps->scaling_list[i], h->pps->scaling_list[j], size*sizeof(uint8_t) ) )
+                break;
+        if( j < i )
+            h->quant4_bias[i] = h->quant4_bias[j]; /* alias the earlier table rather than allocating a duplicate */
+        else
+            h->quant4_bias[i] = x264_malloc(52*size*sizeof(uint16_t) );
     }
 
     for( q = 0; q < 6; q++ )
@@ -116,24 +134,47 @@ void x264_cqm_init( x264_t *h )
             for( i = 0; i < 16; i++ )
             {
                 h->dequant4_mf[i_list][q][0][i] = def_dequant4[q][i] * h->pps->scaling_list[i_list][i];
-                h->  quant4_mf[i_list][q][0][i] = def_quant4[q][i] * 16 / h->pps->scaling_list[i_list][i];
+                     quant4_mf[i_list][q][0][i] = DIV(def_quant4[q][i] * 16, h->pps->scaling_list[i_list][i]); /* local int table, rounded to nearest */
             }
         for( i_list = 0; i_list < 2; i_list++ )
             for( i = 0; i < 64; i++ )
             {
                 h->dequant8_mf[i_list][q][0][i] = def_dequant8[q][i] * h->pps->scaling_list[4+i_list][i];
-                h->  quant8_mf[i_list][q][0][i] = def_quant8[q][i] * 16 / h->pps->scaling_list[4+i_list][i];
+                     quant8_mf[i_list][q][0][i] = DIV(def_quant8[q][i] * 16, h->pps->scaling_list[4+i_list][i]); /* local int table, rounded to nearest */
             }
     }
     for( q = 0; q < 52; q++ )
     {
         for( i_list = 0; i_list < 4; i_list++ )
             for( i = 0; i < 16; i++ )
-                h->unquant4_mf[i_list][q][i] = (1 << (q/6 + 15 + 8)) / h->quant4_mf[i_list][q%6][0][i];
+            {
+                h->unquant4_mf[i_list][q][i] = (1 << (q/6 + 15 + 8)) / quant4_mf[i_list][q%6][0][i];
+                h->  quant4_mf[i_list][q][i] = j = SHIFT(quant4_mf[i_list][q%6][0][i], q/6 - 1); /* j keeps the untruncated int value; the stored copy is 16-bit */
+                // round to nearest, unless that would cause the deadzone to be negative
+                h->quant4_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j );
+                if( j > 0xffff && q > max_qp_err ) /* multiplier does not fit in uint16_t at this QP */
+                    max_qp_err = q;
+            }
+        if( h->param.analyse.b_transform_8x8 ) /* guards the 8x8 for-loop nest that follows (same indentation, no braces) */
         for( i_list = 0; i_list < 2; i_list++ )
             for( i = 0; i < 64; i++ )
-                h->unquant8_mf[i_list][q][i] = (1 << (q/6 + 16 + 8)) / h->quant8_mf[i_list][q%6][0][i];
+            {
+                h->unquant8_mf[i_list][q][i] = (1 << (q/6 + 16 + 8)) / quant8_mf[i_list][q%6][0][i];
+                h->  quant8_mf[i_list][q][i] = j = SHIFT(quant8_mf[i_list][q%6][0][i], q/6); /* 8x8 uses shift q/6, the 4x4 case uses q/6 - 1 */
+                h->quant8_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j );
+                if( j > 0xffff && q > max_qp_err )
+                    max_qp_err = q;
+            }
     }
+
+    if( !h->mb.b_lossless && max_qp_err >= h->param.rc.i_qp_min ) /* overflowing QPs are fatal only if the configured QP range can reach them */
+    {
+        x264_log( h, X264_LOG_ERROR, "Quantization overflow.\n" );
+        x264_log( h, X264_LOG_ERROR, "Your CQM is incompatible with QP < %d, but min QP is set to %d\n",
+                  max_qp_err+1, h->param.rc.i_qp_min );
+        return -1; /* NOTE(review): the tables allocated earlier in this function are not released here -- caller must clean up */
+    }
+    return 0;
 }
 
 void x264_cqm_delete( x264_t *h )
diff --git a/common/set.h b/common/set.h
index 86de444a..a018e7a0 100644
--- a/common/set.h
+++ b/common/set.h
@@ -218,7 +218,7 @@ static const uint8_t * const x264_cqm_jvt[6] =
     x264_cqm_jvt8i, x264_cqm_jvt8p
 };
 
-void x264_cqm_init( x264_t *h );
+int  x264_cqm_init( x264_t *h );
 void x264_cqm_delete( x264_t *h );
 int  x264_cqm_parse_file( x264_t *h, const char *filename );
 
diff --git a/encoder/encoder.c b/encoder/encoder.c
index e82445f9..d76994cd 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -410,6 +410,14 @@ static int x264_validate_parameters( x264_t *h )
         h->param.analyse.i_noise_reduction = 0;
         h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 1, 6 );
     }
+    if( h->param.rc.i_rc_method == X264_RC_CQP ) /* constant-QP mode: replace qp_min/qp_max with the range the I/P/B QPs span */
+    {
+        float qp_p = h->param.rc.i_qp_constant;
+        float qp_i = qp_p - 6*log(h->param.rc.f_ip_factor)/log(2); /* 6 QP steps per doubling of the factor (6*log2) */
+        float qp_b = qp_p + 6*log(h->param.rc.f_pb_factor)/log(2);
+        h->param.rc.i_qp_min = x264_clip3( (int)(X264_MIN3( qp_p, qp_i, qp_b )), 0, 51 ); /* lower bound: cast truncates the fraction */
+        h->param.rc.i_qp_max = x264_clip3( (int)(X264_MAX3( qp_p, qp_i, qp_b ) + .999), 0, 51 ); /* upper bound: +.999 before the cast rounds up */
+    }
 
     if( ( h->param.i_width % 16 || h->param.i_height % 16 ) && !h->mb.b_lossless )
     {
@@ -438,8 +446,6 @@ static int x264_validate_parameters( x264_t *h )
     h->param.i_deblocking_filter_beta    = x264_clip3( h->param.i_deblocking_filter_beta, -6, 6 );
     h->param.analyse.i_luma_deadzone[0] = x264_clip3( h->param.analyse.i_luma_deadzone[0], 0, 32 );
     h->param.analyse.i_luma_deadzone[1] = x264_clip3( h->param.analyse.i_luma_deadzone[1], 0, 32 );
-    h->mb.i_luma_deadzone[0] = 32 - h->param.analyse.i_luma_deadzone[0];
-    h->mb.i_luma_deadzone[1] = 32 - h->param.analyse.i_luma_deadzone[1];
 
     h->param.i_cabac_init_idc = x264_clip3( h->param.i_cabac_init_idc, 0, 2 );
 
@@ -625,7 +631,12 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
 
     x264_validate_levels( h );
 
-    x264_cqm_init( h );
+    if( x264_cqm_init( h ) < 0 ) /* CQM overflows at a QP inside [qp_min, qp_max]: refuse to open the encoder */
+    {
+        x264_cqm_delete( h ); /* cqm_init allocates all of its tables before it can fail; release them, not just h */
+        x264_free( h );
+        return NULL;
+    }
     
     h->mb.i_mb_count = h->sps->i_mb_width * h->sps->i_mb_height;
 
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index be0ee357..ec7e9b80 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -38,42 +38,6 @@ static inline void zigzag_scan_2x2_dc( int level[4], int16_t dct[2][2] )
 }
 #undef ZIG
 
-static void quant_8x8( x264_t *h, int16_t dct[8][8], int quant_mf[6][8][8], int i_qscale, int b_intra )
-{
-    const int i_qbits = 16 + i_qscale / 6;
-    const int i_mf = i_qscale % 6;
-    const int f = h->mb.i_luma_deadzone[b_intra] << (i_qbits-6);
-    h->quantf.quant_8x8_core( dct, quant_mf[i_mf], i_qbits, f );
-}
-static void quant_4x4( x264_t *h, int16_t dct[4][4], int quant_mf[6][4][4], int i_qscale, int b_intra )
-{
-    const int i_qbits = 15 + i_qscale / 6;
-    const int i_mf = i_qscale % 6;
-    const int f = h->mb.i_luma_deadzone[b_intra] << (i_qbits-6);
-    h->quantf.quant_4x4_core( dct, quant_mf[i_mf], i_qbits, f );
-}
-static void quant_4x4_chroma( x264_t *h, int16_t dct[4][4], int quant_mf[6][4][4], int i_qscale, int b_intra )
-{
-    const int i_qbits = 15 + i_qscale / 6;
-    const int i_mf = i_qscale % 6;
-    const int f = ( 1 << (i_qbits + b_intra) ) / 6;
-    h->quantf.quant_4x4_core( dct, quant_mf[i_mf], i_qbits, f );
-}
-static void quant_4x4_dc( x264_t *h, int16_t dct[4][4], int quant_mf[6][4][4], int i_qscale )
-{
-    const int i_qbits = 16 + i_qscale / 6;
-    const int i_mf = i_qscale % 6;
-    const int f = h->mb.i_luma_deadzone[1] << (i_qbits-6);
-    h->quantf.quant_4x4_dc_core( dct, quant_mf[i_mf][0][0], i_qbits, f );
-}
-static void quant_2x2_dc( x264_t *h, int16_t dct[2][2], int quant_mf[6][4][4], int i_qscale, int b_intra )
-{
-    const int i_qbits = 16 + i_qscale / 6;
-    const int i_mf = i_qscale % 6;
-    const int f = ( 1 << (i_qbits + b_intra) ) / 6;
-    h->quantf.quant_2x2_dc_core( dct, quant_mf[i_mf][0][0], i_qbits, f );
-}
-
 /* (ref: JVT-B118)
  * x264_mb_decimate_score: given dct coeffs it returns a score to see if we could empty this dct coeffs
  * to 0 (low score means set it to null)
@@ -137,7 +101,7 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale )
     if( h->mb.b_trellis )
         x264_quant_4x4_trellis( h, dct4x4, CQM_4IY, i_qscale, DCT_LUMA_4x4, 1 );
     else
-        quant_4x4( h, dct4x4, h->quant4_mf[CQM_4IY], i_qscale, 1 );
+        h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4IY][i_qscale], h->quant4_bias[CQM_4IY][i_qscale] );
 
     h->zigzagf.scan_4x4( h->dct.block[idx].luma4x4, dct4x4 );
     h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qscale );
@@ -159,7 +123,7 @@ void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qscale )
     if( h->mb.b_trellis )
         x264_quant_8x8_trellis( h, dct8x8, CQM_8IY, i_qscale, 1 );
     else 
-        quant_8x8( h, dct8x8, h->quant8_mf[CQM_8IY], i_qscale, 1 );
+        h->quantf.quant_8x8( dct8x8, h->quant8_mf[CQM_8IY][i_qscale], h->quant8_bias[CQM_8IY][i_qscale] );
 
     h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8 );
     h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qscale );
@@ -199,14 +163,14 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
         if( h->mb.b_trellis )
             x264_quant_4x4_trellis( h, dct4x4[1+i], CQM_4IY, i_qscale, DCT_LUMA_AC, 1 );
         else
-            quant_4x4( h, dct4x4[1+i], h->quant4_mf[CQM_4IY], i_qscale, 1 );
+            h->quantf.quant_4x4( dct4x4[1+i], h->quant4_mf[CQM_4IY][i_qscale], h->quant4_bias[CQM_4IY][i_qscale] );
 
         h->zigzagf.scan_4x4ac( h->dct.block[i].residual_ac, dct4x4[1+i] );
         h->quantf.dequant_4x4( dct4x4[1+i], h->dequant4_mf[CQM_4IY], i_qscale );
     }
 
     h->dctf.dct4x4dc( dct4x4[0] );
-    quant_4x4_dc( h, dct4x4[0], h->quant4_mf[CQM_4IY], i_qscale );
+    h->quantf.quant_4x4_dc( dct4x4[0], h->quant4_mf[CQM_4IY][i_qscale][0]>>1, h->quant4_bias[CQM_4IY][i_qscale][0]<<1 );
     h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct4x4[0] );
 
     /* output samples to fdec */
@@ -258,7 +222,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
             dct2x2[block_idx_y[i]][block_idx_x[i]] = dct4x4[i][0][0];
 
             /* no trellis; it doesn't seem to help chroma noticeably */
-            quant_4x4_chroma( h, dct4x4[i], h->quant4_mf[CQM_4IC + b_inter], i_qscale, !b_inter );
+            h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qscale], h->quant4_bias[CQM_4IC+b_inter][i_qscale] );
             h->zigzagf.scan_4x4ac( h->dct.block[16+i+ch*4].residual_ac, dct4x4[i] );
 
             if( b_decimate )
@@ -268,7 +232,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
         }
 
         h->dctf.dct2x2dc( dct2x2 );
-        quant_2x2_dc( h, dct2x2, h->quant4_mf[CQM_4IC + b_inter], i_qscale, !b_inter );
+        h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qscale][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qscale][0]<<1 );
         zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
 
         /* output samples to fdec */
@@ -466,7 +430,7 @@ void x264_macroblock_encode( x264_t *h )
                 if( h->mb.b_trellis )
                     x264_quant_8x8_trellis( h, dct8x8[idx], CQM_8PY, i_qp, 0 );
                 else
-                    quant_8x8( h, dct8x8[idx], h->quant8_mf[CQM_8PY], i_qp, 0 );
+                    h->quantf.quant_8x8( dct8x8[idx], h->quant8_mf[CQM_8PY][i_qp], h->quant8_bias[CQM_8PY][i_qp] );
 
                 h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8[idx] );
 
@@ -518,7 +482,7 @@ void x264_macroblock_encode( x264_t *h )
                     if( h->mb.b_trellis )
                         x264_quant_4x4_trellis( h, dct4x4[idx], CQM_4PY, i_qp, DCT_LUMA_4x4, 0 );
                     else
-                        quant_4x4( h, dct4x4[idx], h->quant4_mf[CQM_4PY], i_qp, 0 );
+                        h->quantf.quant_4x4( dct4x4[idx], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
 
                     h->zigzagf.scan_4x4( h->dct.block[idx].luma4x4, dct4x4[idx] );
                     
@@ -673,7 +637,7 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir )
         {
             const int idx = i8x8 * 4 + i4x4;
 
-            quant_4x4( h, dct4x4[idx], h->quant4_mf[CQM_4PY], i_qp, 0 );
+            h->quantf.quant_4x4( dct4x4[idx], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
             h->zigzagf.scan_4x4( dctscan, dct4x4[idx] );
 
             i_decimate_mb += x264_mb_decimate_score( dctscan, 16 );
@@ -709,7 +673,7 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir )
         dct2x2[1][0] = dct4x4[2][0][0];
         dct2x2[1][1] = dct4x4[3][0][0];
         h->dctf.dct2x2dc( dct2x2 );
-        quant_2x2_dc( h, dct2x2, h->quant4_mf[CQM_4PC], i_qp, 0 );
+        h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 );
         if( dct2x2[0][0] || dct2x2[0][1] || dct2x2[1][0] || dct2x2[1][1]  )
         {
             /* can't be */
@@ -719,7 +683,7 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir )
         /* calculate dct coeffs */
         for( i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
         {
-            quant_4x4_chroma( h, dct4x4[i4x4], h->quant4_mf[CQM_4PC], i_qp, 0 );
+            h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
             h->zigzagf.scan_4x4ac( dctscan, dct4x4[i4x4] );
 
             i_decimate_mb += x264_mb_decimate_score( dctscan, 15 );
@@ -811,7 +775,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
     {
         DECLARE_ALIGNED( int16_t, dct8x8[8][8], 16 );
         h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
-        quant_8x8( h, dct8x8, h->quant8_mf[CQM_8PY], i_qp, 0 );
+        h->quantf.quant_8x8( dct8x8, h->quant8_mf[CQM_8PY][i_qp], h->quant8_bias[CQM_8PY][i_qp] );
         h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 );
 
         if( b_decimate )
@@ -830,10 +794,10 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
         int i4;
         DECLARE_ALIGNED( int16_t, dct4x4[4][4][4], 16 );
         h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
-        quant_4x4( h, dct4x4[0], h->quant4_mf[CQM_4PY], i_qp, 0 );
-        quant_4x4( h, dct4x4[1], h->quant4_mf[CQM_4PY], i_qp, 0 );
-        quant_4x4( h, dct4x4[2], h->quant4_mf[CQM_4PY], i_qp, 0 );
-        quant_4x4( h, dct4x4[3], h->quant4_mf[CQM_4PY], i_qp, 0 );
+        h->quantf.quant_4x4( dct4x4[0], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
+        h->quantf.quant_4x4( dct4x4[1], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
+        h->quantf.quant_4x4( dct4x4[2], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
+        h->quantf.quant_4x4( dct4x4[3], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
         for( i4 = 0; i4 < 4; i4++ )
             h->zigzagf.scan_4x4( h->dct.block[i8*4+i4].luma4x4, dct4x4[i4] );
 
@@ -864,7 +828,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
         p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
 
         h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
-        quant_4x4_chroma( h, dct4x4, h->quant4_mf[CQM_4PC], i_qp, 0 );
+        h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
         h->zigzagf.scan_4x4ac( h->dct.block[16+i8+ch*4].residual_ac, dct4x4 );
         if( array_non_zero( dct4x4 ) )
         {
diff --git a/encoder/rdo.c b/encoder/rdo.c
index 3c827cb7..d2fa78b1 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -282,7 +282,7 @@ typedef struct {
 // and uses the dct scaling factors, not the idct ones.
 
 static void quant_trellis_cabac( x264_t *h, int16_t *dct,
-                                 const int *quant_mf, const int *unquant_mf,
+                                 const uint16_t *quant_mf, const int *unquant_mf,
                                  const int *coef_weight, const int *zigzag,
                                  int i_ctxBlockCat, int i_qbits, int i_lambda2, int b_ac, int i_coefs )
 {
@@ -294,7 +294,7 @@ static void quant_trellis_cabac( x264_t *h, int16_t *dct,
     uint8_t cabac_state_sig[64];
     uint8_t cabac_state_last[64];
     const int b_interlaced = h->mb.b_interlaced;
-    const int f = 1 << (i_qbits-1); // no deadzone
+    const int f = 1 << 15; // no deadzone
     int i_last_nnz = -1;
     int i, j;
 
@@ -359,7 +359,7 @@ static void quant_trellis_cabac( x264_t *h, int16_t *dct,
     for( i = i_last_nnz; i >= b_ac; i-- )
     {
         int i_coef = abs_coefs[i];
-        int q = ( f + i_coef * quant_mf[zigzag[i]] ) >> i_qbits;
+        int q = ( f + i_coef * quant_mf[zigzag[i]] ) >> 16;
         int abs_level;
         int cost_sig[2], cost_last[2];
         trellis_node_t n;
@@ -488,7 +488,7 @@ void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
                           << (2*i_qbits)) >> LAMBDA_BITS;
 
     quant_trellis_cabac( h, (int16_t*)dct,
-        (int*)h->quant4_mf[i_quant_cat][i_mf], h->unquant4_mf[i_quant_cat][i_qp],
+        h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
         x264_dct4_weight2_zigzag[h->mb.b_interlaced],
         x264_zigzag_scan4[h->mb.b_interlaced],
         i_ctxBlockCat, 15+i_qbits, i_lambda2, b_ac, 16 );
@@ -505,7 +505,7 @@ void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
                           << (2*i_qbits)) >> LAMBDA_BITS;
 
     quant_trellis_cabac( h, (int16_t*)dct,
-        (int*)h->quant8_mf[i_quant_cat][i_mf], h->unquant8_mf[i_quant_cat][i_qp],
+        h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
         x264_dct8_weight2_zigzag[h->mb.b_interlaced],
         x264_zigzag_scan8[h->mb.b_interlaced],
         DCT_LUMA_8x8, 16+i_qbits, i_lambda2, 0, 64 );
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 3152a5af..8e77fbb7 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -478,11 +478,12 @@ static int check_quant( int cpu_ref, int cpu_new )
     uint8_t cqm_buf[64] __attribute__((__aligned__(16)));
     int ret = 0, ok, used_asm;
     int oks[2] = {1,1}, used_asms[2] = {0,0};
-    int i, i_cqm;
+    int i, i_cqm, qp;
     x264_t h_buf;
     x264_t *h = &h_buf;
     h->pps = h->pps_array;
     x264_param_default( &h->param );
+    h->param.rc.i_qp_min = 26;
 
     for( i_cqm = 0; i_cqm < 4; i_cqm++ )
     {
@@ -533,112 +534,74 @@ static int check_quant( int cpu_ref, int cpu_new )
                 } \
         }
 
-#define TEST_QUANT( name, cqm ) \
+#define TEST_QUANT_DC( name, cqm ) \
         if( qf_a.name != qf_ref.name ) \
         { \
-            used_asms[0] = 1; \
-            for( i = 0; i < 64; i++ ) \
-                dct1[i] = dct2[i] = (rand() & 0x1fff) - 0xfff; \
-            qf_c.name( (void*)dct1, cqm, 20, (1<<20)/6 ); \
-            qf_a.name( (void*)dct2, cqm, 20, (1<<20)/6 ); \
-            if( memcmp( dct1, dct2, 64*2 ) )       \
-            { \
-                oks[0] = 0; \
-                fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \
-            } \
-        }
-
-#define TEST_QUANT8( qname, cqm, shift, divider ) \
-        if( qf_a.qname != qf_ref.qname ) \
-        { \
-            int qp; \
             used_asms[0] = 1; \
             for( qp = 51; qp > 0; qp-- ) \
             { \
-                INIT_QUANT8() \
-                qf_c.qname( (void*)dct1, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \
-                qf_a.qname( (void*)dct2, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \
-                if( memcmp( dct1, dct2, 64*2 ) ) \
+                for( i = 0; i < 16; i++ ) \
+                    dct1[i] = dct2[i] = (rand() & 0x1fff) - 0xfff; \
+                qf_c.name( (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
+                qf_a.name( (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
+                if( memcmp( dct1, dct2, 16*2 ) )       \
                 { \
                     oks[0] = 0; \
-                    fprintf( stderr, #qname "(qp=%d, cqm=%d, intra=%d): [FAILED]\n", qp, i_cqm, divider==3 ); \
+                    fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \
                     break; \
                 } \
             } \
         }
 
-#define TEST_QUANT4( qname, cqm, shift, divider ) \
+#define TEST_QUANT( qname, block, w ) \
         if( qf_a.qname != qf_ref.qname ) \
         { \
-            int qp; \
             used_asms[0] = 1; \
             for( qp = 51; qp > 0; qp-- ) \
             { \
-                INIT_QUANT4() \
-                qf_c.qname( (void*)dct1, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \
-                qf_a.qname( (void*)dct2, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \
-                if( memcmp( dct1, dct2, 16*2 ) ) \
+                INIT_QUANT##w() \
+                qf_c.qname( (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+                qf_a.qname( (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+                if( memcmp( dct1, dct2, w*w*2 ) ) \
                 { \
                     oks[0] = 0; \
-                    fprintf( stderr, #qname "(qp=%d, cqm=%d, intra=%d): [FAILED]\n", qp, i_cqm, divider==3 ); \
+                    fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
                     break; \
                 } \
             } \
         }
 
-        TEST_QUANT8( quant_8x8_core, h->quant8_mf[CQM_8IY], 16, 3 );
-        TEST_QUANT8( quant_8x8_core, h->quant8_mf[CQM_8PY], 16, 6 );
-        TEST_QUANT4( quant_4x4_core, h->quant4_mf[CQM_4IY], 15, 3 );
-        TEST_QUANT4( quant_4x4_core, h->quant4_mf[CQM_4PY], 15, 6 );
-        TEST_QUANT( quant_4x4_dc_core, ***h->quant4_mf[CQM_4IY] );
-        TEST_QUANT( quant_2x2_dc_core, ***h->quant4_mf[CQM_4IC] );
-
-#define TEST_DEQUANT8( qname, dqname, cqm, dqm, shift, divider ) \
-        if( qf_a.dqname != qf_ref.dqname ) \
-        { \
-            int qp; \
-            used_asms[1] = 1; \
-            for( qp = 51; qp > 0; qp-- ) \
-            { \
-                INIT_QUANT8() \
-                qf_c.qname( (void*)dct1, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \
-                memcpy( dct2, dct1, 64*2 ); \
-                qf_c.dqname( (void*)dct1, dqm, qp ); \
-                qf_a.dqname( (void*)dct2, dqm, qp ); \
-                if( memcmp( dct1, dct2, 64*2 ) ) \
-                { \
-                    oks[1] = 0; \
-                    fprintf( stderr, #dqname "(qp=%d, cqm=%d, intra=%d): [FAILED]\n", qp, i_cqm, divider==3 ); \
-                    break; \
-                } \
-            } \
-        }
+        TEST_QUANT( quant_8x8, CQM_8IY, 8 );
+        TEST_QUANT( quant_8x8, CQM_8PY, 8 );
+        TEST_QUANT( quant_4x4, CQM_4IY, 4 );
+        TEST_QUANT( quant_4x4, CQM_4PY, 4 );
+        TEST_QUANT_DC( quant_4x4_dc, **h->quant4_mf[CQM_4IY] );
+        TEST_QUANT_DC( quant_2x2_dc, **h->quant4_mf[CQM_4IC] );
 
-#define TEST_DEQUANT4( qname, dqname, cqm, dqm, shift, divider ) \
+#define TEST_DEQUANT( qname, dqname, block, w ) \
         if( qf_a.dqname != qf_ref.dqname ) \
         { \
-            int qp; \
             used_asms[1] = 1; \
             for( qp = 51; qp > 0; qp-- ) \
             { \
-                INIT_QUANT4() \
-                qf_c.qname( (void*)dct1, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \
-                memcpy( dct2, dct1, 16*2 ); \
-                qf_c.dqname( (void*)dct1, dqm, qp ); \
-                qf_a.dqname( (void*)dct2, dqm, qp ); \
-                if( memcmp( dct1, dct2, 16*2 ) ) \
+                INIT_QUANT##w() \
+                qf_c.qname( (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+                memcpy( dct2, dct1, w*w*2 ); \
+                qf_c.dqname( (void*)dct1, h->dequant##w##_mf[block], qp ); \
+                qf_a.dqname( (void*)dct2, h->dequant##w##_mf[block], qp ); \
+                if( memcmp( dct1, dct2, w*w*2 ) ) \
                 { \
                     oks[1] = 0; \
-                    fprintf( stderr, #dqname "(qp=%d, cqm=%d, intra=%d): [FAILED]\n", qp, i_cqm, divider==3 ); \
+                    fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
                     break; \
                 } \
             } \
         }
 
-        TEST_DEQUANT8( quant_8x8_core, dequant_8x8, h->quant8_mf[CQM_8IY], h->dequant8_mf[CQM_8IY], 16, 3 );
-        TEST_DEQUANT8( quant_8x8_core, dequant_8x8, h->quant8_mf[CQM_8PY], h->dequant8_mf[CQM_8PY], 16, 6 );
-        TEST_DEQUANT4( quant_4x4_core, dequant_4x4, h->quant4_mf[CQM_4IY], h->dequant4_mf[CQM_4IY], 15, 3 );
-        TEST_DEQUANT4( quant_4x4_core, dequant_4x4, h->quant4_mf[CQM_4PY], h->dequant4_mf[CQM_4PY], 15, 6 );
+        TEST_DEQUANT( quant_8x8, dequant_8x8, CQM_8IY, 8 );
+        TEST_DEQUANT( quant_8x8, dequant_8x8, CQM_8PY, 8 );
+        TEST_DEQUANT( quant_4x4, dequant_4x4, CQM_4IY, 4 );
+        TEST_DEQUANT( quant_4x4, dequant_4x4, CQM_4PY, 4 );
     }
 
     ok = oks[0]; used_asm = used_asms[0];