From 8aef0e941d986f10427cc2d3a848162065bdef3a Mon Sep 17 00:00:00 2001
From: Loren Merritt <pengvado@videolan.org>
Date: Wed, 14 Mar 2007 21:11:11 +0000
Subject: [PATCH] ssse3 detection. x86_64 ssse3 satd and quant. requires yasm
 >= 0.6.0

git-svn-id: svn://svn.videolan.org/x264/trunk@631 df754926-b1dd-0310-bc7b-ec298dee348c
---
 common/amd64/pixel-sse2.asm | 117 +++++++++++++++++++++---------------
 common/amd64/quant-a.asm    |  90 ++++++++++++++++++++++++++-
 common/cpu.c                |  10 +++
 common/i386/pixel.h         |   9 +++
 common/i386/quant.h         |   7 +++
 common/pixel.c              |  13 ++++
 common/quant.c              |  15 +++++
 configure                   |  20 ++++++
 encoder/encoder.c           |   1 +
 tools/checkasm.c            |  17 +++++-
 x264.h                      |   2 +
 11 files changed, 250 insertions(+), 51 deletions(-)

diff --git a/common/amd64/pixel-sse2.asm b/common/amd64/pixel-sse2.asm
index 9fc3755c..57a1a910 100644
--- a/common/amd64/pixel-sse2.asm
+++ b/common/amd64/pixel-sse2.asm
@@ -50,8 +50,15 @@ cglobal x264_pixel_satd_8x8_sse2
 cglobal x264_pixel_satd_16x8_sse2
 cglobal x264_pixel_satd_8x16_sse2
 cglobal x264_pixel_satd_16x16_sse2
+cglobal x264_pixel_satd_8x4_ssse3
+cglobal x264_pixel_satd_8x8_ssse3
+cglobal x264_pixel_satd_16x8_ssse3
+cglobal x264_pixel_satd_8x16_ssse3
+cglobal x264_pixel_satd_16x16_ssse3
 cglobal x264_pixel_sa8d_8x8_sse2
 cglobal x264_pixel_sa8d_16x16_sse2
+cglobal x264_pixel_sa8d_8x8_ssse3
+cglobal x264_pixel_sa8d_16x16_ssse3
 cglobal x264_intra_sa8d_x3_8x8_core_sse2
 cglobal x264_pixel_ssim_4x4x2_core_sse2
 cglobal x264_pixel_ssim_end4_sse2
@@ -267,6 +274,20 @@ x264_pixel_ssd_16x8_sse2:
     SUMSUB_BADC %5, %6, %7, %8
 %endmacro
 
+;;; row transform not used, because phaddw is much slower than paddw on a Conroe
+;%macro PHSUMSUB 3
+;    movdqa  %3, %1
+;    phaddw  %1, %2
+;    phsubw  %3, %2
+;%endmacro
+
+;%macro HADAMARD4x1_SSSE3 5  ; ABCD-T -> ADTC
+;    PHSUMSUB    %1, %2, %5
+;    PHSUMSUB    %3, %4, %2
+;    PHSUMSUB    %1, %3, %4
+;    PHSUMSUB    %5, %2, %3
+;%endmacro
+
 %macro SBUTTERFLY 5
     mov%1       %5, %3
     punpckl%2   %3, %4
@@ -318,6 +339,13 @@ x264_pixel_ssd_16x8_sse2:
     psubw       %1, %2
 %endmacro
 
+%macro LOAD_DIFF_4x8P 6 ; %1-%4: dest for rows 0-3, %5/%6: temps; expects r10 = 3*parm2q and r11 = 3*parm4q (sa8d_8x8 sets both via lea)
+    LOAD_DIFF_8P %1, %5, [parm1q],          [parm3q]
+    LOAD_DIFF_8P %2, %6, [parm1q+parm2q],   [parm3q+parm4q]
+    LOAD_DIFF_8P %3, %5, [parm1q+2*parm2q], [parm3q+2*parm4q]
+    LOAD_DIFF_8P %4, %6, [parm1q+r10],      [parm3q+r11]
+%endmacro
+
 %macro SUM1x8_SSE2 3    ; 01 junk sum
     pxor    %2, %2
     psubw   %2, %1
@@ -338,8 +366,7 @@ x264_pixel_ssd_16x8_sse2:
     paddusw %4, %2
 %endmacro
 
-;;; two SUM4x4_SSE2 running side-by-side
-%macro SUM4x4_TWO_SSE2 7    ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6) sum
+%macro SUM8x4_SSE2 7    ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6) sum
     pxor    %3, %3
     pxor    %6, %6
     psubw   %3, %1
@@ -358,18 +385,25 @@ x264_pixel_ssd_16x8_sse2:
     paddusw %7, %4
 %endmacro
 
-%macro SATD_TWO_SSE2 0
-    LOAD_DIFF_8P xmm0, xmm4, [parm1q],          [parm3q]
-    LOAD_DIFF_8P xmm1, xmm5, [parm1q+parm2q],   [parm3q+parm4q]
-    LOAD_DIFF_8P xmm2, xmm4, [parm1q+2*parm2q], [parm3q+2*parm4q]
-    LOAD_DIFF_8P xmm3, xmm5, [parm1q+r10],      [parm3q+r11]
-    lea          parm1q, [parm1q+4*parm2q]
-    lea          parm3q, [parm3q+4*parm4q]
+%macro SUM8x4_SSSE3 7    ; a02 a13 . b02 b13 . sum  (%3 and %6 are unused; arg list mirrors SUM8x4_SSE2)
+    pabsw   %1, %1       ; absolute value of each word, in place (inputs are clobbered)
+    pabsw   %2, %2
+    pabsw   %4, %4
+    pabsw   %5, %5
+    paddusw %1, %2       ; %1 = |a02| + |a13|, unsigned saturating
+    paddusw %4, %5       ; %4 = |b02| + |b13|
+    paddusw %7, %1       ; accumulate both halves into the running sum %7
+    paddusw %7, %4
+%endmacro
 
+%macro SATD_TWO_SSE2 0  ; 4 rows of 8 pixels; despite the name, also expanded by the ssse3 SATDS build (only SUM8x4 differs)
+    LOAD_DIFF_4x8P    xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+    lea     parm1q, [parm1q+4*parm2q]   ; advance both pixel pointers by 4 rows
+    lea     parm3q, [parm3q+4*parm4q]
     HADAMARD1x4       xmm0, xmm1, xmm2, xmm3
     TRANSPOSE2x4x4W   xmm0, xmm1, xmm2, xmm3, xmm4
     HADAMARD1x4       xmm0, xmm1, xmm2, xmm3
-    SUM4x4_TWO_SSE2   xmm0, xmm1, xmm4, xmm2, xmm3, xmm5, xmm6
+    SUM8x4            xmm0, xmm1, xmm4, xmm2, xmm3, xmm5, xmm6
 %endmacro
 
 %macro SATD_START 0
@@ -385,85 +419,72 @@ x264_pixel_ssd_16x8_sse2:
     ret
 %endmacro
 
+%macro SATDS 1
 ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   int x264_pixel_satd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_16x16_sse2:
+x264_pixel_satd_16x16_%1:
     SATD_START
     mov     r8,  rdi
     mov     r9,  rdx
-
     SATD_TWO_SSE2
     SATD_TWO_SSE2
     SATD_TWO_SSE2
     SATD_TWO_SSE2
-
     lea     rdi, [r8+8]
     lea     rdx, [r9+8]
-
     SATD_TWO_SSE2
     SATD_TWO_SSE2
     SATD_TWO_SSE2
     SATD_TWO_SSE2
-
     SATD_END
 
 ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   int x264_pixel_satd_8x16_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_8x16_sse2:
+x264_pixel_satd_8x16_%1:
     SATD_START
-
     SATD_TWO_SSE2
     SATD_TWO_SSE2
     SATD_TWO_SSE2
     SATD_TWO_SSE2
-
     SATD_END
 
 ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   int x264_pixel_satd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_16x8_sse2:
+x264_pixel_satd_16x8_%1:
     SATD_START
     mov     r8,  rdi
     mov     r9,  rdx
-
     SATD_TWO_SSE2
     SATD_TWO_SSE2
-
     lea     rdi, [r8+8]
     lea     rdx, [r9+8]
-
     SATD_TWO_SSE2
     SATD_TWO_SSE2
-
     SATD_END
 
 ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   int x264_pixel_satd_8x8_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_8x8_sse2:
+x264_pixel_satd_8x8_%1:
     SATD_START
-
     SATD_TWO_SSE2
     SATD_TWO_SSE2
-
     SATD_END
 
 ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_8x4_sse2:
+x264_pixel_satd_8x4_%1:
     SATD_START
-
     SATD_TWO_SSE2
-
     SATD_END
 
 
@@ -471,27 +492,21 @@ ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_sa8d_8x8_sse2:
+x264_pixel_sa8d_8x8_%1:
     lea  r10, [3*parm2q]
     lea  r11, [3*parm4q]
-    LOAD_DIFF_8P xmm0, xmm8, [parm1q],          [parm3q]
-    LOAD_DIFF_8P xmm1, xmm9, [parm1q+parm2q],   [parm3q+parm4q]
-    LOAD_DIFF_8P xmm2, xmm8, [parm1q+2*parm2q], [parm3q+2*parm4q]
-    LOAD_DIFF_8P xmm3, xmm9, [parm1q+r10],      [parm3q+r11]
+    LOAD_DIFF_4x8P xmm0, xmm1, xmm2, xmm3, xmm8, xmm8
     lea  parm1q, [parm1q+4*parm2q]
     lea  parm3q, [parm3q+4*parm4q]
-    LOAD_DIFF_8P xmm4, xmm8, [parm1q],          [parm3q]
-    LOAD_DIFF_8P xmm5, xmm9, [parm1q+parm2q],   [parm3q+parm4q]
-    LOAD_DIFF_8P xmm6, xmm8, [parm1q+2*parm2q], [parm3q+2*parm4q]
-    LOAD_DIFF_8P xmm7, xmm9, [parm1q+r10],      [parm3q+r11]
-    
+    LOAD_DIFF_4x8P xmm4, xmm5, xmm6, xmm7, xmm8, xmm8
+
     HADAMARD1x8  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
     TRANSPOSE8x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
     HADAMARD1x8  xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1
 
     pxor            xmm10, xmm10
-    SUM4x4_TWO_SSE2 xmm0, xmm1, xmm6, xmm2, xmm3, xmm9, xmm10
-    SUM4x4_TWO_SSE2 xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10
+    SUM8x4          xmm0, xmm1, xmm6, xmm2, xmm3, xmm9, xmm10
+    SUM8x4          xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10
     psrlw           xmm10, 1
     HADDW           xmm10, xmm0
     movd eax, xmm10
@@ -505,26 +520,34 @@ ALIGN 16
 ;   int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
 ;; violates calling convention
-x264_pixel_sa8d_16x16_sse2:
+x264_pixel_sa8d_16x16_%1:
     xor  r8d, r8d
-    call x264_pixel_sa8d_8x8_sse2 ; pix[0]
+    call x264_pixel_sa8d_8x8_%1 ; pix[0]
     lea  parm1q, [parm1q+4*parm2q]
     lea  parm3q, [parm3q+4*parm4q]
-    call x264_pixel_sa8d_8x8_sse2 ; pix[8*stride]
+    call x264_pixel_sa8d_8x8_%1 ; pix[8*stride]
     lea  r10, [3*parm2q-2]
     lea  r11, [3*parm4q-2]
     shl  r10, 2
     shl  r11, 2
     sub  parm1q, r10
     sub  parm3q, r11
-    call x264_pixel_sa8d_8x8_sse2 ; pix[8]
+    call x264_pixel_sa8d_8x8_%1 ; pix[8]
     lea  parm1q, [parm1q+4*parm2q]
     lea  parm3q, [parm3q+4*parm4q]
-    call x264_pixel_sa8d_8x8_sse2 ; pix[8*stride+8]
+    call x264_pixel_sa8d_8x8_%1 ; pix[8*stride+8]
     mov  eax, r8d
     add  eax, 1
     shr  eax, 1
     ret
+%endmacro ; SATDS
+
+%define SUM8x4 SUM8x4_SSE2
+SATDS sse2                 ; emits the x264_pixel_satd_*_sse2 / x264_pixel_sa8d_*_sse2 entry points
+%ifdef HAVE_SSE3           ; NOTE: this flag guards SSSE3 code; configure sets it when the assembler accepts pabsw
+%define SUM8x4 SUM8x4_SSSE3
+SATDS ssse3                ; same bodies, with the abs-sum step done by SUM8x4_SSSE3
+%endif ; HAVE_SSE3
 
 
 
@@ -567,7 +590,7 @@ x264_intra_sa8d_x3_8x8_core_sse2:
     movdqa      xmm9, xmm3
     movdqa      xmm10, xmm4
     movdqa      xmm11, xmm5
-    SUM4x4_TWO_SSE2 xmm8, xmm9, xmm12, xmm10, xmm11, xmm13, xmm15
+    SUM8x4_SSE2 xmm8, xmm9, xmm12, xmm10, xmm11, xmm13, xmm15
     movdqa      xmm8, xmm6
     movdqa      xmm9, xmm7
     SUM4x4_SSE2 xmm8, xmm9, xmm10, xmm15
diff --git a/common/amd64/quant-a.asm b/common/amd64/quant-a.asm
index fcb5db1a..ba7058de 100644
--- a/common/amd64/quant-a.asm
+++ b/common/amd64/quant-a.asm
@@ -45,6 +45,10 @@ cglobal x264_quant_4x4_dc_core15_mmx
 cglobal x264_quant_4x4_core15_mmx
 cglobal x264_quant_8x8_core15_mmx
 
+cglobal x264_quant_4x4_dc_core15_ssse3
+cglobal x264_quant_4x4_core15_ssse3
+cglobal x264_quant_8x8_core15_ssse3
+
 cglobal x264_quant_2x2_dc_core16_mmxext
 cglobal x264_quant_4x4_dc_core16_mmxext
 cglobal x264_quant_4x4_core16_mmxext
@@ -76,6 +80,21 @@ cglobal x264_dequant_8x8_mmx
     punpckldq   mm7, mm7        ; f in each dword
 %endmacro
 
+%macro SSE2_QUANT_AC_START 0    ; out: xmm6 = i_qbits (low dword, used as shift count), xmm7 = f in all 4 dwords
+    movd       xmm6, parm3d     ; i_qbits
+    movd       xmm7, parm4d     ; f
+    pshufd     xmm7, xmm7, 0    ; f in each dword
+%endmacro
+
+%macro SSE2_QUANT15_DC_START 0  ; out: xmm5 = i_qmf in all 8 words, xmm6 = i_qbits (low dword), xmm7 = f in all 4 dwords
+    movd       xmm5, parm2d     ; i_qmf
+    movd       xmm6, parm3d     ; i_qbits
+    movd       xmm7, parm4d     ; f
+    pshuflw    xmm5, xmm5, 0    ; low word of i_qmf copied to the 4 low words
+    punpcklqdq xmm5, xmm5       ; i_qmf in each word
+    pshufd     xmm7, xmm7, 0    ; f in each dword
+%endmacro
+
 %macro MMX_QUANT15_1x4 4
 ;;; %1      (m64)       dct[y][x]
 ;;; %2      (m64/mmx)   quant_mf[y][x] or quant_mf[0][0] (as int16_t)
@@ -104,7 +123,30 @@ cglobal x264_dequant_8x8_mmx
     packssdw    mm0, mm1    ; pack
     pxor        mm0, mm4    ; restore sign
     psubw       mm0, mm4
-    movq        %1, mm0     ; store
+    movq         %1, mm0    ; store
+%endmacro
+
+%macro SSSE3_QUANT15_1x8 4  ; %1 = 8 dct words (m128, updated in place), %2 = 8 quant_mf words, %3 = i_qbits, %4 = f per dword; clobbers xmm0-2, xmm4
+    movdqa     xmm0, %1     ; load dct coeffs
+    movdqa     xmm4, xmm0   ; save sign
+    pabsw      xmm0, xmm0   ; work on magnitudes; the sign is reapplied by psignw below
+
+    movdqa     xmm2, xmm0
+    pmullw     xmm0, %2     ; low 16 bits of each |dct| * mf product
+    pmulhw     xmm2, %2     ; high 16 bits of the same (signed) product
+
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm2   ; interleave lo/hi words -> 32-bit products for coeffs 0-3
+    punpckhwd  xmm1, xmm2   ; ... and for coeffs 4-7
+
+    paddd      xmm0, %4     ; round with f
+    paddd      xmm1, %4
+    psrad      xmm0, %3     ; >> i_qbits
+    psrad      xmm1, %3
+
+    packssdw   xmm0, xmm1   ; pack
+    psignw     xmm0, xmm4   ; restore sign
+    movdqa       %1, xmm0   ; store
 %endmacro
 
 ALIGN 16
@@ -168,6 +210,52 @@ x264_quant_8x8_core15_mmx:
 
     ret
 
+%ifdef HAVE_SSE3
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void x264_quant_4x4_dc_core15_ssse3( int16_t dct[4][4],
+;       int const i_qmf, int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_4x4_dc_core15_ssse3:     ; one scalar i_qmf applied to all 16 coeffs, 8 words per macro call
+    SSE2_QUANT15_DC_START
+    SSSE3_QUANT15_1x8 [parm1q], xmm5, xmm6, xmm7
+    SSSE3_QUANT15_1x8 [parm1q+16], xmm5, xmm6, xmm7
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void x264_quant_4x4_core15_ssse3( int16_t dct[4][4],
+;       int const quant_mf[4][4], int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_4x4_core15_ssse3:        ; 2 iterations x 8 coeffs; movdqa requires 16-byte aligned dct and quant_mf
+    SSE2_QUANT_AC_START
+%assign x 0
+%rep 2
+    movdqa      xmm5, [parm2q+32*x]     ; 4 dword quant_mf entries
+    packssdw    xmm5, [parm2q+32*x+16]  ; + next 4, narrowed to words with signed saturation (x264_quant_init requires maxQ4 < 1<<15)
+    SSSE3_QUANT15_1x8 [parm1q+16*x], xmm5, xmm6, xmm7
+    %assign x x+1
+%endrep
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void x264_quant_8x8_core15_ssse3( int16_t dct[8][8],
+;       int const quant_mf[8][8], int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_8x8_core15_ssse3:        ; 8 iterations x 8 coeffs; movdqa requires 16-byte aligned dct and quant_mf
+    SSE2_QUANT_AC_START
+%assign x 0
+%rep 8
+    movdqa      xmm5, [parm2q+32*x]     ; 4 dword quant_mf entries
+    packssdw    xmm5, [parm2q+32*x+16]  ; + next 4, narrowed to words with signed saturation (x264_quant_init requires maxQ8 < 1<<15)
+    SSSE3_QUANT15_1x8 [parm1q+16*x], xmm5, xmm6, xmm7
+    %assign x x+1
+%endrep
+    ret
+%endif ; HAVE_SSE3
+
+
 ; ============================================================================
 
 %macro MMXEXT_QUANT16_DC_START 0
diff --git a/common/cpu.c b/common/cpu.c
index 09e20ea8..32c7cd3b 100644
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -80,6 +80,16 @@ uint32_t x264_cpu_detect( void )
         /* Is it OK ? */
         cpu |= X264_CPU_SSE2;
     }
+#ifdef HAVE_SSE3
+    if( (ecx&0x00000001) )
+    {
+        cpu |= X264_CPU_SSE3;
+    }
+    if( (ecx&0x00000200) )
+    {
+        cpu |= X264_CPU_SSSE3;
+    }
+#endif
 
     x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
     if( eax < 0x80000001 )
diff --git a/common/i386/pixel.h b/common/i386/pixel.h
index fb06cccf..c15459d9 100644
--- a/common/i386/pixel.h
+++ b/common/i386/pixel.h
@@ -87,9 +87,18 @@ int x264_pixel_satd_8x16_sse2( uint8_t *, int, uint8_t *, int );
 int x264_pixel_satd_8x8_sse2( uint8_t *, int, uint8_t *, int );
 int x264_pixel_satd_8x4_sse2( uint8_t *, int, uint8_t *, int );
 
+int x264_pixel_satd_16x16_ssse3( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_16x8_ssse3( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_8x16_ssse3( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_8x8_ssse3( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_8x4_ssse3( uint8_t *, int, uint8_t *, int );
+
 int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int );
 int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int );
 
+int x264_pixel_sa8d_16x16_ssse3( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sa8d_8x8_ssse3( uint8_t *, int, uint8_t *, int );
+
 void x264_intra_satd_x3_4x4_mmxext( uint8_t *, uint8_t *, int * );
 void x264_intra_satd_x3_8x8c_mmxext( uint8_t *, uint8_t *, int * );
 void x264_intra_satd_x3_16x16_mmxext( uint8_t *, uint8_t *, int * );
diff --git a/common/i386/quant.h b/common/i386/quant.h
index ec42f4e1..1d4b51d9 100644
--- a/common/i386/quant.h
+++ b/common/i386/quant.h
@@ -32,6 +32,13 @@ void x264_quant_4x4_dc_core15_mmx( int16_t dct[4][4],
 void x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2],
     int const i_qmf, int const i_qbits, int const f );
 
+void x264_quant_8x8_core15_ssse3( int16_t dct[8][8],
+    int quant_mf[8][8], int const i_qbits, int const f );
+void x264_quant_4x4_core15_ssse3( int16_t dct[4][4],
+    int quant_mf[4][4], int const i_qbits, int const f );
+void x264_quant_4x4_dc_core15_ssse3( int16_t dct[4][4],
+    int const i_qmf, int const i_qbits, int const f );
+
 void x264_quant_8x8_core16_mmxext( int16_t dct[8][8],
     int quant_mf[8][8], int const i_qbits, int const f );
 void x264_quant_4x4_core16_mmxext( int16_t dct[4][4],
diff --git a/common/pixel.c b/common/pixel.c
index 365266a0..51c52c05 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -536,6 +536,19 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
         pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_sse2;
         pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
+#endif
+    }
+
+    if( cpu&X264_CPU_SSSE3 )
+    {
+#if defined(ARCH_X86_64) && defined(HAVE_SSE3)
+        pixf->satd[PIXEL_16x16]= x264_pixel_satd_16x16_ssse3;
+        pixf->satd[PIXEL_16x8] = x264_pixel_satd_16x8_ssse3;
+        pixf->satd[PIXEL_8x16] = x264_pixel_satd_8x16_ssse3;
+        pixf->satd[PIXEL_8x8]  = x264_pixel_satd_8x8_ssse3;
+        pixf->satd[PIXEL_8x4]  = x264_pixel_satd_8x4_ssse3;
+        pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
+        pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_ssse3;
 #endif
     }
 #endif
diff --git a/common/quant.c b/common/quant.c
index e7bd48cc..ad25824c 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -229,6 +229,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
 #ifdef HAVE_MMXEXT
 
     /* select quant_8x8 based on CPU and maxQ8 */
+#if defined(ARCH_X86_64) && defined(HAVE_SSE3)
+    if( maxQ8 < (1<<15) && cpu&X264_CPU_SSSE3 )
+        pf->quant_8x8_core = x264_quant_8x8_core15_ssse3;
+    else
+#endif
     if( maxQ8 < (1<<15) && cpu&X264_CPU_MMX )
         pf->quant_8x8_core = x264_quant_8x8_core15_mmx;
     else
@@ -239,6 +244,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->quant_8x8_core = x264_quant_8x8_core32_mmxext;
 
     /* select quant_4x4 based on CPU and maxQ4 */
+#if defined(ARCH_X86_64) && defined(HAVE_SSE3)
+    if( maxQ4 < (1<<15) && cpu&X264_CPU_SSSE3 )
+        pf->quant_4x4_core = x264_quant_4x4_core15_ssse3;
+    else
+#endif
     if( maxQ4 < (1<<15) && cpu&X264_CPU_MMX )
         pf->quant_4x4_core = x264_quant_4x4_core15_mmx;
     else
@@ -267,6 +277,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->quant_2x2_dc_core = x264_quant_2x2_dc_core32_mmxext;
     }
 
+#if defined(ARCH_X86_64) && defined(HAVE_SSE3)
+    if( maxQdc < (1<<15) && cpu&X264_CPU_SSSE3 )
+        pf->quant_4x4_dc_core = x264_quant_4x4_dc_core15_ssse3;
+#endif
+
     if( cpu&X264_CPU_MMX )
     {
         /* dequant is not subject to the above CQM-dependent overflow issues,
diff --git a/configure b/configure
index 4b013b09..b903fdd3 100755
--- a/configure
+++ b/configure
@@ -35,6 +35,15 @@ EOF
     return $TMP
 }
 
+as_check() {    # $1: assembly source text to try, $2: extra assembler flags; returns the assembler's exit status
+    rm -f conftest*
+    echo "$1" > conftest.asm
+    $AS conftest.asm $ASFLAGS $2 -o conftest.o 2>$DEVNULL
+    TMP="$?"    # saved so the cleanup below does not overwrite the result
+    rm -f conftest*
+    return $TMP
+}
+
 rm -f config.h config.mak x264.pc
 
 prefix='/usr/local'
@@ -210,6 +219,17 @@ then
   fi
 fi
 
+if [ $ARCH = X86_64 ] ; then
+    if ! as_check ; then    # no arguments: assembles an empty file, i.e. only checks that $AS runs at all
+        echo "No assembler. Please install yasm."
+        exit 1
+    fi
+    if as_check "pabsw xmm0, xmm0" ; then    # pabsw is an SSSE3 instruction, so HAVE_SSE3 really means "assembler knows SSSE3"
+        ASFLAGS="$ASFLAGS -DHAVE_SSE3"
+        CFLAGS="$CFLAGS -DHAVE_SSE3"
+    fi
+fi
+
 CFLAGS="$CFLAGS -DARCH_$ARCH -DSYS_$SYS"
 
 # parse options
diff --git a/encoder/encoder.c b/encoder/encoder.c
index e247e1e3..a5eacff2 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -671,6 +671,7 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
              param->cpu&X264_CPU_MMXEXT ? "MMXEXT " : "",
              param->cpu&X264_CPU_SSE ? "SSE " : "",
              param->cpu&X264_CPU_SSE2 ? "SSE2 " : "",
+             param->cpu&X264_CPU_SSSE3 ? "SSSE3 " : "",
              param->cpu&X264_CPU_3DNOW ? "3DNow! " : "",
              param->cpu&X264_CPU_ALTIVEC ? "Altivec " : "" );
 
diff --git a/tools/checkasm.c b/tools/checkasm.c
index d64fe502..988ff502 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -738,6 +738,7 @@ int check_all( int cpu_ref, int cpu_new )
 int main(int argc, char *argv[])
 {
     int ret = 0;
+    int cpu0 = 0, cpu1 = 0;
     int i;
 
     buf1 = x264_malloc( 1024 ); /* 32 x 32 */
@@ -759,13 +760,23 @@ int main(int argc, char *argv[])
 
 #ifdef HAVE_MMXEXT
     fprintf( stderr, "x264: MMXEXT against C\n" );
-    ret = check_all( 0, X264_CPU_MMX | X264_CPU_MMXEXT );
+    cpu1 = X264_CPU_MMX | X264_CPU_MMXEXT;
+    ret = check_all( 0, cpu1 );
 #ifdef HAVE_SSE2
     if( x264_cpu_detect() & X264_CPU_SSE2 )
     {
         fprintf( stderr, "\nx264: SSE2 against C\n" );
-        ret |= check_all( X264_CPU_MMX | X264_CPU_MMXEXT,
-                          X264_CPU_MMX | X264_CPU_MMXEXT | X264_CPU_SSE | X264_CPU_SSE2 );
+        cpu0 = cpu1;
+        cpu1 |= X264_CPU_SSE | X264_CPU_SSE2;
+        ret |= check_all( cpu0, cpu1 );
+
+        if( x264_cpu_detect() & X264_CPU_SSSE3 )
+        {
+            fprintf( stderr, "\nx264: SSSE3 against C\n" );
+            cpu0 = cpu1;
+            cpu1 |= X264_CPU_SSE3 | X264_CPU_SSSE3;
+            ret |= check_all( cpu0, cpu1 );
+        }
     }
 #endif
 #elif ARCH_PPC
diff --git a/x264.h b/x264.h
index d2897d84..cbddbee7 100644
--- a/x264.h
+++ b/x264.h
@@ -53,6 +53,8 @@ typedef struct x264_t x264_t;
 #define X264_CPU_3DNOW      0x000010    /* 3dnow! */
 #define X264_CPU_3DNOWEXT   0x000020    /* 3dnow! ext */
 #define X264_CPU_ALTIVEC    0x000040    /* altivec */
+#define X264_CPU_SSE3       0x000080    /* sse 3 */
+#define X264_CPU_SSSE3      0x000100    /* ssse 3 */
 
 /* Analyse flags
  */
-- 
2.40.0