granicus.if.org Git - libx264/commitdiff
ssse3 detection. x86_64 ssse3 satd and quant.
author Loren Merritt <pengvado@videolan.org>
Wed, 14 Mar 2007 21:11:11 +0000 (21:11 +0000)
committer Loren Merritt <pengvado@videolan.org>
Wed, 14 Mar 2007 21:11:11 +0000 (21:11 +0000)
requires yasm >= 0.6.0

git-svn-id: svn://svn.videolan.org/x264/trunk@631 df754926-b1dd-0310-bc7b-ec298dee348c

common/amd64/pixel-sse2.asm
common/amd64/quant-a.asm
common/cpu.c
common/i386/pixel.h
common/i386/quant.h
common/pixel.c
common/quant.c
configure
encoder/encoder.c
tools/checkasm.c
x264.h

index 9fc3755cfa066d3181d4b3a24813666933d244f0..57a1a910285e58def9ca692e3ed99ee2073eb1ee 100644 (file)
@@ -50,8 +50,15 @@ cglobal x264_pixel_satd_8x8_sse2
 cglobal x264_pixel_satd_16x8_sse2
 cglobal x264_pixel_satd_8x16_sse2
 cglobal x264_pixel_satd_16x16_sse2
+cglobal x264_pixel_satd_8x4_ssse3
+cglobal x264_pixel_satd_8x8_ssse3
+cglobal x264_pixel_satd_16x8_ssse3
+cglobal x264_pixel_satd_8x16_ssse3
+cglobal x264_pixel_satd_16x16_ssse3
 cglobal x264_pixel_sa8d_8x8_sse2
 cglobal x264_pixel_sa8d_16x16_sse2
+cglobal x264_pixel_sa8d_8x8_ssse3
+cglobal x264_pixel_sa8d_16x16_ssse3
 cglobal x264_intra_sa8d_x3_8x8_core_sse2
 cglobal x264_pixel_ssim_4x4x2_core_sse2
 cglobal x264_pixel_ssim_end4_sse2
@@ -267,6 +274,20 @@ x264_pixel_ssd_16x8_sse2:
     SUMSUB_BADC %5, %6, %7, %8
 %endmacro
 
+;;; row transform not used, because phaddw is much slower than paddw on a Conroe
+;%macro PHSUMSUB 3
+;    movdqa  %3, %1
+;    phaddw  %1, %2
+;    phsubw  %3, %2
+;%endmacro
+
+;%macro HADAMARD4x1_SSSE3 5  ; ABCD-T -> ADTC
+;    PHSUMSUB    %1, %2, %5
+;    PHSUMSUB    %3, %4, %2
+;    PHSUMSUB    %1, %3, %4
+;    PHSUMSUB    %5, %2, %3
+;%endmacro
+
 %macro SBUTTERFLY 5
     mov%1       %5, %3
     punpckl%2   %3, %4
@@ -318,6 +339,13 @@ x264_pixel_ssd_16x8_sse2:
     psubw       %1, %2
 %endmacro
 
+%macro LOAD_DIFF_4x8P 6 ; 4x dest, 2x temp -- four LOAD_DIFF_8P invocations; parm1q/parm3q themselves are not advanced here
+    LOAD_DIFF_8P %1, %5, [parm1q],          [parm3q]            ; offset 0, temp %5
+    LOAD_DIFF_8P %2, %6, [parm1q+parm2q],   [parm3q+parm4q]     ; offset 1*stride, temp %6
+    LOAD_DIFF_8P %3, %5, [parm1q+2*parm2q], [parm3q+2*parm4q]   ; offset 2*stride, temp %5 reused
+    LOAD_DIFF_8P %4, %6, [parm1q+r10],      [parm3q+r11]        ; r10/r11 must be preloaded by the caller (sa8d_8x8 sets them to 3*parm2q / 3*parm4q)
+%endmacro
+
 %macro SUM1x8_SSE2 3    ; 01 junk sum
     pxor    %2, %2
     psubw   %2, %1
@@ -338,8 +366,7 @@ x264_pixel_ssd_16x8_sse2:
     paddusw %4, %2
 %endmacro
 
-;;; two SUM4x4_SSE2 running side-by-side
-%macro SUM4x4_TWO_SSE2 7    ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6) sum
+%macro SUM8x4_SSE2 7    ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6) sum
     pxor    %3, %3
     pxor    %6, %6
     psubw   %3, %1
@@ -358,18 +385,25 @@ x264_pixel_ssd_16x8_sse2:
     paddusw %7, %4
 %endmacro
 
-%macro SATD_TWO_SSE2 0
-    LOAD_DIFF_8P xmm0, xmm4, [parm1q],          [parm3q]
-    LOAD_DIFF_8P xmm1, xmm5, [parm1q+parm2q],   [parm3q+parm4q]
-    LOAD_DIFF_8P xmm2, xmm4, [parm1q+2*parm2q], [parm3q+2*parm4q]
-    LOAD_DIFF_8P xmm3, xmm5, [parm1q+r10],      [parm3q+r11]
-    lea          parm1q, [parm1q+4*parm2q]
-    lea          parm3q, [parm3q+4*parm4q]
+%macro SUM8x4_SSSE3 7    ; a02 a13 . b02 b13 . sum -- %3 and %6 are never referenced; the 7-argument list mirrors SUM8x4_SSE2
+    pabsw   %1, %1          ; per-word absolute value, in place (inputs %1,%2,%4,%5 are overwritten)
+    pabsw   %2, %2
+    pabsw   %4, %4
+    pabsw   %5, %5
+    paddusw %1, %2          ; %1 = |%1| + |%2|, unsigned saturating
+    paddusw %4, %5          ; %4 = |%4| + |%5|, unsigned saturating
+    paddusw %7, %1          ; accumulate both partial sums into the running total %7
+    paddusw %7, %4
+%endmacro
 
+%macro SATD_TWO_SSE2 0
+    LOAD_DIFF_4x8P    xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+    lea     parm1q, [parm1q+4*parm2q]
+    lea     parm3q, [parm3q+4*parm4q]
     HADAMARD1x4       xmm0, xmm1, xmm2, xmm3
     TRANSPOSE2x4x4W   xmm0, xmm1, xmm2, xmm3, xmm4
     HADAMARD1x4       xmm0, xmm1, xmm2, xmm3
-    SUM4x4_TWO_SSE2   xmm0, xmm1, xmm4, xmm2, xmm3, xmm5, xmm6
+    SUM8x4            xmm0, xmm1, xmm4, xmm2, xmm3, xmm5, xmm6
 %endmacro
 
 %macro SATD_START 0
@@ -385,85 +419,72 @@ x264_pixel_ssd_16x8_sse2:
     ret
 %endmacro
 
+%macro SATDS 1
 ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   int x264_pixel_satd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_16x16_sse2:
+x264_pixel_satd_16x16_%1:
     SATD_START
     mov     r8,  rdi
     mov     r9,  rdx
-
     SATD_TWO_SSE2
     SATD_TWO_SSE2
     SATD_TWO_SSE2
     SATD_TWO_SSE2
-
     lea     rdi, [r8+8]
     lea     rdx, [r9+8]
-
     SATD_TWO_SSE2
     SATD_TWO_SSE2
     SATD_TWO_SSE2
     SATD_TWO_SSE2
-
     SATD_END
 
 ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   int x264_pixel_satd_8x16_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_8x16_sse2:
+x264_pixel_satd_8x16_%1:
     SATD_START
-
     SATD_TWO_SSE2
     SATD_TWO_SSE2
     SATD_TWO_SSE2
     SATD_TWO_SSE2
-
     SATD_END
 
 ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   int x264_pixel_satd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_16x8_sse2:
+x264_pixel_satd_16x8_%1:
     SATD_START
     mov     r8,  rdi
     mov     r9,  rdx
-
     SATD_TWO_SSE2
     SATD_TWO_SSE2
-
     lea     rdi, [r8+8]
     lea     rdx, [r9+8]
-
     SATD_TWO_SSE2
     SATD_TWO_SSE2
-
     SATD_END
 
 ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   int x264_pixel_satd_8x8_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_8x8_sse2:
+x264_pixel_satd_8x8_%1:
     SATD_START
-
     SATD_TWO_SSE2
     SATD_TWO_SSE2
-
     SATD_END
 
 ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_8x4_sse2:
+x264_pixel_satd_8x4_%1:
     SATD_START
-
     SATD_TWO_SSE2
-
     SATD_END
 
 
@@ -471,27 +492,21 @@ ALIGN 16
 ;-----------------------------------------------------------------------------
 ;   int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_sa8d_8x8_sse2:
+x264_pixel_sa8d_8x8_%1:
     lea  r10, [3*parm2q]
     lea  r11, [3*parm4q]
-    LOAD_DIFF_8P xmm0, xmm8, [parm1q],          [parm3q]
-    LOAD_DIFF_8P xmm1, xmm9, [parm1q+parm2q],   [parm3q+parm4q]
-    LOAD_DIFF_8P xmm2, xmm8, [parm1q+2*parm2q], [parm3q+2*parm4q]
-    LOAD_DIFF_8P xmm3, xmm9, [parm1q+r10],      [parm3q+r11]
+    LOAD_DIFF_4x8P xmm0, xmm1, xmm2, xmm3, xmm8, xmm8
     lea  parm1q, [parm1q+4*parm2q]
     lea  parm3q, [parm3q+4*parm4q]
-    LOAD_DIFF_8P xmm4, xmm8, [parm1q],          [parm3q]
-    LOAD_DIFF_8P xmm5, xmm9, [parm1q+parm2q],   [parm3q+parm4q]
-    LOAD_DIFF_8P xmm6, xmm8, [parm1q+2*parm2q], [parm3q+2*parm4q]
-    LOAD_DIFF_8P xmm7, xmm9, [parm1q+r10],      [parm3q+r11]
-    
+    LOAD_DIFF_4x8P xmm4, xmm5, xmm6, xmm7, xmm8, xmm8
+
     HADAMARD1x8  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
     TRANSPOSE8x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
     HADAMARD1x8  xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1
 
     pxor            xmm10, xmm10
-    SUM4x4_TWO_SSE2 xmm0, xmm1, xmm6, xmm2, xmm3, xmm9, xmm10
-    SUM4x4_TWO_SSE2 xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10
+    SUM8x4          xmm0, xmm1, xmm6, xmm2, xmm3, xmm9, xmm10
+    SUM8x4          xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10
     psrlw           xmm10, 1
     HADDW           xmm10, xmm0
     movd eax, xmm10
@@ -505,26 +520,34 @@ ALIGN 16
 ;   int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
 ;; violates calling convention
-x264_pixel_sa8d_16x16_sse2:
+x264_pixel_sa8d_16x16_%1:
     xor  r8d, r8d
-    call x264_pixel_sa8d_8x8_sse2 ; pix[0]
+    call x264_pixel_sa8d_8x8_%1 ; pix[0]
     lea  parm1q, [parm1q+4*parm2q]
     lea  parm3q, [parm3q+4*parm4q]
-    call x264_pixel_sa8d_8x8_sse2 ; pix[8*stride]
+    call x264_pixel_sa8d_8x8_%1 ; pix[8*stride]
     lea  r10, [3*parm2q-2]
     lea  r11, [3*parm4q-2]
     shl  r10, 2
     shl  r11, 2
     sub  parm1q, r10
     sub  parm3q, r11
-    call x264_pixel_sa8d_8x8_sse2 ; pix[8]
+    call x264_pixel_sa8d_8x8_%1 ; pix[8]
     lea  parm1q, [parm1q+4*parm2q]
     lea  parm3q, [parm3q+4*parm4q]
-    call x264_pixel_sa8d_8x8_sse2 ; pix[8*stride+8]
+    call x264_pixel_sa8d_8x8_%1 ; pix[8*stride+8]
     mov  eax, r8d
     add  eax, 1
     shr  eax, 1
     ret
+%endmacro ; SATDS
+
+%define SUM8x4 SUM8x4_SSE2     ; SUM8x4 is the name invoked by SATD_TWO_SSE2 and sa8d_8x8; bind it before each SATDS expansion
+SATDS sse2                     ; emits the x264_pixel_satd_*/sa8d_* labels with the _sse2 suffix
+%ifdef HAVE_SSE3               ; defined by configure when the assembler accepts pabsw (an SSSE3 instruction, despite the name)
+%define SUM8x4 SUM8x4_SSSE3    ; rebind to the pabsw-based variant
+SATDS ssse3                    ; second expansion of the same bodies, labels suffixed _ssse3
+%endif
 
 
 
@@ -567,7 +590,7 @@ x264_intra_sa8d_x3_8x8_core_sse2:
     movdqa      xmm9, xmm3
     movdqa      xmm10, xmm4
     movdqa      xmm11, xmm5
-    SUM4x4_TWO_SSE2 xmm8, xmm9, xmm12, xmm10, xmm11, xmm13, xmm15
+    SUM8x4_SSE2 xmm8, xmm9, xmm12, xmm10, xmm11, xmm13, xmm15
     movdqa      xmm8, xmm6
     movdqa      xmm9, xmm7
     SUM4x4_SSE2 xmm8, xmm9, xmm10, xmm15
index fcb5db1ac0ce6e7605a0a88ccefeebb246e36741..ba7058deb4359cf7820b3daafda445c73fbc1b0c 100644 (file)
@@ -45,6 +45,10 @@ cglobal x264_quant_4x4_dc_core15_mmx
 cglobal x264_quant_4x4_core15_mmx
 cglobal x264_quant_8x8_core15_mmx
 
+cglobal x264_quant_4x4_dc_core15_ssse3
+cglobal x264_quant_4x4_core15_ssse3
+cglobal x264_quant_8x8_core15_ssse3
+
 cglobal x264_quant_2x2_dc_core16_mmxext
 cglobal x264_quant_4x4_dc_core16_mmxext
 cglobal x264_quant_4x4_core16_mmxext
@@ -76,6 +80,21 @@ cglobal x264_dequant_8x8_mmx
     punpckldq   mm7, mm7        ; f in each dword
 %endmacro
 
+%macro SSE2_QUANT_AC_START 0    ; prologue for the AC quant routines: leaves xmm6 = i_qbits, xmm7 = f broadcast
+    movd       xmm6, parm3d     ; i_qbits (later passed as the psrad shift count)
+    movd       xmm7, parm4d     ; f
+    pshufd     xmm7, xmm7, 0    ; f in each dword
+%endmacro
+
+%macro SSE2_QUANT15_DC_START 0  ; prologue for the DC quant routine: leaves xmm5 = i_qmf words, xmm6 = i_qbits, xmm7 = f dwords
+    movd       xmm5, parm2d     ; i_qmf
+    movd       xmm6, parm3d     ; i_qbits
+    movd       xmm7, parm4d     ; f
+    pshuflw    xmm5, xmm5, 0    ; copy word 0 into the four low words
+    punpcklqdq xmm5, xmm5       ; i_qmf in each word
+    pshufd     xmm7, xmm7, 0    ; f in each dword
+%endmacro
+
 %macro MMX_QUANT15_1x4 4
 ;;; %1      (m64)       dct[y][x]
 ;;; %2      (m64/mmx)   quant_mf[y][x] or quant_mf[0][0] (as int16_t)
@@ -104,7 +123,30 @@ cglobal x264_dequant_8x8_mmx
     packssdw    mm0, mm1    ; pack
     pxor        mm0, mm4    ; restore sign
     psubw       mm0, mm4
-    movq        %1, mm0     ; store
+    movq         %1, mm0    ; store
+%endmacro
+
+%macro SSSE3_QUANT15_1x8 4  ; %1 = 8 int16 coeffs (read, then overwritten), %2 = 8 word multipliers, %3 = shift count, %4 = rounding dwords; clobbers xmm0, xmm1, xmm2, xmm4
+    movdqa     xmm0, %1     ; load dct coeffs
+    movdqa     xmm4, xmm0   ; save sign
+    pabsw      xmm0, xmm0
+
+    movdqa     xmm2, xmm0   ; second copy of |coeff| for the high half of the product
+    pmullw     xmm0, %2     ; low 16 bits of |coeff| * multiplier
+    pmulhw     xmm2, %2     ; high 16 bits (signed multiply)
+
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm2   ; interleave low/high words -> 32-bit products 0..3
+    punpckhwd  xmm1, xmm2   ; 32-bit products 4..7
+
+    paddd      xmm0, %4     ; round with f
+    paddd      xmm1, %4
+    psrad      xmm0, %3     ; arithmetic shift right by the count in %3
+    psrad      xmm1, %3
+
+    packssdw   xmm0, xmm1   ; pack
+    psignw     xmm0, xmm4   ; restore sign
+    movdqa       %1, xmm0   ; store
 %endmacro
 
 ALIGN 16
@@ -168,6 +210,52 @@ x264_quant_8x8_core15_mmx:
 
     ret
 
+%ifdef HAVE_SSE3
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void x264_quant_4x4_dc_core15_ssse3( int16_t dct[4][4],
+;       int const i_qmf, int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_4x4_dc_core15_ssse3:
+    SSE2_QUANT15_DC_START                           ; xmm5 = i_qmf in every word, xmm6 = i_qbits, xmm7 = f in every dword
+    SSSE3_QUANT15_1x8 [parm1q], xmm5, xmm6, xmm7    ; bytes 0..15 of dct: coefficients 0..7, quantized in place
+    SSSE3_QUANT15_1x8 [parm1q+16], xmm5, xmm6, xmm7 ; bytes 16..31 of dct: coefficients 8..15
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void x264_quant_4x4_core15_ssse3( int16_t dct[4][4],
+;       int const quant_mf[4][4], int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_4x4_core15_ssse3:
+    SSE2_QUANT_AC_START                     ; xmm6 = i_qbits, xmm7 = f in every dword
+%assign x 0
+%rep 2                                      ; 2 iterations x 8 coefficients = the 16 entries of dct
+    movdqa      xmm5, [parm2q+32*x]         ; four 32-bit quant_mf entries
+    packssdw    xmm5, [parm2q+32*x+16]      ; next four; all eight narrowed to words with signed saturation
+    SSSE3_QUANT15_1x8 [parm1q+16*x], xmm5, xmm6, xmm7
+    %assign x x+1
+%endrep
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   void x264_quant_8x8_core15_ssse3( int16_t dct[8][8],
+;       int const quant_mf[8][8], int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_8x8_core15_ssse3:
+    SSE2_QUANT_AC_START                     ; xmm6 = i_qbits, xmm7 = f in every dword
+%assign x 0
+%rep 8                                      ; 8 iterations x 8 coefficients = the 64 entries of dct
+    movdqa      xmm5, [parm2q+32*x]         ; four 32-bit quant_mf entries
+    packssdw    xmm5, [parm2q+32*x+16]      ; next four; all eight narrowed to words with signed saturation
+    SSSE3_QUANT15_1x8 [parm1q+16*x], xmm5, xmm6, xmm7
+    %assign x x+1
+%endrep
+    ret
+%endif ; HAVE_SSE3
+
+
 ; ============================================================================
 
 %macro MMXEXT_QUANT16_DC_START 0
index 09e20ea86b1c920654186923f0e2f2c23541a3a9..32c7cd3b7f2ce5a40aff5573f0f5370ee7e9bf3b 100644 (file)
@@ -80,6 +80,16 @@ uint32_t x264_cpu_detect( void )
         /* Is it OK ? */
         cpu |= X264_CPU_SSE2;
     }
+#ifdef HAVE_SSE3
+    if( (ecx&0x00000001) )
+    {
+        cpu |= X264_CPU_SSE3;
+    }
+    if( (ecx&0x00000200) )
+    {
+        cpu |= X264_CPU_SSSE3;
+    }
+#endif
 
     x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
     if( eax < 0x80000001 )
index fb06cccf1b4dabd5276149e20bafb826925a8604..c15459d9a6b29bdd57aaf3687327f15bf83cb6e0 100644 (file)
@@ -87,9 +87,18 @@ int x264_pixel_satd_8x16_sse2( uint8_t *, int, uint8_t *, int );
 int x264_pixel_satd_8x8_sse2( uint8_t *, int, uint8_t *, int );
 int x264_pixel_satd_8x4_sse2( uint8_t *, int, uint8_t *, int );
 
+int x264_pixel_satd_16x16_ssse3( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_16x8_ssse3( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_8x16_ssse3( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_8x8_ssse3( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_8x4_ssse3( uint8_t *, int, uint8_t *, int );
+
 int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int );
 int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int );
 
+int x264_pixel_sa8d_16x16_ssse3( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sa8d_8x8_ssse3( uint8_t *, int, uint8_t *, int );
+
 void x264_intra_satd_x3_4x4_mmxext( uint8_t *, uint8_t *, int * );
 void x264_intra_satd_x3_8x8c_mmxext( uint8_t *, uint8_t *, int * );
 void x264_intra_satd_x3_16x16_mmxext( uint8_t *, uint8_t *, int * );
index ec42f4e18a2a8779813b10ed76d0c1ad76936d84..1d4b51d9fde55226e398eae57bca2be2f5f0b1c7 100644 (file)
@@ -32,6 +32,13 @@ void x264_quant_4x4_dc_core15_mmx( int16_t dct[4][4],
 void x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2],
     int const i_qmf, int const i_qbits, int const f );
 
+void x264_quant_8x8_core15_ssse3( int16_t dct[8][8],
+    int quant_mf[8][8], int const i_qbits, int const f );
+void x264_quant_4x4_core15_ssse3( int16_t dct[4][4],
+    int quant_mf[4][4], int const i_qbits, int const f );
+void x264_quant_4x4_dc_core15_ssse3( int16_t dct[4][4],
+    int const i_qmf, int const i_qbits, int const f );
+
 void x264_quant_8x8_core16_mmxext( int16_t dct[8][8],
     int quant_mf[8][8], int const i_qbits, int const f );
 void x264_quant_4x4_core16_mmxext( int16_t dct[4][4],
index 365266a08497c453c15d781be91c6a648c588894..51c52c059d2e8988f65c50c4fc230b4f8c7a5bc5 100644 (file)
@@ -536,6 +536,19 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
         pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_sse2;
         pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
+#endif
+    }
+
+    if( cpu&X264_CPU_SSSE3 )
+    {
+#if defined(ARCH_X86_64) && defined(HAVE_SSE3)
+        pixf->satd[PIXEL_16x16]= x264_pixel_satd_16x16_ssse3;
+        pixf->satd[PIXEL_16x8] = x264_pixel_satd_16x8_ssse3;
+        pixf->satd[PIXEL_8x16] = x264_pixel_satd_8x16_ssse3;
+        pixf->satd[PIXEL_8x8]  = x264_pixel_satd_8x8_ssse3;
+        pixf->satd[PIXEL_8x4]  = x264_pixel_satd_8x4_ssse3;
+        pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
+        pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_ssse3;
 #endif
     }
 #endif
index e7bd48ccdb24844201f4ef19bd5b8239ca00c8c6..ad25824cad57b547dd966660dc6eb858645bdd7a 100644 (file)
@@ -229,6 +229,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
 #ifdef HAVE_MMXEXT
 
     /* select quant_8x8 based on CPU and maxQ8 */
+#if defined(ARCH_X86_64) && defined(HAVE_SSE3)
+    if( maxQ8 < (1<<15) && cpu&X264_CPU_SSSE3 )
+        pf->quant_8x8_core = x264_quant_8x8_core15_ssse3;
+    else
+#endif
     if( maxQ8 < (1<<15) && cpu&X264_CPU_MMX )
         pf->quant_8x8_core = x264_quant_8x8_core15_mmx;
     else
@@ -239,6 +244,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->quant_8x8_core = x264_quant_8x8_core32_mmxext;
 
     /* select quant_4x4 based on CPU and maxQ4 */
+#if defined(ARCH_X86_64) && defined(HAVE_SSE3)
+    if( maxQ4 < (1<<15) && cpu&X264_CPU_SSSE3 )
+        pf->quant_4x4_core = x264_quant_4x4_core15_ssse3;
+    else
+#endif
     if( maxQ4 < (1<<15) && cpu&X264_CPU_MMX )
         pf->quant_4x4_core = x264_quant_4x4_core15_mmx;
     else
@@ -267,6 +277,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->quant_2x2_dc_core = x264_quant_2x2_dc_core32_mmxext;
     }
 
+#if defined(ARCH_X86_64) && defined(HAVE_SSE3)
+    if( maxQdc < (1<<15) && cpu&X264_CPU_SSSE3 )
+        pf->quant_4x4_dc_core = x264_quant_4x4_dc_core15_ssse3;
+#endif
+
     if( cpu&X264_CPU_MMX )
     {
         /* dequant is not subject to the above CQM-dependent overflow issues,
index 4b013b09667cb1f626b7c6bad390bb0a8a999bd1..b903fdd3d5e6fe2997a8d24ed1669ac8f7a882d6 100755 (executable)
--- a/configure
+++ b/configure
@@ -35,6 +35,15 @@ EOF
     return $TMP
 }
 
+as_check() {  # assemble the one-line source $1 with $AS (extra flags in $2); returns the assembler's exit status
+    rm -f conftest*  # remove leftovers from earlier probes
+    echo "$1" > conftest.asm
+    $AS conftest.asm $ASFLAGS $2 -o conftest.o 2>$DEVNULL  # assembler diagnostics are discarded
+    TMP="$?"  # capture the status before the cleanup below overwrites $?
+    rm -f conftest*
+    return $TMP
+}
+
 rm -f config.h config.mak x264.pc
 
 prefix='/usr/local'
@@ -210,6 +219,17 @@ then
   fi
 fi
 
+if [ $ARCH = X86_64 ] ; then
+    if ! as_check ; then
+        echo "No assembler. Please install yasm."
+        exit 1
+    fi
+    if as_check "pabsw xmm0, xmm0" ; then
+        ASFLAGS="$ASFLAGS -DHAVE_SSE3"
+        CFLAGS="$CFLAGS -DHAVE_SSE3"
+    fi
+fi
+
 CFLAGS="$CFLAGS -DARCH_$ARCH -DSYS_$SYS"
 
 # parse options
index e247e1e36558f9e7b7161b57d08c92f24f6f79aa..a5eacff20c695c3c15087ebecfb7517e7ab81c03 100644 (file)
@@ -671,6 +671,7 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
              param->cpu&X264_CPU_MMXEXT ? "MMXEXT " : "",
              param->cpu&X264_CPU_SSE ? "SSE " : "",
              param->cpu&X264_CPU_SSE2 ? "SSE2 " : "",
+             param->cpu&X264_CPU_SSSE3 ? "SSSE3 " : "",
              param->cpu&X264_CPU_3DNOW ? "3DNow! " : "",
              param->cpu&X264_CPU_ALTIVEC ? "Altivec " : "" );
 
index d64fe50255297ec248f1d3df1a9fb0258eb87640..988ff5022eea836c2b3b53a7e48aeee5d317084f 100644 (file)
@@ -738,6 +738,7 @@ int check_all( int cpu_ref, int cpu_new )
 int main(int argc, char *argv[])
 {
     int ret = 0;
+    int cpu0 = 0, cpu1 = 0;
     int i;
 
     buf1 = x264_malloc( 1024 ); /* 32 x 32 */
@@ -759,13 +760,23 @@ int main(int argc, char *argv[])
 
 #ifdef HAVE_MMXEXT
     fprintf( stderr, "x264: MMXEXT against C\n" );
-    ret = check_all( 0, X264_CPU_MMX | X264_CPU_MMXEXT );
+    cpu1 = X264_CPU_MMX | X264_CPU_MMXEXT;
+    ret = check_all( 0, cpu1 );
 #ifdef HAVE_SSE2
     if( x264_cpu_detect() & X264_CPU_SSE2 )
     {
         fprintf( stderr, "\nx264: SSE2 against C\n" );
-        ret |= check_all( X264_CPU_MMX | X264_CPU_MMXEXT,
-                          X264_CPU_MMX | X264_CPU_MMXEXT | X264_CPU_SSE | X264_CPU_SSE2 );
+        cpu0 = cpu1;
+        cpu1 |= X264_CPU_SSE | X264_CPU_SSE2;
+        ret |= check_all( cpu0, cpu1 );
+
+        if( x264_cpu_detect() & X264_CPU_SSSE3 )
+        {
+            fprintf( stderr, "\nx264: SSSE3 against C\n" );
+            cpu0 = cpu1;
+            cpu1 |= X264_CPU_SSE3 | X264_CPU_SSSE3;
+            ret |= check_all( cpu0, cpu1 );
+        }
     }
 #endif
 #elif ARCH_PPC
diff --git a/x264.h b/x264.h
index d2897d8479b9b13c630d4e048d7ebb7fab07f697..cbddbee7227776a03bb702895d7207fc6502e5c9 100644 (file)
--- a/x264.h
+++ b/x264.h
@@ -53,6 +53,8 @@ typedef struct x264_t x264_t;
 #define X264_CPU_3DNOW      0x000010    /* 3dnow! */
 #define X264_CPU_3DNOWEXT   0x000020    /* 3dnow! ext */
 #define X264_CPU_ALTIVEC    0x000040    /* altivec */
+#define X264_CPU_SSE3       0x000080    /* sse 3 */
+#define X264_CPU_SSSE3      0x000100    /* ssse 3 */
 
 /* Analyse flags
  */