From 8aef0e941d986f10427cc2d3a848162065bdef3a Mon Sep 17 00:00:00 2001 From: Loren Merritt Date: Wed, 14 Mar 2007 21:11:11 +0000 Subject: [PATCH] ssse3 detection. x86_64 ssse3 satd and quant. requires yasm >= 0.6.0 git-svn-id: svn://svn.videolan.org/x264/trunk@631 df754926-b1dd-0310-bc7b-ec298dee348c --- common/amd64/pixel-sse2.asm | 117 +++++++++++++++++++++--------------- common/amd64/quant-a.asm | 90 ++++++++++++++++++++++++++- common/cpu.c | 10 +++ common/i386/pixel.h | 9 +++ common/i386/quant.h | 7 +++ common/pixel.c | 13 ++++ common/quant.c | 15 +++++ configure | 20 ++++++ encoder/encoder.c | 1 + tools/checkasm.c | 17 +++++- x264.h | 2 + 11 files changed, 250 insertions(+), 51 deletions(-) diff --git a/common/amd64/pixel-sse2.asm b/common/amd64/pixel-sse2.asm index 9fc3755c..57a1a910 100644 --- a/common/amd64/pixel-sse2.asm +++ b/common/amd64/pixel-sse2.asm @@ -50,8 +50,15 @@ cglobal x264_pixel_satd_8x8_sse2 cglobal x264_pixel_satd_16x8_sse2 cglobal x264_pixel_satd_8x16_sse2 cglobal x264_pixel_satd_16x16_sse2 +cglobal x264_pixel_satd_8x4_ssse3 +cglobal x264_pixel_satd_8x8_ssse3 +cglobal x264_pixel_satd_16x8_ssse3 +cglobal x264_pixel_satd_8x16_ssse3 +cglobal x264_pixel_satd_16x16_ssse3 cglobal x264_pixel_sa8d_8x8_sse2 cglobal x264_pixel_sa8d_16x16_sse2 +cglobal x264_pixel_sa8d_8x8_ssse3 +cglobal x264_pixel_sa8d_16x16_ssse3 cglobal x264_intra_sa8d_x3_8x8_core_sse2 cglobal x264_pixel_ssim_4x4x2_core_sse2 cglobal x264_pixel_ssim_end4_sse2 @@ -267,6 +274,20 @@ x264_pixel_ssd_16x8_sse2: SUMSUB_BADC %5, %6, %7, %8 %endmacro +;;; row transform not used, because phaddw is much slower than paddw on a Conroe +;%macro PHSUMSUB 3 +; movdqa %3, %1 +; phaddw %1, %2 +; phsubw %3, %2 +;%endmacro + +;%macro HADAMARD4x1_SSSE3 5 ; ABCD-T -> ADTC +; PHSUMSUB %1, %2, %5 +; PHSUMSUB %3, %4, %2 +; PHSUMSUB %1, %3, %4 +; PHSUMSUB %5, %2, %3 +;%endmacro + %macro SBUTTERFLY 5 mov%1 %5, %3 punpckl%2 %3, %4 @@ -318,6 +339,13 @@ x264_pixel_ssd_16x8_sse2: psubw %1, %2 %endmacro +%macro LOAD_DIFF_4x8P 6 ; 4x dest, 2x temp + LOAD_DIFF_8P %1, %5, [parm1q], [parm3q] + LOAD_DIFF_8P %2, %6, [parm1q+parm2q], [parm3q+parm4q] + LOAD_DIFF_8P %3, %5, [parm1q+2*parm2q], [parm3q+2*parm4q] + LOAD_DIFF_8P %4, %6, [parm1q+r10], [parm3q+r11] +%endmacro + %macro SUM1x8_SSE2 3 ; 01 junk sum pxor %2, %2 psubw %2, %1 @@ -338,8 +366,7 @@ x264_pixel_ssd_16x8_sse2: paddusw %4, %2 %endmacro -;;; two SUM4x4_SSE2 running side-by-side -%macro SUM4x4_TWO_SSE2 7 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6) sum +%macro SUM8x4_SSE2 7 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6) sum pxor %3, %3 pxor %6, %6 psubw %3, %1 @@ -358,18 +385,25 @@ x264_pixel_ssd_16x8_sse2: paddusw %7, %4 %endmacro -%macro SATD_TWO_SSE2 0 - LOAD_DIFF_8P xmm0, xmm4, [parm1q], [parm3q] - LOAD_DIFF_8P xmm1, xmm5, [parm1q+parm2q], [parm3q+parm4q] - LOAD_DIFF_8P xmm2, xmm4, [parm1q+2*parm2q], [parm3q+2*parm4q] - LOAD_DIFF_8P xmm3, xmm5, [parm1q+r10], [parm3q+r11] - lea parm1q, [parm1q+4*parm2q] - lea parm3q, [parm3q+4*parm4q] +%macro SUM8x4_SSSE3 7 ; a02 a13 . b02 b13 . sum + pabsw %1, %1 + pabsw %2, %2 + pabsw %4, %4 + pabsw %5, %5 + paddusw %1, %2 + paddusw %4, %5 + paddusw %7, %1 + paddusw %7, %4 +%endmacro +%macro SATD_TWO_SSE2 0 + LOAD_DIFF_4x8P xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + lea parm1q, [parm1q+4*parm2q] + lea parm3q, [parm3q+4*parm4q] HADAMARD1x4 xmm0, xmm1, xmm2, xmm3 TRANSPOSE2x4x4W xmm0, xmm1, xmm2, xmm3, xmm4 HADAMARD1x4 xmm0, xmm1, xmm2, xmm3 - SUM4x4_TWO_SSE2 xmm0, xmm1, xmm4, xmm2, xmm3, xmm5, xmm6 + SUM8x4 xmm0, xmm1, xmm4, xmm2, xmm3, xmm5, xmm6 %endmacro %macro SATD_START 0 @@ -385,85 +419,72 @@ x264_pixel_ssd_16x8_sse2: ret %endmacro +%macro SATDS 1 ALIGN 16 ;----------------------------------------------------------------------------- ; int x264_pixel_satd_16x16_sse2 (uint8_t *, int, uint8_t *, int ) ;----------------------------------------------------------------------------- -x264_pixel_satd_16x16_sse2: +x264_pixel_satd_16x16_%1: SATD_START mov r8, rdi mov r9, rdx - SATD_TWO_SSE2 SATD_TWO_SSE2 SATD_TWO_SSE2 SATD_TWO_SSE2 - lea rdi, [r8+8] lea rdx, [r9+8] - SATD_TWO_SSE2 SATD_TWO_SSE2 SATD_TWO_SSE2 SATD_TWO_SSE2 - SATD_END ALIGN 16 ;----------------------------------------------------------------------------- ; int x264_pixel_satd_8x16_sse2 (uint8_t *, int, uint8_t *, int ) ;----------------------------------------------------------------------------- -x264_pixel_satd_8x16_sse2: +x264_pixel_satd_8x16_%1: SATD_START - SATD_TWO_SSE2 SATD_TWO_SSE2 SATD_TWO_SSE2 SATD_TWO_SSE2 - SATD_END ALIGN 16 ;----------------------------------------------------------------------------- ; int x264_pixel_satd_16x8_sse2 (uint8_t *, int, uint8_t *, int ) ;----------------------------------------------------------------------------- -x264_pixel_satd_16x8_sse2: +x264_pixel_satd_16x8_%1: SATD_START mov r8, rdi mov r9, rdx - SATD_TWO_SSE2 SATD_TWO_SSE2 - lea rdi, [r8+8] lea rdx, [r9+8] - SATD_TWO_SSE2 SATD_TWO_SSE2 - SATD_END ALIGN 16 ;----------------------------------------------------------------------------- ; int x264_pixel_satd_8x8_sse2 (uint8_t *, int, uint8_t *, int ) ;----------------------------------------------------------------------------- -x264_pixel_satd_8x8_sse2: +x264_pixel_satd_8x8_%1: SATD_START - SATD_TWO_SSE2 SATD_TWO_SSE2 - SATD_END ALIGN 16 ;----------------------------------------------------------------------------- ; int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int ) ;----------------------------------------------------------------------------- -x264_pixel_satd_8x4_sse2: +x264_pixel_satd_8x4_%1: SATD_START - SATD_TWO_SSE2 - SATD_END @@ -471,27 +492,21 @@ ALIGN 16 ;----------------------------------------------------------------------------- ; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int ) ;----------------------------------------------------------------------------- -x264_pixel_sa8d_8x8_sse2: +x264_pixel_sa8d_8x8_%1: lea r10, [3*parm2q] lea r11, [3*parm4q] - LOAD_DIFF_8P xmm0, xmm8, [parm1q], [parm3q] - LOAD_DIFF_8P xmm1, xmm9, [parm1q+parm2q], [parm3q+parm4q] - LOAD_DIFF_8P xmm2, xmm8, [parm1q+2*parm2q], [parm3q+2*parm4q] - LOAD_DIFF_8P xmm3, xmm9, [parm1q+r10], [parm3q+r11] + LOAD_DIFF_4x8P xmm0, xmm1, xmm2, xmm3, xmm8, xmm8 lea parm1q, [parm1q+4*parm2q] lea parm3q, [parm3q+4*parm4q] - LOAD_DIFF_8P xmm4, xmm8, [parm1q], [parm3q] - LOAD_DIFF_8P xmm5, xmm9, [parm1q+parm2q], [parm3q+parm4q] - LOAD_DIFF_8P xmm6, xmm8, [parm1q+2*parm2q], [parm3q+2*parm4q] - LOAD_DIFF_8P xmm7, xmm9, [parm1q+r10], [parm3q+r11] - + LOAD_DIFF_4x8P xmm4, xmm5, xmm6, xmm7, xmm8, xmm8 + HADAMARD1x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 TRANSPOSE8x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8 HADAMARD1x8 xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1 pxor xmm10, xmm10 - SUM4x4_TWO_SSE2 xmm0, xmm1, xmm6, xmm2, xmm3, xmm9, xmm10 - SUM4x4_TWO_SSE2 xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10 + SUM8x4 xmm0, xmm1, xmm6, xmm2, xmm3, xmm9, xmm10 + SUM8x4 xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10 psrlw xmm10, 1 HADDW xmm10, xmm0 movd eax, xmm10 @@ -505,26 +520,34 @@ ALIGN 16 ; int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int ) ;----------------------------------------------------------------------------- ;; violates calling convention -x264_pixel_sa8d_16x16_sse2: +x264_pixel_sa8d_16x16_%1: xor r8d, r8d - call x264_pixel_sa8d_8x8_sse2 ; pix[0] + call x264_pixel_sa8d_8x8_%1 ; pix[0] lea parm1q, [parm1q+4*parm2q] lea parm3q, [parm3q+4*parm4q] - call x264_pixel_sa8d_8x8_sse2 ; pix[8*stride] + call x264_pixel_sa8d_8x8_%1 ; pix[8*stride] lea r10, [3*parm2q-2] lea r11, [3*parm4q-2] shl r10, 2 shl r11, 2 sub parm1q, r10 sub parm3q, r11 - call x264_pixel_sa8d_8x8_sse2 ; pix[8] + call x264_pixel_sa8d_8x8_%1 ; pix[8] lea parm1q, [parm1q+4*parm2q] lea parm3q, [parm3q+4*parm4q] - call x264_pixel_sa8d_8x8_sse2 ; pix[8*stride+8] + call x264_pixel_sa8d_8x8_%1 ; pix[8*stride+8] mov eax, r8d add eax, 1 shr eax, 1 ret +%endmacro ; SATDS + +%define SUM8x4 SUM8x4_SSE2 +SATDS sse2 +%ifdef HAVE_SSE3 +%define SUM8x4 SUM8x4_SSSE3 +SATDS ssse3 +%endif @@ -567,7 +590,7 @@ x264_intra_sa8d_x3_8x8_core_sse2: movdqa xmm9, xmm3 movdqa xmm10, xmm4 movdqa xmm11, xmm5 - SUM4x4_TWO_SSE2 xmm8, xmm9, xmm12, xmm10, xmm11, xmm13, xmm15 + SUM8x4_SSE2 xmm8, xmm9, xmm12, xmm10, xmm11, xmm13, xmm15 movdqa xmm8, xmm6 movdqa xmm9, xmm7 SUM4x4_SSE2 xmm8, xmm9, xmm10, xmm15 diff --git a/common/amd64/quant-a.asm b/common/amd64/quant-a.asm index fcb5db1a..ba7058de 100644 --- a/common/amd64/quant-a.asm +++ b/common/amd64/quant-a.asm @@ -45,6 +45,10 @@ cglobal x264_quant_4x4_dc_core15_mmx cglobal x264_quant_4x4_core15_mmx cglobal x264_quant_8x8_core15_mmx +cglobal x264_quant_4x4_dc_core15_ssse3 +cglobal x264_quant_4x4_core15_ssse3 +cglobal x264_quant_8x8_core15_ssse3 + cglobal x264_quant_2x2_dc_core16_mmxext cglobal x264_quant_4x4_dc_core16_mmxext cglobal x264_quant_4x4_core16_mmxext @@ -76,6 +80,21 @@ cglobal x264_dequant_8x8_mmx punpckldq mm7, mm7 ; f in each dword %endmacro +%macro SSE2_QUANT_AC_START 0 + movd xmm6, parm3d ; i_qbits + movd xmm7, parm4d ; f + pshufd xmm7, xmm7, 0 ; f in each dword +%endmacro + +%macro SSE2_QUANT15_DC_START 0 + movd xmm5, parm2d ; i_qmf + movd xmm6, parm3d ; i_qbits + movd xmm7, parm4d ; f + pshuflw xmm5, xmm5, 0 + punpcklqdq xmm5, xmm5 ; i_qmf in each word + pshufd xmm7, xmm7, 0 ; f in each dword +%endmacro + %macro MMX_QUANT15_1x4 4 ;;; %1 (m64) dct[y][x] ;;; %2 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as int16_t) @@ -104,7 +123,30 @@ cglobal x264_dequant_8x8_mmx packssdw mm0, mm1 ; pack pxor mm0, mm4 ; restore sign psubw mm0, mm4 - movq %1, mm0 ; store + movq %1, mm0 ; store +%endmacro + +%macro SSSE3_QUANT15_1x8 4 + movdqa xmm0, %1 ; load dct coeffs + movdqa xmm4, xmm0 ; save sign + pabsw xmm0, xmm0 + + movdqa xmm2, xmm0 + pmullw xmm0, %2 + pmulhw xmm2, %2 + + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 + punpckhwd xmm1, xmm2 + + paddd xmm0, %4 ; round with f + paddd xmm1, %4 + psrad xmm0, %3 + psrad xmm1, %3 + + packssdw xmm0, xmm1 ; pack + psignw xmm0, xmm4 ; restore sign + movdqa %1, xmm0 ; store %endmacro ALIGN 16 @@ -168,6 +210,52 @@ x264_quant_8x8_core15_mmx: ret +%ifdef HAVE_SSE3 +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_quant_4x4_dc_core15_ssse3( int16_t dct[4][4], +; int const i_qmf, int const i_qbits, int const f ); +;----------------------------------------------------------------------------- +x264_quant_4x4_dc_core15_ssse3: + SSE2_QUANT15_DC_START + SSSE3_QUANT15_1x8 [parm1q], xmm5, xmm6, xmm7 + SSSE3_QUANT15_1x8 [parm1q+16], xmm5, xmm6, xmm7 + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_quant_4x4_core15_ssse3( int16_t dct[4][4], +; int const quant_mf[4][4], int const i_qbits, int const f ); +;----------------------------------------------------------------------------- +x264_quant_4x4_core15_ssse3: + SSE2_QUANT_AC_START +%assign x 0 +%rep 2 + movdqa xmm5, [parm2q+32*x] + packssdw xmm5, [parm2q+32*x+16] + SSSE3_QUANT15_1x8 [parm1q+16*x], xmm5, xmm6, xmm7 + %assign x x+1 +%endrep + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_quant_8x8_core15_ssse3( int16_t dct[8][8], +; int const quant_mf[8][8], int const i_qbits, int const f ); +;----------------------------------------------------------------------------- +x264_quant_8x8_core15_ssse3: + SSE2_QUANT_AC_START +%assign x 0 +%rep 8 + movdqa xmm5, [parm2q+32*x] + packssdw xmm5, [parm2q+32*x+16] + SSSE3_QUANT15_1x8 [parm1q+16*x], xmm5, xmm6, xmm7 + %assign x x+1 +%endrep + ret +%endif ; HAVE_SSE3 + + ; ============================================================================ %macro MMXEXT_QUANT16_DC_START 0 diff --git a/common/cpu.c b/common/cpu.c index 09e20ea8..32c7cd3b 100644 --- a/common/cpu.c +++ b/common/cpu.c @@ -80,6 +80,16 @@ uint32_t x264_cpu_detect( void ) /* Is it OK ? */ cpu |= X264_CPU_SSE2; } +#ifdef HAVE_SSE3 + if( (ecx&0x00000001) ) + { + cpu |= X264_CPU_SSE3; + } + if( (ecx&0x00000200) ) + { + cpu |= X264_CPU_SSSE3; + } +#endif x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx ); if( eax < 0x80000001 ) diff --git a/common/i386/pixel.h b/common/i386/pixel.h index fb06cccf..c15459d9 100644 --- a/common/i386/pixel.h +++ b/common/i386/pixel.h @@ -87,9 +87,18 @@ int x264_pixel_satd_8x16_sse2( uint8_t *, int, uint8_t *, int ); int x264_pixel_satd_8x8_sse2( uint8_t *, int, uint8_t *, int ); int x264_pixel_satd_8x4_sse2( uint8_t *, int, uint8_t *, int ); +int x264_pixel_satd_16x16_ssse3( uint8_t *, int, uint8_t *, int ); +int x264_pixel_satd_16x8_ssse3( uint8_t *, int, uint8_t *, int ); +int x264_pixel_satd_8x16_ssse3( uint8_t *, int, uint8_t *, int ); +int x264_pixel_satd_8x8_ssse3( uint8_t *, int, uint8_t *, int ); +int x264_pixel_satd_8x4_ssse3( uint8_t *, int, uint8_t *, int ); + int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int ); int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int ); +int x264_pixel_sa8d_16x16_ssse3( uint8_t *, int, uint8_t *, int ); +int x264_pixel_sa8d_8x8_ssse3( uint8_t *, int, uint8_t *, int ); + void x264_intra_satd_x3_4x4_mmxext( uint8_t *, uint8_t *, int * ); void x264_intra_satd_x3_8x8c_mmxext( uint8_t *, uint8_t *, int * ); void x264_intra_satd_x3_16x16_mmxext( uint8_t *, uint8_t *, int * ); diff --git a/common/i386/quant.h b/common/i386/quant.h index ec42f4e1..1d4b51d9 100644 --- a/common/i386/quant.h +++ b/common/i386/quant.h @@ -32,6 +32,13 @@ void x264_quant_4x4_dc_core15_mmx( int16_t dct[4][4], void x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2], int const i_qmf, int const i_qbits, int const f ); +void x264_quant_8x8_core15_ssse3( int16_t dct[8][8], + int quant_mf[8][8], int const i_qbits, int const f ); +void x264_quant_4x4_core15_ssse3( int16_t dct[4][4], + int quant_mf[4][4], int const i_qbits, int const f ); +void x264_quant_4x4_dc_core15_ssse3( int16_t dct[4][4], + int const i_qmf, int const i_qbits, int const f ); + void x264_quant_8x8_core16_mmxext( int16_t dct[8][8], int quant_mf[8][8], int const i_qbits, int const f ); void x264_quant_4x4_core16_mmxext( int16_t dct[4][4], diff --git a/common/pixel.c b/common/pixel.c index 365266a0..51c52c05 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -536,6 +536,19 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2; pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2; +#endif + } + + if( cpu&X264_CPU_SSSE3 ) + { +#if defined(ARCH_X86_64) && defined(HAVE_SSE3) + pixf->satd[PIXEL_16x16]= x264_pixel_satd_16x16_ssse3; + pixf->satd[PIXEL_16x8] = x264_pixel_satd_16x8_ssse3; + pixf->satd[PIXEL_8x16] = x264_pixel_satd_8x16_ssse3; + pixf->satd[PIXEL_8x8] = x264_pixel_satd_8x8_ssse3; + pixf->satd[PIXEL_8x4] = x264_pixel_satd_8x4_ssse3; + pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3; + pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3; #endif } #endif diff --git a/common/quant.c b/common/quant.c index e7bd48cc..ad25824c 100644 --- a/common/quant.c +++ b/common/quant.c @@ -229,6 +229,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) #ifdef HAVE_MMXEXT /* select quant_8x8 based on CPU and maxQ8 */ +#if defined(ARCH_X86_64) && defined(HAVE_SSE3) + if( maxQ8 < (1<<15) && cpu&X264_CPU_SSSE3 ) + pf->quant_8x8_core = x264_quant_8x8_core15_ssse3; + else +#endif if( maxQ8 < (1<<15) && cpu&X264_CPU_MMX ) pf->quant_8x8_core = x264_quant_8x8_core15_mmx; else @@ -239,6 +244,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->quant_8x8_core = x264_quant_8x8_core32_mmxext; /* select quant_4x4 based on CPU and maxQ4 */ +#if defined(ARCH_X86_64) && defined(HAVE_SSE3) + if( maxQ4 < (1<<15) && cpu&X264_CPU_SSSE3 ) + pf->quant_4x4_core = x264_quant_4x4_core15_ssse3; + else +#endif if( maxQ4 < (1<<15) && cpu&X264_CPU_MMX ) pf->quant_4x4_core = x264_quant_4x4_core15_mmx; else @@ -267,6 +277,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->quant_2x2_dc_core = x264_quant_2x2_dc_core32_mmxext; } +#if defined(ARCH_X86_64) && defined(HAVE_SSE3) + if( maxQdc < (1<<15) && cpu&X264_CPU_SSSE3 ) + pf->quant_4x4_dc_core = x264_quant_4x4_dc_core15_ssse3; +#endif + if( cpu&X264_CPU_MMX ) { /* dequant is not subject to the above CQM-dependent overflow issues, diff --git a/configure b/configure index 4b013b09..b903fdd3 100755 --- a/configure +++ b/configure @@ -35,6 +35,15 @@ EOF return $TMP } +as_check() { + rm -f conftest* + echo "$1" > conftest.asm + $AS conftest.asm $ASFLAGS $2 -o conftest.o 2>$DEVNULL + TMP="$?" + rm -f conftest* + return $TMP +} + rm -f config.h config.mak x264.pc prefix='/usr/local' @@ -210,6 +219,17 @@ then fi fi +if [ $ARCH = X86_64 ] ; then + if ! as_check ; then + echo "No assembler. Please install yasm." + exit 1 + fi + if as_check "pabsw xmm0, xmm0" ; then + ASFLAGS="$ASFLAGS -DHAVE_SSE3" + CFLAGS="$CFLAGS -DHAVE_SSE3" + fi +fi + CFLAGS="$CFLAGS -DARCH_$ARCH -DSYS_$SYS" # parse options diff --git a/encoder/encoder.c b/encoder/encoder.c index e247e1e3..a5eacff2 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -671,6 +671,7 @@ x264_t *x264_encoder_open ( x264_param_t *param ) param->cpu&X264_CPU_MMXEXT ? "MMXEXT " : "", param->cpu&X264_CPU_SSE ? "SSE " : "", param->cpu&X264_CPU_SSE2 ? "SSE2 " : "", + param->cpu&X264_CPU_SSSE3 ? "SSSE3 " : "", param->cpu&X264_CPU_3DNOW ? "3DNow! " : "", param->cpu&X264_CPU_ALTIVEC ? "Altivec " : "" ); diff --git a/tools/checkasm.c b/tools/checkasm.c index d64fe502..988ff502 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -738,6 +738,7 @@ int check_all( int cpu_ref, int cpu_new ) int main(int argc, char *argv[]) { int ret = 0; + int cpu0 = 0, cpu1 = 0; int i; buf1 = x264_malloc( 1024 ); /* 32 x 32 */ @@ -759,13 +760,23 @@ int main(int argc, char *argv[]) #ifdef HAVE_MMXEXT fprintf( stderr, "x264: MMXEXT against C\n" ); - ret = check_all( 0, X264_CPU_MMX | X264_CPU_MMXEXT ); + cpu1 = X264_CPU_MMX | X264_CPU_MMXEXT; + ret = check_all( 0, cpu1 ); #ifdef HAVE_SSE2 if( x264_cpu_detect() & X264_CPU_SSE2 ) { fprintf( stderr, "\nx264: SSE2 against C\n" ); - ret |= check_all( X264_CPU_MMX | X264_CPU_MMXEXT, - X264_CPU_MMX | X264_CPU_MMXEXT | X264_CPU_SSE | X264_CPU_SSE2 ); + cpu0 = cpu1; + cpu1 |= X264_CPU_SSE | X264_CPU_SSE2; + ret |= check_all( cpu0, cpu1 ); + + if( x264_cpu_detect() & X264_CPU_SSSE3 ) + { + fprintf( stderr, "\nx264: SSSE3 against C\n" ); + cpu0 = cpu1; + cpu1 |= X264_CPU_SSE3 | X264_CPU_SSSE3; + ret |= check_all( cpu0, cpu1 ); + } } #endif #elif ARCH_PPC diff --git a/x264.h b/x264.h index d2897d84..cbddbee7 100644 --- a/x264.h +++ b/x264.h @@ -53,6 +53,8 @@ typedef struct x264_t x264_t; #define X264_CPU_3DNOW 0x000010 /* 3dnow! */ #define X264_CPU_3DNOWEXT 0x000020 /* 3dnow! ext */ #define X264_CPU_ALTIVEC 0x000040 /* altivec */ +#define X264_CPU_SSE3 0x000080 /* sse 3 */ +#define X264_CPU_SSSE3 0x000100 /* ssse 3 */ /* Analyse flags */ -- 2.40.0