cglobal x264_pixel_satd_16x8_sse2
cglobal x264_pixel_satd_8x16_sse2
cglobal x264_pixel_satd_16x16_sse2
+cglobal x264_pixel_satd_8x4_ssse3
+cglobal x264_pixel_satd_8x8_ssse3
+cglobal x264_pixel_satd_16x8_ssse3
+cglobal x264_pixel_satd_8x16_ssse3
+cglobal x264_pixel_satd_16x16_ssse3
cglobal x264_pixel_sa8d_8x8_sse2
cglobal x264_pixel_sa8d_16x16_sse2
+cglobal x264_pixel_sa8d_8x8_ssse3
+cglobal x264_pixel_sa8d_16x16_ssse3
cglobal x264_intra_sa8d_x3_8x8_core_sse2
cglobal x264_pixel_ssim_4x4x2_core_sse2
cglobal x264_pixel_ssim_end4_sse2
SUMSUB_BADC %5, %6, %7, %8
%endmacro
+;;; row transform not used, because phaddw is much slower than paddw on a Conroe
+;%macro PHSUMSUB 3
+; movdqa %3, %1
+; phaddw %1, %2
+; phsubw %3, %2
+;%endmacro
+
+;%macro HADAMARD4x1_SSSE3 5 ; ABCD-T -> ADTC
+; PHSUMSUB %1, %2, %5
+; PHSUMSUB %3, %4, %2
+; PHSUMSUB %1, %3, %4
+; PHSUMSUB %5, %2, %3
+;%endmacro
+
%macro SBUTTERFLY 5
mov%1 %5, %3
punpckl%2 %3, %4
punpckh%2 %5, %4
%endmacro
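+; Load four 8-pixel rows from pix1 and pix2 (strides in parm2q/parm4q, 3*stride in
+; r10/r11) and leave the differences as words in the four destination registers.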
+%macro LOAD_DIFF_4x8P 6 ; 4x dest, 2x temp
+ LOAD_DIFF_8P %1, %5, [parm1q], [parm3q]
+ LOAD_DIFF_8P %2, %6, [parm1q+parm2q], [parm3q+parm4q]
+ LOAD_DIFF_8P %3, %5, [parm1q+2*parm2q], [parm3q+2*parm4q]
+ LOAD_DIFF_8P %4, %6, [parm1q+r10], [parm3q+r11]
+%endmacro
+
%macro SUM1x8_SSE2 3 ; 01 junk sum
pxor %2, %2
psubw %2, %1
pmaxsw %1, %2
paddusw %3, %1
%endmacro
-;;; two SUM4x4_SSE2 running side-by-side
-%macro SUM4x4_TWO_SSE2 7 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6) sum
+%macro SUM8x4_SSE2 7 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6) sum
pxor %3, %3
pxor %6, %6
psubw %3, %1
paddusw %7, %4
%endmacro
-%macro SATD_TWO_SSE2 0
- LOAD_DIFF_8P xmm0, xmm4, [parm1q], [parm3q]
- LOAD_DIFF_8P xmm1, xmm5, [parm1q+parm2q], [parm3q+parm4q]
- LOAD_DIFF_8P xmm2, xmm4, [parm1q+2*parm2q], [parm3q+2*parm4q]
- LOAD_DIFF_8P xmm3, xmm5, [parm1q+r10], [parm3q+r11]
- lea parm1q, [parm1q+4*parm2q]
- lea parm3q, [parm3q+4*parm4q]
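+; Same arguments as SUM8x4_SSE2, but pabsw gives the absolute value of 8 words in
+; a single instruction, replacing the negate-and-max sequence of the SSE2 version
+; before the saturating adds that accumulate into %7.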
+%macro SUM8x4_SSSE3 7 ; a02 a13 . b02 b13 . sum
+ pabsw %1, %1
+ pabsw %2, %2
+ pabsw %4, %4
+ pabsw %5, %5
+ paddusw %1, %2
+ paddusw %4, %5
+ paddusw %7, %1
+ paddusw %7, %4
+%endmacro
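+; One SATD_TWO_SSE2 covers two adjacent 4x4 blocks (an 8x4 strip): load the pixel
+; differences, run the 4-point Hadamard on the rows, transpose, run it again on the
+; columns, and accumulate the absolute coefficients into xmm6 through SUM8x4.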
+%macro SATD_TWO_SSE2 0
+ LOAD_DIFF_4x8P xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ lea parm1q, [parm1q+4*parm2q]
+ lea parm3q, [parm3q+4*parm4q]
HADAMARD1x4 xmm0, xmm1, xmm2, xmm3
TRANSPOSE2x4x4W xmm0, xmm1, xmm2, xmm3, xmm4
HADAMARD1x4 xmm0, xmm1, xmm2, xmm3
- SUM4x4_TWO_SSE2 xmm0, xmm1, xmm4, xmm2, xmm3, xmm5, xmm6
+ SUM8x4 xmm0, xmm1, xmm4, xmm2, xmm3, xmm5, xmm6
%endmacro
%macro SATD_START 0
ret
%endmacro
+%macro SATDS 1
ALIGN 16
;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-x264_pixel_satd_16x16_sse2:
+x264_pixel_satd_16x16_%1:
SATD_START
mov r8, rdi
mov r9, rdx
-
SATD_TWO_SSE2
SATD_TWO_SSE2
SATD_TWO_SSE2
SATD_TWO_SSE2
-
lea rdi, [r8+8]
lea rdx, [r9+8]
-
SATD_TWO_SSE2
SATD_TWO_SSE2
SATD_TWO_SSE2
SATD_TWO_SSE2
-
SATD_END
ALIGN 16
;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-x264_pixel_satd_8x16_sse2:
+x264_pixel_satd_8x16_%1:
SATD_START
-
SATD_TWO_SSE2
SATD_TWO_SSE2
SATD_TWO_SSE2
SATD_TWO_SSE2
-
SATD_END
ALIGN 16
;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-x264_pixel_satd_16x8_sse2:
+x264_pixel_satd_16x8_%1:
SATD_START
mov r8, rdi
mov r9, rdx
-
SATD_TWO_SSE2
SATD_TWO_SSE2
-
lea rdi, [r8+8]
lea rdx, [r9+8]
-
SATD_TWO_SSE2
SATD_TWO_SSE2
-
SATD_END
ALIGN 16
;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-x264_pixel_satd_8x8_sse2:
+x264_pixel_satd_8x8_%1:
SATD_START
-
SATD_TWO_SSE2
SATD_TWO_SSE2
-
SATD_END
ALIGN 16
;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-x264_pixel_satd_8x4_sse2:
+x264_pixel_satd_8x4_%1:
SATD_START
-
SATD_TWO_SSE2
-
SATD_END
;-----------------------------------------------------------------------------
; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-x264_pixel_sa8d_8x8_sse2:
+x264_pixel_sa8d_8x8_%1:
lea r10, [3*parm2q]
lea r11, [3*parm4q]
- LOAD_DIFF_8P xmm0, xmm8, [parm1q], [parm3q]
- LOAD_DIFF_8P xmm1, xmm9, [parm1q+parm2q], [parm3q+parm4q]
- LOAD_DIFF_8P xmm2, xmm8, [parm1q+2*parm2q], [parm3q+2*parm4q]
- LOAD_DIFF_8P xmm3, xmm9, [parm1q+r10], [parm3q+r11]
+ LOAD_DIFF_4x8P xmm0, xmm1, xmm2, xmm3, xmm8, xmm8
lea parm1q, [parm1q+4*parm2q]
lea parm3q, [parm3q+4*parm4q]
- LOAD_DIFF_8P xmm4, xmm8, [parm1q], [parm3q]
- LOAD_DIFF_8P xmm5, xmm9, [parm1q+parm2q], [parm3q+parm4q]
- LOAD_DIFF_8P xmm6, xmm8, [parm1q+2*parm2q], [parm3q+2*parm4q]
- LOAD_DIFF_8P xmm7, xmm9, [parm1q+r10], [parm3q+r11]
-
+ LOAD_DIFF_4x8P xmm4, xmm5, xmm6, xmm7, xmm8, xmm8
+
HADAMARD1x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
TRANSPOSE8x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
HADAMARD1x8 xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1
pxor xmm10, xmm10
- SUM4x4_TWO_SSE2 xmm0, xmm1, xmm6, xmm2, xmm3, xmm9, xmm10
- SUM4x4_TWO_SSE2 xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10
+ SUM8x4 xmm0, xmm1, xmm6, xmm2, xmm3, xmm9, xmm10
+ SUM8x4 xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10
psrlw xmm10, 1
HADDW xmm10, xmm0
movd eax, xmm10
; int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
;; violates calling convention
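+;; (the running sum is kept in r8d across the four 8x8 calls; the final +1>>1
+;; rounding is applied only once, here)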
-x264_pixel_sa8d_16x16_sse2:
+x264_pixel_sa8d_16x16_%1:
xor r8d, r8d
- call x264_pixel_sa8d_8x8_sse2 ; pix[0]
+ call x264_pixel_sa8d_8x8_%1 ; pix[0]
lea parm1q, [parm1q+4*parm2q]
lea parm3q, [parm3q+4*parm4q]
- call x264_pixel_sa8d_8x8_sse2 ; pix[8*stride]
+ call x264_pixel_sa8d_8x8_%1 ; pix[8*stride]
lea r10, [3*parm2q-2]
lea r11, [3*parm4q-2]
shl r10, 2
shl r11, 2
sub parm1q, r10
sub parm3q, r11
- call x264_pixel_sa8d_8x8_sse2 ; pix[8]
+ call x264_pixel_sa8d_8x8_%1 ; pix[8]
lea parm1q, [parm1q+4*parm2q]
lea parm3q, [parm3q+4*parm4q]
- call x264_pixel_sa8d_8x8_sse2 ; pix[8*stride+8]
+ call x264_pixel_sa8d_8x8_%1 ; pix[8*stride+8]
mov eax, r8d
add eax, 1
shr eax, 1
ret
+%endmacro ; SATDS
+
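+; Instantiate the SATD/SA8D functions twice: once with the SSE2 absolute-value
+; summation and, if the assembler supports it, once more with the pabsw-based
+; SSSE3 one, giving matching _sse2 and _ssse3 entry points from the same source.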
+%define SUM8x4 SUM8x4_SSE2
+SATDS sse2
+%ifdef HAVE_SSE3
+%define SUM8x4 SUM8x4_SSSE3
+SATDS ssse3
+%endif
movdqa xmm9, xmm3
movdqa xmm10, xmm4
movdqa xmm11, xmm5
- SUM4x4_TWO_SSE2 xmm8, xmm9, xmm12, xmm10, xmm11, xmm13, xmm15
+ SUM8x4_SSE2 xmm8, xmm9, xmm12, xmm10, xmm11, xmm13, xmm15
movdqa xmm8, xmm6
movdqa xmm9, xmm7
SUM4x4_SSE2 xmm8, xmm9, xmm10, xmm15
cglobal x264_quant_4x4_core15_mmx
cglobal x264_quant_8x8_core15_mmx
+cglobal x264_quant_4x4_dc_core15_ssse3
+cglobal x264_quant_4x4_core15_ssse3
+cglobal x264_quant_8x8_core15_ssse3
+
cglobal x264_quant_2x2_dc_core16_mmxext
cglobal x264_quant_4x4_dc_core16_mmxext
cglobal x264_quant_4x4_core16_mmxext
punpckldq mm7, mm7 ; f in each dword
%endmacro
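+; SSE2/SSSE3 equivalents of the mmx setup: i_qbits stays in xmm6 as the shift count,
+; the rounding constant f is broadcast to every dword of xmm7, and the DC variant
+; also broadcasts i_qmf to every word of xmm5.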
+%macro SSE2_QUANT_AC_START 0
+ movd xmm6, parm3d ; i_qbits
+ movd xmm7, parm4d ; f
+ pshufd xmm7, xmm7, 0 ; f in each dword
+%endmacro
+
+%macro SSE2_QUANT15_DC_START 0
+ movd xmm5, parm2d ; i_qmf
+ movd xmm6, parm3d ; i_qbits
+ movd xmm7, parm4d ; f
+ pshuflw xmm5, xmm5, 0
+ punpcklqdq xmm5, xmm5 ; i_qmf in each word
+ pshufd xmm7, xmm7, 0 ; f in each dword
+%endmacro
+
%macro MMX_QUANT15_1x4 4
;;; %1 (m64) dct[y][x]
;;; %2 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as int16_t)
packssdw mm0, mm1 ; pack
pxor mm0, mm4 ; restore sign
psubw mm0, mm4
- movq %1, mm0 ; store
+ movq %1, mm0 ; store
+%endmacro
+
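+; Quantize 8 coefficients per call: pabsw drops the sign, pmullw/pmulhw form the
+; 32-bit |dct|*mf products, which are rounded with f, shifted right by i_qbits,
+; packed back to words, and re-signed with psignw. pmulhw is a signed multiply,
+; so this path is only used when the quant_mf entries fit in 15 bits (core15).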
+%macro SSSE3_QUANT15_1x8 4
+ movdqa xmm0, %1 ; load dct coeffs
+ movdqa xmm4, xmm0 ; save sign
+ pabsw xmm0, xmm0
+
+ movdqa xmm2, xmm0
+ pmullw xmm0, %2
+ pmulhw xmm2, %2
+
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2
+ punpckhwd xmm1, xmm2
+
+ paddd xmm0, %4 ; round with f
+ paddd xmm1, %4
+ psrad xmm0, %3
+ psrad xmm1, %3
+
+ packssdw xmm0, xmm1 ; pack
+ psignw xmm0, xmm4 ; restore sign
+ movdqa %1, xmm0 ; store
%endmacro
ALIGN 16
ret
+%ifdef HAVE_SSE3
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_quant_4x4_dc_core15_ssse3( int16_t dct[4][4],
+; int const i_qmf, int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_4x4_dc_core15_ssse3:
+ SSE2_QUANT15_DC_START
+ SSSE3_QUANT15_1x8 [parm1q], xmm5, xmm6, xmm7
+ SSSE3_QUANT15_1x8 [parm1q+16], xmm5, xmm6, xmm7
+ ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_quant_4x4_core15_ssse3( int16_t dct[4][4],
+; int const quant_mf[4][4], int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_4x4_core15_ssse3:
+ SSE2_QUANT_AC_START
+%assign x 0
+%rep 2
+ movdqa xmm5, [parm2q+32*x]
+ packssdw xmm5, [parm2q+32*x+16]
+ SSSE3_QUANT15_1x8 [parm1q+16*x], xmm5, xmm6, xmm7
+ %assign x x+1
+%endrep
+ ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_quant_8x8_core15_ssse3( int16_t dct[8][8],
+; int const quant_mf[8][8], int const i_qbits, int const f );
+;-----------------------------------------------------------------------------
+x264_quant_8x8_core15_ssse3:
+ SSE2_QUANT_AC_START
+%assign x 0
+%rep 8
+ movdqa xmm5, [parm2q+32*x]
+ packssdw xmm5, [parm2q+32*x+16]
+ SSSE3_QUANT15_1x8 [parm1q+16*x], xmm5, xmm6, xmm7
+ %assign x x+1
+%endrep
+ ret
+%endif ; HAVE_SSE3
+
+
; ============================================================================
%macro MMXEXT_QUANT16_DC_START 0
/* Is it OK ? */
cpu |= X264_CPU_SSE2;
}
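+    /* CPUID eax=1 reports SSE3 in ECX bit 0 and SSSE3 in ECX bit 9,
+       hence the 0x1 and 0x200 masks */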
+#ifdef HAVE_SSE3
+ if( (ecx&0x00000001) )
+ {
+ cpu |= X264_CPU_SSE3;
+ }
+ if( (ecx&0x00000200) )
+ {
+ cpu |= X264_CPU_SSSE3;
+ }
+#endif
x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
if( eax < 0x80000001 )
int x264_pixel_satd_8x8_sse2( uint8_t *, int, uint8_t *, int );
int x264_pixel_satd_8x4_sse2( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_16x16_ssse3( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_16x8_ssse3( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_8x16_ssse3( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_8x8_ssse3( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_8x4_ssse3( uint8_t *, int, uint8_t *, int );
+
int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int );
int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sa8d_16x16_ssse3( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sa8d_8x8_ssse3( uint8_t *, int, uint8_t *, int );
+
void x264_intra_satd_x3_4x4_mmxext( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_8x8c_mmxext( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_16x16_mmxext( uint8_t *, uint8_t *, int * );
void x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2],
int const i_qmf, int const i_qbits, int const f );
+void x264_quant_8x8_core15_ssse3( int16_t dct[8][8],
+ int quant_mf[8][8], int const i_qbits, int const f );
+void x264_quant_4x4_core15_ssse3( int16_t dct[4][4],
+ int quant_mf[4][4], int const i_qbits, int const f );
+void x264_quant_4x4_dc_core15_ssse3( int16_t dct[4][4],
+ int const i_qmf, int const i_qbits, int const f );
+
void x264_quant_8x8_core16_mmxext( int16_t dct[8][8],
int quant_mf[8][8], int const i_qbits, int const f );
void x264_quant_4x4_core16_mmxext( int16_t dct[4][4],
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2;
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
+#endif
+ }
+
+ if( cpu&X264_CPU_SSSE3 )
+ {
+#if defined(ARCH_X86_64) && defined(HAVE_SSE3)
+ pixf->satd[PIXEL_16x16]= x264_pixel_satd_16x16_ssse3;
+ pixf->satd[PIXEL_16x8] = x264_pixel_satd_16x8_ssse3;
+ pixf->satd[PIXEL_8x16] = x264_pixel_satd_8x16_ssse3;
+ pixf->satd[PIXEL_8x8] = x264_pixel_satd_8x8_ssse3;
+ pixf->satd[PIXEL_8x4] = x264_pixel_satd_8x4_ssse3;
+ pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
+ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
#endif
}
#endif
#ifdef HAVE_MMXEXT
/* select quant_8x8 based on CPU and maxQ8 */
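+    /* prefer the 15-bit ssse3 path when available; otherwise fall back to the
+       existing mmx/mmxext choices below */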
+#if defined(ARCH_X86_64) && defined(HAVE_SSE3)
+ if( maxQ8 < (1<<15) && cpu&X264_CPU_SSSE3 )
+ pf->quant_8x8_core = x264_quant_8x8_core15_ssse3;
+ else
+#endif
if( maxQ8 < (1<<15) && cpu&X264_CPU_MMX )
pf->quant_8x8_core = x264_quant_8x8_core15_mmx;
else
pf->quant_8x8_core = x264_quant_8x8_core32_mmxext;
/* select quant_4x4 based on CPU and maxQ4 */
+#if defined(ARCH_X86_64) && defined(HAVE_SSE3)
+ if( maxQ4 < (1<<15) && cpu&X264_CPU_SSSE3 )
+ pf->quant_4x4_core = x264_quant_4x4_core15_ssse3;
+ else
+#endif
if( maxQ4 < (1<<15) && cpu&X264_CPU_MMX )
pf->quant_4x4_core = x264_quant_4x4_core15_mmx;
else
pf->quant_2x2_dc_core = x264_quant_2x2_dc_core32_mmxext;
}
+#if defined(ARCH_X86_64) && defined(HAVE_SSE3)
+ if( maxQdc < (1<<15) && cpu&X264_CPU_SSSE3 )
+ pf->quant_4x4_dc_core = x264_quant_4x4_dc_core15_ssse3;
+#endif
+
if( cpu&X264_CPU_MMX )
{
/* dequant is not subject to the above CQM-dependent overflow issues,
return $TMP
}
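+# assemble a one-line test program with $AS and return its exit status; used to
+# check whether the installed assembler understands a given instruction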
+as_check() {
+ rm -f conftest*
+ echo "$1" > conftest.asm
+ $AS conftest.asm $ASFLAGS $2 -o conftest.o 2>$DEVNULL
+ TMP="$?"
+ rm -f conftest*
+ return $TMP
+}
+
rm -f config.h config.mak x264.pc
prefix='/usr/local'
fi
fi
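+# On x86_64 the asm requires an external assembler; also probe it with pabsw to
+# decide whether the SSSE3 code paths (HAVE_SSE3) can be built.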
+if [ "$ARCH" = "X86_64" ] ; then
+ if ! as_check ; then
+ echo "No assembler. Please install yasm."
+ exit 1
+ fi
+ if as_check "pabsw xmm0, xmm0" ; then
+ ASFLAGS="$ASFLAGS -DHAVE_SSE3"
+ CFLAGS="$CFLAGS -DHAVE_SSE3"
+ fi
+fi
+
CFLAGS="$CFLAGS -DARCH_$ARCH -DSYS_$SYS"
# parse options
param->cpu&X264_CPU_MMXEXT ? "MMXEXT " : "",
param->cpu&X264_CPU_SSE ? "SSE " : "",
param->cpu&X264_CPU_SSE2 ? "SSE2 " : "",
+ param->cpu&X264_CPU_SSSE3 ? "SSSE3 " : "",
param->cpu&X264_CPU_3DNOW ? "3DNow! " : "",
param->cpu&X264_CPU_ALTIVEC ? "Altivec " : "" );
int main(int argc, char *argv[])
{
int ret = 0;
+ int cpu0 = 0, cpu1 = 0;
int i;
buf1 = x264_malloc( 1024 ); /* 32 x 32 */
#ifdef HAVE_MMXEXT
fprintf( stderr, "x264: MMXEXT against C\n" );
- ret = check_all( 0, X264_CPU_MMX | X264_CPU_MMXEXT );
+ cpu1 = X264_CPU_MMX | X264_CPU_MMXEXT;
+ ret = check_all( 0, cpu1 );
#ifdef HAVE_SSE2
if( x264_cpu_detect() & X264_CPU_SSE2 )
{
fprintf( stderr, "\nx264: SSE2 against C\n" );
- ret |= check_all( X264_CPU_MMX | X264_CPU_MMXEXT,
- X264_CPU_MMX | X264_CPU_MMXEXT | X264_CPU_SSE | X264_CPU_SSE2 );
+ cpu0 = cpu1;
+ cpu1 |= X264_CPU_SSE | X264_CPU_SSE2;
+ ret |= check_all( cpu0, cpu1 );
+
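+ /* cpu0 carries the flag sets already verified, so each new pass is compared
+ against the previous instruction set rather than against plain C */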
+ if( x264_cpu_detect() & X264_CPU_SSSE3 )
+ {
+ fprintf( stderr, "\nx264: SSSE3 against C\n" );
+ cpu0 = cpu1;
+ cpu1 |= X264_CPU_SSE3 | X264_CPU_SSSE3;
+ ret |= check_all( cpu0, cpu1 );
+ }
}
#endif
#elif ARCH_PPC
#define X264_CPU_3DNOW 0x000010 /* 3dnow! */
#define X264_CPU_3DNOWEXT 0x000020 /* 3dnow! ext */
#define X264_CPU_ALTIVEC 0x000040 /* altivec */
+#define X264_CPU_SSE3 0x000080 /* sse 3 */
+#define X264_CPU_SSSE3 0x000100 /* ssse 3 */
/* Analyse flags
*/