cpu |= X264_CPU_MMXEXT|X264_CPU_SSE;
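/* CPUID leaf 1 feature bits tested here: edx bit 26 (0x04000000) = SSE2,
 * ecx bit 0 (0x00000001) = SSE3, ecx bit 9 (0x00000200) = SSSE3. */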
if( edx&0x04000000 )
cpu |= X264_CPU_SSE2;
-#ifdef HAVE_SSE3
if( ecx&0x00000001 )
cpu |= X264_CPU_SSE3;
if( ecx&0x00000200 )
cpu |= X264_CPU_SSSE3;
-#endif
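/* leaf 0x80000000 returns the highest supported extended CPUID leaf in eax */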
x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
max_extended_cap = eax;
pf->scan_8x8 = zigzag_scan_8x8_frame;
pf->scan_4x4 = zigzag_scan_4x4_frame;
pf->sub_4x4 = zigzag_sub_4x4_frame;
-#ifdef HAVE_SSE3
+#ifdef HAVE_MMX
if( cpu&X264_CPU_SSSE3 )
pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
#endif
#ifdef HAVE_MMX
SATD_X_DECL7( _mmxext )
SATD_X_DECL5( _sse2 )
-#ifdef HAVE_SSE3
SATD_X_DECL7( _ssse3 )
#endif
-#endif
/****************************************************************************
* structural similarity metric
#endif
}
-#ifdef HAVE_SSE3
if( (cpu&X264_CPU_SSE3) && (cpu&X264_CPU_CACHELINE_SPLIT) )
{
INIT2( sad, _sse3 );
INIT2( sad_x4, _cache64_ssse3 );
}
}
-#endif //HAVE_SSE3
#endif //HAVE_MMX
#ifdef ARCH_PPC
pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2;
}
}
-#endif
-#ifdef HAVE_SSE3
if( cpu&X264_CPU_SSSE3 )
{
pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
pf->quant_4x4 = x264_quant_4x4_ssse3;
pf->quant_8x8 = x264_quant_8x8_ssse3;
}
-#endif
+#endif // HAVE_MMX
#ifdef ARCH_PPC
if( cpu&X264_CPU_ALTIVEC ) {
mov [r0+12], r2d
RET
-%ifdef HAVE_SSE3
;-----------------------------------------------------------------------------
; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
;-----------------------------------------------------------------------------
movdqa [r0], xmm0
movdqa [r0+16], xmm1
RET
-%endif
%endmacro
PIXEL_AVG_SSE sse2
-%ifdef HAVE_SSE3
%define movdqu lddqu
PIXEL_AVG_SSE sse3
%undef movdqu
-%endif
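; (the sse3 flavour above is simply the sse2 macro body reassembled with every
;  movdqu replaced by lddqu, via the %define/%undef pair around it)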
; Cacheline split code for processors with high latencies for loads
; split over cache lines. See sad-a.asm for a more detailed explanation.
COPY_W16_SSE2 x264_mc_copy_w16_sse2, movdqu
; cacheline split with mmx has too much overhead; the speed benefit is near-zero.
; but with SSE3 the overhead is zero, so there's no reason not to include it.
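; (lddqu is sse3's unaligned load: on the P4 it fetches a wider aligned block and
;  extracts the 16 bytes, so a load that straddles a cacheline costs no extra
;  penalty, whereas faking this with mmx needs extra shift/merge work per load)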
-%ifdef HAVE_SSE3
COPY_W16_SSE2 x264_mc_copy_w16_sse3, lddqu
-%endif
COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa
%define PALIGNR PALIGNR_SSE2
HPEL_V sse2
HPEL_C sse2
-%ifdef HAVE_SSE3
%define PALIGNR PALIGNR_SSSE3
HPEL_C ssse3
-%endif
cglobal x264_sfence
sfence
PIXEL_AVG_WTAB(cache64_mmxext, mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext)
PIXEL_AVG_WTAB(sse2, mmxext, mmxext, mmxext, sse2, sse2)
PIXEL_AVG_WTAB(cache64_sse2, mmxext, cache64_mmxext, cache64_sse2, cache64_sse2, cache64_sse2)
-#ifdef HAVE_SSE3
PIXEL_AVG_WTAB(cache64_sse3, mmxext, cache64_mmxext, sse3, sse3, sse3)
-#endif
#define MC_COPY_WTAB(instr, name1, name2, name3)\
static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, int, int ) =\
MC_COPY_WTAB(mmx,mmx,mmx,mmx)
MC_COPY_WTAB(sse2,mmx,mmx,sse2)
-#ifdef HAVE_SSE3
MC_COPY_WTAB(sse3,mmx,mmx,sse3)
-#endif
static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
#endif
MC_LUMA(sse2,sse2,sse2)
MC_LUMA(cache64_sse2,cache64_sse2,sse2)
-#ifdef HAVE_SSE3
MC_LUMA(cache64_sse3,cache64_sse3,sse3)
-#endif
#define GET_REF(name)\
uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\
#endif
GET_REF(sse2)
GET_REF(cache64_sse2)
-#ifdef HAVE_SSE3
GET_REF(cache64_sse3)
-#endif
#define HPEL(align, cpu, cpuv, cpuc, cpuh)\
void x264_hpel_filter_v_##cpuv( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width);\
HPEL(8, mmxext, mmxext, mmxext, mmxext)
HPEL(16, sse2_amd, mmxext, mmxext, sse2)
HPEL(16, sse2, sse2, sse2, sse2)
-#ifdef HAVE_SSE3
HPEL(16, ssse3, sse2, ssse3, sse2)
-#endif
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{
{
pf->mc_luma = mc_luma_cache64_sse2;
pf->get_ref = get_ref_cache64_sse2;
-#ifdef HAVE_SSE3
/* on Core 2, lddqu behaves just like movdqu, so the lddqu-based cacheline-split
 * code only helps CPUs that have SSE3 but not SSSE3 (i.e. Prescott-era P4s) */
if( (cpu&X264_CPU_SSE3) && !(cpu&X264_CPU_SSSE3) )
{
pf->mc_luma = mc_luma_cache64_sse3;
pf->get_ref = get_ref_cache64_sse3;
}
-#endif
}
if( !(cpu&X264_CPU_SSSE3) )
return;
-#ifdef HAVE_SSE3
pf->hpel_filter = x264_hpel_filter_ssse3;
-#endif
}
SA8D_16x16_32 sse2
INTRA_SA8D_SSE2 sse2
INTRA_SATDS_MMX mmxext
-%ifdef HAVE_SSE3
%define ABS1 ABS1_SSSE3
%define ABS2 ABS2_SSSE3
SATDS_SSE2 ssse3
INTRA_SA8D_SSE2 ssse3
INTRA_SATDS_MMX ssse3
SATD_W4 ssse3 ; mmx, but uses pabsw from ssse3.
-%endif
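; (ABS1_SSSE3/ABS2_SSSE3 presumably reduce to single pabsw instructions, replacing
;  the multi-instruction absolute-value sequences used by the mmx/sse2 macros)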
%endmacro
ADS_SSE2 sse2
-%ifdef HAVE_SSE3
%define ABS1 ABS1_SSSE3
ADS_SSE2 ssse3
-%endif
; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
; {
#ifdef ARCH_X86_64
INTRA_SA8D_X3(sse2)
-#ifdef HAVE_SSE3
INTRA_SA8D_X3(ssse3)
-#endif
#else
INTRA_SA8D_X3(mmxext)
#endif
QUANT_AC x264_quant_4x4_sse2, QUANT_MMX, 2, 16
QUANT_AC x264_quant_8x8_sse2, QUANT_MMX, 8, 16
-%ifdef HAVE_SSE3
QUANT_DC x264_quant_4x4_dc_ssse3, QUANT_SSSE3, 2, 16
QUANT_AC x264_quant_4x4_ssse3, QUANT_SSSE3, 2, 16
QUANT_AC x264_quant_8x8_ssse3, QUANT_SSSE3, 8, 16
-%endif
%include "x86inc.asm"
SECTION_RODATA
-sw_64: dq 64
+sw_64: dd 64
SECTION .text
%endmacro
SAD_W16 sse2
-%ifdef HAVE_SSE3
%define movdqu lddqu
SAD_W16 sse3
%undef movdqu
-%endif
SAD_X_SSE2 4, 16, 16, sse2
SAD_X_SSE2 4, 16, 8, sse2
-%ifdef HAVE_SSE3
%define movdqu lddqu
SAD_X_SSE2 3, 16, 16, sse3
SAD_X_SSE2 3, 16, 8, sse3
SAD_X_SSE2 4, 16, 16, sse3
SAD_X_SSE2 4, 16, 8, sse3
%undef movdqu
-%endif
SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2
%endif ; !ARCH_X86_64
-%ifdef HAVE_SSE3
SAD16_CACHELINE_FUNC ssse3, 8
SAD16_CACHELINE_FUNC ssse3, 16
%assign i 1
%endrep
SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3
SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3
-%endif ; HAVE_SSE3
echo "yasm prior to 0.6.2 miscompiles PIC. trying nasm instead..."
AS=nasm
fi
- if as_check ; then
+ if as_check "pabsw xmm0, xmm0" ; then
CFLAGS="$CFLAGS -DHAVE_MMX"
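# pabsw is an SSSE3 instruction, so an assembler that accepts it can build all of
# the x86 asm, SSSE3 routines included; a single HAVE_MMX define is then sufficient.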
- if as_check "pabsw xmm0, xmm0" ; then
- ASFLAGS="$ASFLAGS -DHAVE_SSE3"
- CFLAGS="$CFLAGS -DHAVE_SSE3"
- fi
else
echo "No suitable assembler found. x264 will be several times slower."
echo "Please install 'yasm' to get MMX/SSE optimized code."