with Phenom, 3dnow is no longer equivalent to "sse2 is slow", so make a new flag for that.
some sse2 functions are useful only on Core2 and Phenom, so make a "sse2 is fast" flag for that.
some ssse3 instructions didn't become useful until Penryn, so yet another flag.
disable sse2 completely on Pentium M and Core1, because it's slower than mmx for almost all of x264's functions there.
enable some sse2 functions on Athlon64 that were always faster; we just hadn't noticed.
remove mc_luma_sse3, because the only cpu where lddqu is useful (namely Pentium 4D) doesn't have "sse2 is fast", so that function could never be selected anymore.
don't print mmx1, sse1, or 3dnow in the detected cpuflags, since we don't really have any functions specific to those. likewise, don't print sse3 unless it's actually used (Pentium 4D).
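
for reference, a minimal sketch (not part of the patch) of the convention the new flags follow when the init functions pick implementations; the real dispatch changes are in the diff below, and the include paths are only illustrative:

    #include <stdint.h>
    #include "x264.h"          /* X264_CPU_* flag bits */
    #include "common/cpu.h"    /* x264_cpu_detect() */

    static void pick_functions( void )
    {
        uint32_t cpu = x264_cpu_detect();
        if( (cpu & X264_CPU_SSE2) && !(cpu & X264_CPU_SSE2_IS_SLOW) )
        {   /* sse2 versions that win on every sse2 cpu except those flagged slow (pre-Phenom AMD) */ }
        if( cpu & X264_CPU_SSE2_IS_FAST )
        {   /* sse2 versions that only win on Core2 and Phenom, e.g. mc_luma_sse2 */ }
        if( cpu & X264_CPU_PHADD_IS_FAST )
        {   /* phadd-based ssse3 satd, useful starting with Penryn */ }
    }
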
#endif
#include "common.h"
+#include "cpu.h"
-const struct {
- const char name[8];
- int flags;
-} x264_cpu_names[] = {
- {"MMX", X264_CPU_MMX},
+const x264_cpu_name_t x264_cpu_names[] = {
+ {"Altivec", X264_CPU_ALTIVEC},
+// {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
{"MMX2", X264_CPU_MMX|X264_CPU_MMXEXT},
{"MMXEXT", X264_CPU_MMX|X264_CPU_MMXEXT},
- {"SSE", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE},
- {"SSE1", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE},
+// {"SSE", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE}, // there are no sse1 functions in x264
+ {"SSE2Slow",X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE2_IS_SLOW},
{"SSE2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2},
+ {"SSE2Fast",X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE2_IS_FAST},
{"SSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3},
{"SSSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
+ {"PHADD", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_PHADD_IS_FAST},
{"SSE4", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
- {"3DNow", X264_CPU_3DNOW},
- {"Altivec", X264_CPU_ALTIVEC},
- {"Cache32", X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_32},
- {"Cache64", X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64},
+ {"Cache32", X264_CPU_CACHELINE_32},
+ {"Cache64", X264_CPU_CACHELINE_64},
{"", 0},
};
if( ecx&0x00080000 )
cpu |= X264_CPU_SSE4;
+ if( cpu & X264_CPU_SSSE3 )
+ cpu |= X264_CPU_SSE2_IS_FAST;
+ if( cpu & X264_CPU_SSE4 )
+ cpu |= X264_CPU_PHADD_IS_FAST;
+
x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
max_extended_cap = eax;
if( !strcmp((char*)vendor, "AuthenticAMD") && max_extended_cap >= 0x80000001 )
{
x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
- if( edx&0x80000000 )
- cpu |= X264_CPU_3DNOW;
if( edx&0x00400000 )
cpu |= X264_CPU_MMXEXT;
+ if( cpu & X264_CPU_SSE2 )
+ {
+ if( ecx&0x00000040 ) /* SSE4a */
+ cpu |= X264_CPU_SSE2_IS_FAST;
+ else
+ cpu |= X264_CPU_SSE2_IS_SLOW;
+ }
}
- if( !strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead") )
- cpu |= X264_CPU_CACHELINE_SPLIT;
- /* cacheline size is specified in 3 places, any of which may be missing */
- x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
- cache = (ebx&0xff00)>>5; // cflush size
- if( !cache && max_extended_cap >= 0x80000006 )
+ if( !strcmp((char*)vendor, "GenuineIntel") )
{
- x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx );
- cache = ecx&0xff; // cacheline size
+ int family, model, stepping;
+ x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
+ family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
+ model = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
+ stepping = eax&0xf;
+ /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
+ * theoretically support sse2, but it's significantly slower than mmx for
+ * almost all of x264's functions, so let's just pretend they don't. */
+ if( family==6 && (model==9 || model==13 || model==14) )
+ {
+ cpu &= ~(X264_CPU_SSE2|X264_CPU_SSE3);
+ assert(!(cpu&(X264_CPU_SSSE3|X264_CPU_SSE4)));
+ }
}
- if( !cache )
+
+ if( !strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead") )
{
- // Cache and TLB Information
- static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
- static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67, 0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 };
- uint32_t buf[4];
- int max, i=0, j;
- do {
- x264_cpu_cpuid( 2, buf+0, buf+1, buf+2, buf+3 );
- max = buf[0]&0xff;
- buf[0] &= ~0xff;
- for(j=0; j<4; j++)
- if( !(buf[j]>>31) )
- while( buf[j] )
- {
- if( strchr( cache32_ids, buf[j]&0xff ) )
- cache = 32;
- if( strchr( cache64_ids, buf[j]&0xff ) )
- cache = 64;
- buf[j] >>= 8;
- }
- } while( ++i < max );
+ /* cacheline size is specified in 3 places, any of which may be missing */
+ x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
+ cache = (ebx&0xff00)>>5; // clflush line size, scaled to bytes
+ if( !cache && max_extended_cap >= 0x80000006 )
+ {
+ x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx );
+ cache = ecx&0xff; // cacheline size
+ }
+ if( !cache )
+ {
+ // Cache and TLB Information
+ static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
+ static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67, 0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 };
+ uint32_t buf[4];
+ int max, i=0, j;
+ do {
+ x264_cpu_cpuid( 2, buf+0, buf+1, buf+2, buf+3 );
+ max = buf[0]&0xff;
+ buf[0] &= ~0xff;
+ for(j=0; j<4; j++)
+ if( !(buf[j]>>31) )
+ while( buf[j] )
+ {
+ if( strchr( cache32_ids, buf[j]&0xff ) )
+ cache = 32;
+ if( strchr( cache64_ids, buf[j]&0xff ) )
+ cache = 64;
+ buf[j] >>= 8;
+ }
+ } while( ++i < max );
+ }
+
+ if( cache == 32 )
+ cpu |= X264_CPU_CACHELINE_32;
+ else if( cache == 64 )
+ cpu |= X264_CPU_CACHELINE_64;
+ else
+ fprintf( stderr, "x264 [warning]: unable to determine cacheline size\n" );
}
- if( cache == 32 )
- cpu |= X264_CPU_CACHELINE_32;
- if( cache == 64 )
- cpu |= X264_CPU_CACHELINE_64;
-
return cpu;
}
#define x264_stack_align(func,arg) func(arg)
#endif
-extern const struct {
- const char name[8];
+typedef struct {
+ const char name[12];
int flags;
-} x264_cpu_names[];
+} x264_cpu_name_t;
+extern const x264_cpu_name_t x264_cpu_names[];
#endif
if( cpu&X264_CPU_MMX )
{
dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
- dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
- dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
-
dctf->add4x4_idct = x264_add4x4_idct_mmx;
- dctf->add8x8_idct = x264_add8x8_idct_mmx;
- dctf->add16x16_idct = x264_add16x16_idct_mmx;
-
dctf->dct4x4dc = x264_dct4x4dc_mmx;
dctf->idct4x4dc = x264_idct4x4dc_mmx;
#ifndef ARCH_X86_64
+ dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
+ dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
+ dctf->add8x8_idct = x264_add8x8_idct_mmx;
+ dctf->add16x16_idct = x264_add16x16_idct_mmx;
+
dctf->sub8x8_dct8 = x264_sub8x8_dct8_mmx;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
-
dctf->add8x8_idct8 = x264_add8x8_idct8_mmx;
dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
#endif
dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
- }
- if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_3DNOW) )
- {
+
dctf->sub8x8_dct = x264_sub8x8_dct_sse2;
dctf->sub16x16_dct = x264_sub16x16_dct_sse2;
dctf->add8x8_idct = x264_add8x8_idct_sse2;
if( h->param.b_interlaced )
i_lines = ( i_lines + 31 ) & -32;
- if( h->param.cpu&X264_CPU_CACHELINE_SPLIT )
- {
- int align = h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 64;
- i_stride = (i_stride + align-1) & -align;
- }
+ if( h->param.cpu&X264_CPU_CACHELINE_64 )
+ i_stride = (i_stride + 63) & ~63;
+ else if( h->param.cpu&X264_CPU_CACHELINE_32 )
+ i_stride = (i_stride + 31) & ~31;
frame->i_plane = 3;
for( i = 0; i < 3; i++ )
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmxext;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmxext;
- if( cpu&X264_CPU_CACHELINE_SPLIT )
+ if( cpu&X264_CPU_CACHELINE_32 )
{
- if( cpu&X264_CPU_CACHELINE_32 )
- {
- INIT5( sad, _cache32_mmxext );
- INIT4( sad_x3, _cache32_mmxext );
- INIT4( sad_x4, _cache32_mmxext );
- }
- else
- {
- INIT5( sad, _cache64_mmxext );
- INIT4( sad_x3, _cache64_mmxext );
- INIT4( sad_x4, _cache64_mmxext );
- }
+ INIT5( sad, _cache32_mmxext );
+ INIT4( sad_x3, _cache32_mmxext );
+ INIT4( sad_x4, _cache32_mmxext );
+ }
+ else if( cpu&X264_CPU_CACHELINE_64 )
+ {
+ INIT5( sad, _cache64_mmxext );
+ INIT4( sad_x3, _cache64_mmxext );
+ INIT4( sad_x4, _cache64_mmxext );
}
#else
- if( cpu&X264_CPU_CACHELINE_SPLIT )
+ if( cpu&X264_CPU_CACHELINE_64 )
{
pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_cache64_mmxext;
pixf->sad[PIXEL_8x8] = x264_pixel_sad_8x8_cache64_mmxext;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmxext;
}
- // disable on AMD processors since it is slower
- if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_3DNOW) )
+ if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
{
INIT2( sad, _sse2 );
INIT2( sad_x3, _sse2 );
INIT2( sad_x4, _sse2 );
- INIT5( satd, _sse2 );
- INIT5( satd_x3, _sse2 );
- INIT5( satd_x4, _sse2 );
INIT_ADS( _sse2 );
#ifdef ARCH_X86
- if( cpu&X264_CPU_CACHELINE_SPLIT )
+ if( cpu&X264_CPU_CACHELINE_64 )
{
INIT2( sad, _cache64_sse2 );
INIT2( sad_x3, _cache64_sse2 );
}
#endif
}
- // these are faster on both Intel and AMD
if( cpu&X264_CPU_SSE2 )
{
INIT5( ssd, _sse2 );
+ INIT5( satd, _sse2 );
+ INIT5( satd_x3, _sse2 );
+ INIT5( satd_x4, _sse2 );
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2;
pixf->ssim_end4 = x264_pixel_ssim_end4_sse2;
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
#endif
}
- if( (cpu&X264_CPU_SSE3) && (cpu&X264_CPU_CACHELINE_SPLIT) )
+ if( (cpu&X264_CPU_SSE3) && (cpu&X264_CPU_CACHELINE_64) )
{
INIT2( sad, _sse3 );
INIT2( sad_x3, _sse3 );
#ifdef ARCH_X86_64
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
#endif
- if( cpu&X264_CPU_CACHELINE_SPLIT )
+ if( cpu&X264_CPU_CACHELINE_64 )
{
INIT2( sad, _cache64_ssse3 );
INIT2( sad_x3, _cache64_ssse3 );
INIT2( sad_x4, _cache64_ssse3 );
}
- }
-
- if( cpu&X264_CPU_SSE4 )
- {
- // enabled on Penryn, but slower on Conroe
- INIT5( satd, _ssse3_phadd );
- INIT5( satd_x3, _ssse3_phadd );
- INIT5( satd_x4, _ssse3_phadd );
+ if( cpu&X264_CPU_PHADD_IS_FAST )
+ {
+ INIT5( satd, _ssse3_phadd );
+ INIT5( satd_x3, _ssse3_phadd );
+ INIT5( satd_x4, _ssse3_phadd );
+ }
}
#endif //HAVE_MMX
jmp %2
%endmacro
+%ifndef ARCH_X86_64
SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx %+ .skip_prologue, 32, 4, 0, 0
ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx %+ .skip_prologue, 32, 4, 0, 0
-
SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx %+ .skip_prologue, 32, 8, 4, 4
ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx %+ .skip_prologue, 32, 8, 4, 4
-SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2 %+ .skip_prologue, 64, 8, 0, 4
-ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2 %+ .skip_prologue, 64, 8, 0, 4
-
-%ifndef ARCH_X86_64
cextern x264_sub8x8_dct8_mmx.skip_prologue
cextern x264_add8x8_idct8_mmx.skip_prologue
SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx %+ .skip_prologue, 128, 8, 0, 0
%define x264_add8x8_idct8_sse2 x264_add8x8_idct8_sse2.skip_prologue
%endif
+SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2 %+ .skip_prologue, 64, 8, 0, 4
+ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2 %+ .skip_prologue, 64, 8, 0, 4
+
cextern x264_sub8x8_dct8_sse2
cextern x264_add8x8_idct8_sse2
SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2, 128, 8, 0, 0
jg .height_loop
REP_RET
-%macro PIXEL_AVG_SSE 1
-cglobal x264_pixel_avg2_w16_%1, 6,7
+cglobal x264_pixel_avg2_w16_sse2, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
jg .height_loop
REP_RET
-cglobal x264_pixel_avg2_w20_%1, 6,7
+cglobal x264_pixel_avg2_w20_sse2, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
sub r5d, 2
jg .height_loop
REP_RET
-%endmacro
-
-PIXEL_AVG_SSE sse2
-%define movdqu lddqu
-PIXEL_AVG_SSE sse3
-%undef movdqu
; Cacheline split code for processors with high latencies for loads
; split over cache lines. See sad-a.asm for a more detailed explanation.
PIXEL_AVG_WALL(cache64_mmxext)
PIXEL_AVG_WALL(cache64_sse2)
PIXEL_AVG_WALL(sse2)
-PIXEL_AVG_WALL(sse3)
#define AVG_WEIGHT(W,H) \
void x264_pixel_avg_weight_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst ) \
PIXEL_AVG_WTAB(cache64_mmxext, mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext)
PIXEL_AVG_WTAB(sse2, mmxext, mmxext, mmxext, sse2, sse2)
PIXEL_AVG_WTAB(cache64_sse2, mmxext, cache64_mmxext, cache64_sse2, cache64_sse2, cache64_sse2)
-PIXEL_AVG_WTAB(cache64_sse3, mmxext, cache64_mmxext, sse3, sse3, sse3)
#define MC_COPY_WTAB(instr, name1, name2, name3)\
static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, int, int ) =\
MC_COPY_WTAB(mmx,mmx,mmx,mmx)
MC_COPY_WTAB(sse2,mmx,mmx,sse2)
-MC_COPY_WTAB(sse3,mmx,mmx,sse3)
static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
#endif
MC_LUMA(sse2,sse2,sse2)
MC_LUMA(cache64_sse2,cache64_sse2,sse2)
-MC_LUMA(cache64_sse3,cache64_sse3,sse3)
#define GET_REF(name)\
uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\
#endif
GET_REF(sse2)
GET_REF(cache64_sse2)
-GET_REF(cache64_sse3)
#define HPEL(align, cpu, cpuv, cpuc, cpuh)\
void x264_hpel_filter_v_##cpuv( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width);\
pf->mc_luma = mc_luma_cache32_mmxext;
pf->get_ref = get_ref_cache32_mmxext;
}
- else if( cpu&X264_CPU_CACHELINE_SPLIT )
+ else if( cpu&X264_CPU_CACHELINE_64 )
{
pf->mc_luma = mc_luma_cache64_mmxext;
pf->get_ref = get_ref_cache64_mmxext;
pf->memzero_aligned = x264_memzero_aligned_sse2;
pf->hpel_filter = x264_hpel_filter_sse2_amd;
- // disable on AMD processors since it is slower
- if( cpu&X264_CPU_3DNOW )
+ if( cpu&X264_CPU_SSE2_IS_SLOW )
return;
- pf->mc_luma = mc_luma_sse2;
- pf->get_ref = get_ref_sse2;
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
pf->hpel_filter = x264_hpel_filter_sse2;
- if( cpu&X264_CPU_CACHELINE_SPLIT )
+ if( cpu&X264_CPU_SSE2_IS_FAST )
{
- pf->mc_luma = mc_luma_cache64_sse2;
- pf->get_ref = get_ref_cache64_sse2;
- /* lddqu doesn't work on Core2 */
- if( (cpu&X264_CPU_SSE3) && !(cpu&X264_CPU_SSSE3) )
+ pf->mc_luma = mc_luma_sse2;
+ pf->get_ref = get_ref_sse2;
+ if( cpu&X264_CPU_CACHELINE_64 )
{
- pf->mc_luma = mc_luma_cache64_sse3;
- pf->get_ref = get_ref_cache64_sse3;
+ pf->mc_luma = mc_luma_cache64_sse2;
+ pf->get_ref = get_ref_cache64_sse2;
}
}
pf[I_PRED_16x16_DC] = predict_16x16_dc_mmxext;
pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_mmxext;
pf[I_PRED_16x16_P] = predict_16x16_p_mmxext;
- if( !(cpu&X264_CPU_SSE2) || (cpu&X264_CPU_3DNOW) )
+ if( !(cpu&X264_CPU_SSE2) )
return;
pf[I_PRED_16x16_DC] = predict_16x16_dc_sse2;
- pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_sse2;
pf[I_PRED_16x16_V] = predict_16x16_v_sse2;
+ if( cpu&X264_CPU_SSE2_IS_SLOW )
+ return;
+ pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_sse2;
pf[I_PRED_16x16_P] = predict_16x16_p_sse2;
}
p = buf + sprintf( buf, "using cpu capabilities:" );
for( i=0; x264_cpu_names[i].flags; i++ )
+ {
+ if( !strcmp(x264_cpu_names[i].name, "SSE2")
+ && param->cpu & (X264_CPU_SSE2_IS_FAST|X264_CPU_SSE2_IS_SLOW) )
+ continue;
+ if( !strcmp(x264_cpu_names[i].name, "SSE3")
+ && (param->cpu & X264_CPU_SSSE3 || !(param->cpu & X264_CPU_CACHELINE_64)) )
+ continue;
if( (param->cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags
&& (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) )
p += sprintf( p, " %s", x264_cpu_names[i].name );
+ }
if( !param->cpu )
p += sprintf( p, " none!" );
x264_log( h, X264_LOG_INFO, "%s\n", buf );
for( k=0; k<j && benchs[i].vers[k].pointer != b->pointer; k++ );
if( k<j ) continue;
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
- b->cpu&X264_CPU_SSE4 ? "sse4" :
+ b->cpu&X264_CPU_PHADD_IS_FAST ? "phadd" :
b->cpu&X264_CPU_SSSE3 ? "ssse3" :
b->cpu&X264_CPU_SSE3 ? "sse3" :
+ /* print sse2slow only if there's also a sse2fast version of the same func */
+ b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
b->cpu&X264_CPU_SSE2 ? "sse2" :
b->cpu&X264_CPU_MMX ? "mmx" : "c",
b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
{
*cpu_ref = *cpu_new;
*cpu_new |= flags;
+ if( *cpu_new & X264_CPU_SSE2_IS_FAST )
+ *cpu_new &= ~X264_CPU_SSE2_IS_SLOW;
if( !quiet )
fprintf( stderr, "x264: %s\n", name );
return check_all_funcs( *cpu_ref, *cpu_new );
#ifdef HAVE_MMX
if( x264_cpu_detect() & X264_CPU_MMXEXT )
{
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_MMX | X264_CPU_MMXEXT, "MMXEXT" );
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "MMXEXT Cache64" );
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_MMX | X264_CPU_MMXEXT, "MMX" );
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "MMX Cache64" );
cpu1 &= ~X264_CPU_CACHELINE_64;
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_32, "MMXEXT Cache32" );
+#ifdef ARCH_X86
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_32, "MMX Cache32" );
+ cpu1 &= ~X264_CPU_CACHELINE_32;
+#endif
}
if( x264_cpu_detect() & X264_CPU_SSE2 )
{
- cpu1 &= ~(X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_32);
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE | X264_CPU_SSE2, "SSE2" );
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "SSE2 Cache64" );
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE | X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" );
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" );
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
}
if( x264_cpu_detect() & X264_CPU_SSE3 )
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3, "SSE3" );
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" );
if( x264_cpu_detect() & X264_CPU_SSSE3 )
{
- cpu1 &= ~(X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64);
+ cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
- }
- if( x264_cpu_detect() & X264_CPU_SSSE3 )
- {
- cpu1 &= ~(X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64);
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_PHADD_IS_FAST, "PHADD" );
}
#elif ARCH_PPC
if( x264_cpu_detect() & X264_CPU_ALTIVEC )
#include <stdarg.h>
-#define X264_BUILD 59
+#define X264_BUILD 60
/* x264_t:
* opaque handler for encoder */
****************************************************************************/
/* CPU flags
*/
-#define X264_CPU_MMX 0x000001 /* mmx */
-#define X264_CPU_MMXEXT 0x000002 /* mmx-ext*/
-#define X264_CPU_SSE 0x000004 /* sse */
-#define X264_CPU_SSE2 0x000008 /* sse 2 */
-#define X264_CPU_3DNOW 0x000010 /* 3dnow! */
-#define X264_CPU_3DNOWEXT 0x000020 /* 3dnow! ext */
-#define X264_CPU_ALTIVEC 0x000040 /* altivec */
-#define X264_CPU_SSE3 0x000080 /* sse 3 */
-#define X264_CPU_SSSE3 0x000100 /* ssse 3 */
-#define X264_CPU_CACHELINE_SPLIT 0x200 /* avoid memory loads that span the border between two cachelines */
-#define X264_CPU_CACHELINE_32 0x0400 /* size of a cacheline in bytes */
-#define X264_CPU_CACHELINE_64 0x0800
-#define X264_CPU_SSE4 0x001000 /* sse 4.1 */
+#define X264_CPU_CACHELINE_32 0x000001 /* avoid memory loads that span the border between two cachelines */
+#define X264_CPU_CACHELINE_64 0x000002 /* 32/64 is the size of a cacheline in bytes */
+#define X264_CPU_ALTIVEC 0x000004
+#define X264_CPU_MMX 0x000008
+#define X264_CPU_MMXEXT 0x000010 /* MMX2 aka MMXEXT aka ISSE */
+#define X264_CPU_SSE 0x000020
+#define X264_CPU_SSE2 0x000040
+#define X264_CPU_SSE2_IS_SLOW 0x000080 /* avoid most SSE2 functions on Athlon64 */
+#define X264_CPU_SSE2_IS_FAST 0x000100 /* a few functions are only faster on Core2 and Phenom */
+#define X264_CPU_SSE3 0x000200
+#define X264_CPU_SSSE3 0x000400
+#define X264_CPU_PHADD_IS_FAST 0x000800 /* pre-Penryn Core2 have a uselessly slow PHADD instruction */
+#define X264_CPU_SSE4 0x001000 /* SSE4.1 */
/* Analyse flags
*/
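
as a usage note, an application that wants to pin the encoder to a specific combination of the new flags (e.g. to reproduce a benchmark) can overwrite the autodetected value after x264_param_default(); a minimal sketch, assuming the usual public API where x264_param_t.cpu holds exactly these flag bits:

    #include "x264.h"

    void force_conroe_flags( x264_param_t *param )
    {
        x264_param_default( param );    /* normally fills param->cpu from cpu detection */
        /* pretend we're on a pre-Penryn Core2: fast sse2 and ssse3, but no fast phadd */
        param->cpu = X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|
                     X264_CPU_SSE2_IS_FAST|X264_CPU_SSE3|X264_CPU_SSSE3|
                     X264_CPU_CACHELINE_64;
    }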