Replace PHADD with FastShuffle (more accurate naming).
This flag marks asm functions that rely on fast SSE2 shuffle units, and which are therefore only faster on Phenom, Nehalem, and Penryn CPUs.
{"SSE2Fast",X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE2_IS_FAST},
{"SSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3},
{"SSSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
- {"PHADD", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_PHADD_IS_FAST},
+ {"FastShuffle", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SHUFFLE_IS_FAST},
{"SSE4.1", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"SSE4.2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
{"Cache32", X264_CPU_CACHELINE_32},
if( cpu & X264_CPU_SSSE3 )
cpu |= X264_CPU_SSE2_IS_FAST;
if( cpu & X264_CPU_SSE4 )
- cpu |= X264_CPU_PHADD_IS_FAST;
+ cpu |= X264_CPU_SHUFFLE_IS_FAST;
x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
max_extended_cap = eax;
cpu |= X264_CPU_SSE2_IS_FAST;
cpu |= X264_CPU_SSE_MISALIGN;
cpu |= X264_CPU_LZCNT;
+ cpu |= X264_CPU_SHUFFLE_IS_FAST;
x264_cpu_mask_misalign_sse();
}
else
{
pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
+ if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+ pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
}
- if( cpu&X264_CPU_PHADD_IS_FAST )
- pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
#endif
#ifdef ARCH_PPC
#ifdef HAVE_MMX
if( cpu&X264_CPU_MMX )
pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
+ if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+ pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
#endif
}
INIT2( sad_x3, _cache64_ssse3 );
INIT2( sad_x4, _cache64_ssse3 );
}
- if( !(cpu&X264_CPU_PHADD_IS_FAST) )
+ if( !(cpu&X264_CPU_SHUFFLE_IS_FAST) )
{
INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */
}
pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
-pb_1: times 8 db 1
+pb_1: times 16 db 1
SECTION .text
shr r0d, 16
mov [r2+8], r0w
RET
+
+; Interleave one half of an 8x8 coefficient block for CAVLC ordering.
+; %1 = byte offset into the source/destination (0 or 16).
+; Loads four 16-bit coefficient rows, word-interleaves them with two
+; rounds of SBUTTERFLY, and scatters the results into the destination
+; via movq/movhps. Also ORs the loaded data into m2/m3 so the caller
+; can detect which sub-blocks contain nonzero coefficients.
+%macro INTERLEAVE_XMM 1
+ mova m0, [r1+%1*4+ 0]
+ mova m1, [r1+%1*4+16]
+ mova m4, [r1+%1*4+32]
+ mova m5, [r1+%1*4+48]
+ SBUTTERFLY wd, 0, 1, 6
+ SBUTTERFLY wd, 4, 5, 7
+ SBUTTERFLY wd, 0, 1, 6
+ SBUTTERFLY wd, 4, 5, 7
+ movq [r0+%1+ 0], m0
+ movhps [r0+%1+ 32], m0
+ movq [r0+%1+ 64], m1
+ movhps [r0+%1+ 96], m1
+ movq [r0+%1+ 8], m4
+ movhps [r0+%1+ 40], m4
+ movq [r0+%1+ 72], m5
+ movhps [r0+%1+104], m5
+%if %1
+; Second invocation: accumulate into the existing nonzero masks.
+ por m2, m0
+ por m3, m1
+ por m2, m4
+ por m3, m5
+%else
+; First invocation: initialize the masks from m0/m1 (SWAP avoids a mov),
+; then fold in m4/m5.
+ SWAP 0,2
+ SWAP 3,1
+ por m2, m4
+ por m3, m5
+%endif
+%endmacro
+
+; void x264_zigzag_interleave_8x8_cavlc_sse2( int16_t *dst, int16_t *src, uint8_t *nnz )
+; r0 = dst, r1 = src, r2 = nnz. Interleaves an 8x8 block's coefficients
+; for CAVLC and writes per-sub-block nonzero flags to nnz.
+INIT_XMM
+cglobal x264_zigzag_interleave_8x8_cavlc_sse2, 3,3,8
+ INTERLEAVE_XMM 0
+ INTERLEAVE_XMM 16
+; m2/m3 now hold ORed coefficient data; narrow them to one byte per
+; sub-block, so each low byte is zero iff that sub-block is all zero.
+ packsswb m2, m3
+ pxor m5, m5
+ packsswb m2, m2
+ packsswb m2, m2
+; pcmpeqb yields 0xFF for all-zero sub-blocks; adding pb_1 maps that to
+; 0 (empty) and everything else to 1 (has coefficients).
+ pcmpeqb m5, m2
+ paddb m5, [pb_1 GLOBAL]
+ movd r0d, m5
+; Store the flag words at nnz[0] and nnz[8] — presumably matching the
+; nnz cache layout used by the mmx version; confirm against caller.
+ mov [r2+0], r0w
+ shr r0d, 16
+ mov [r2+8], r0w
+ RET
+
void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] );
void x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz );
+void x264_zigzag_interleave_8x8_cavlc_sse2( int16_t *dst, int16_t *src, uint8_t *nnz );
#endif
if( k<j ) continue;
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
b->cpu&X264_CPU_SSE4 ? "sse4" :
- b->cpu&X264_CPU_PHADD_IS_FAST ? "phadd" :
+ b->cpu&X264_CPU_SHUFFLE_IS_FAST ? "fastshuffle" :
b->cpu&X264_CPU_SSSE3 ? "ssse3" :
b->cpu&X264_CPU_SSE3 ? "sse3" :
/* print sse2slow only if there's also a sse2fast version of the same func */
for( i = 0; i < 12; i++ )
INTRA_TEST( predict_8x8, i, 8, edge );
- used_asm = 1;
set_func_name("intra_predict_8x8_filter");
if( ip_a.predict_8x8_filter != ip_ref.predict_8x8_filter )
{
+ used_asm = 1;
for( i = 0; i < 32; i++ )
{
memcpy( edge2, edge, 33 );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE | X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSE2 FastShuffle" );
+ cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
}
if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN )
{
cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_PHADD_IS_FAST, "PHADD" );
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSSE3 FastShuffle" );
+ cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
}
if( x264_cpu_detect() & X264_CPU_SSE4 )
{
#define X264_CPU_SSE2_IS_FAST 0x000100 /* a few functions are only faster on Core2 and Phenom */
#define X264_CPU_SSE3 0x000200
#define X264_CPU_SSSE3 0x000400
-#define X264_CPU_PHADD_IS_FAST 0x000800 /* pre-Penryn Core2 have a uselessly slow PHADD instruction */
+#define X264_CPU_SHUFFLE_IS_FAST 0x000800 /* Penryn, Nehalem, and Phenom have fast shuffle units */
#define X264_CPU_STACK_MOD4 0x001000 /* if stack is only mod4 and not mod16 */
#define X264_CPU_SSE4 0x002000 /* SSE4.1 */
#define X264_CPU_SSE42 0x004000 /* SSE4.2 */