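Replaces the MMX2 version of zigzag_scan_4x4_field with an SSE version that is one cycle faster.
Also change the checkasm test to declare its level buffers with the correct alignment macro (ALIGNED_ARRAY_16 instead of ALIGNED_16).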
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
if( cpu&X264_CPU_MMX2 )
{
- pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_mmx2;
pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_mmx2;
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmx2;
}
+ if( cpu&X264_CPU_SSE )
+ pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_sse;
if( cpu&X264_CPU_SSE2_IS_FAST )
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
if( cpu&X264_CPU_SSSE3 )
; void zigzag_scan_4x4_field( int32_t level[16], int32_t dct[4][4] )
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal zigzag_scan_4x4_field, 2,3
- movu m4, [r1+ 8]
- pshufd m0, m4, q3102
+cglobal zigzag_scan_4x4_field, 2,2
+ movu m0, [r1+ 8]
+ pshufd m0, m0, q3102
mova m1, [r1+32]
mova m2, [r1+48]
movu [r0+ 8], m0
;-----------------------------------------------------------------------------
; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
-; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
-INIT_MMX mmx2
-cglobal zigzag_scan_4x4_field, 2,3
- pshufw m0, [r1+4], q3102
- mova m1, [r1+16]
- mova m2, [r1+24]
- movu [r0+4], m0
- mova [r0+16], m1
- mova [r0+24], m2
- mov r2d, [r1]
- mov [r0], r2d
- mov r2d, [r1+12]
- mov [r0+12], r2d
+INIT_XMM sse ; m0/m1 below are SIMD regs selected by this macro (presumably xmm0/xmm1 — confirm in x86inc); mm0 is named explicitly as an MMX reg
+cglobal zigzag_scan_4x4_field, 2,2 ; r0 = level (dst), r1 = dct (src) per the prototype comment above; no scratch GPR needed (the removed MMX2 version used r2d)
+ mova m0, [r1] ; load coefficients from dct+0 (mova presumably requires 16-byte alignment — confirm in x86inc)
+ mova m1, [r1+16] ; load coefficients from dct+16
+ pshufw mm0, [r1+4], q3102 ; read the four 16-bit words at dct+4 and permute them in the order selected by q3102
+ mova [r0], m0 ; first half is copied through unchanged...
+ mova [r0+16], m1 ; ...as is the second half
+ movq [r0+4], mm0 ; patch bytes 4..11 of level with the permuted words; must come after the store to [r0], which it overlaps
RET
%endif ; HIGH_BIT_DEPTH
void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] );
-void x264_zigzag_scan_4x4_field_mmx2 ( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_4x4_field_sse ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_8x8_field_xop ( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_field_avx ( int32_t level[64], int32_t dct[64] );
void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] );
x264_zigzag_function_t zigzag_ref[2];
x264_zigzag_function_t zigzag_asm[2];
- ALIGNED_16( dctcoef level1[64] );
- ALIGNED_16( dctcoef level2[64] );
+ ALIGNED_ARRAY_16( dctcoef, level1,[64] );
+ ALIGNED_ARRAY_16( dctcoef, level2,[64] );
#define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \
if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \