Do satd 4x8 by transposing the positions of its two 4x4 sub-blocks and running satd 8x4 (sketched in C below).
Use pinsrd (SSE4) for faster width-4 SSD (intrinsics sketch below).
Globally replace movlhps with punpcklqdq (it seems to be faster on Conroe).
Move mask_misalign declaration to cpu.h to avoid warning in encoder.c.
These optimizations help on Nehalem, Phenom, and Penryn CPUs.
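
A rough scalar illustration of the 4x8 SATD change, in plain C rather than the patched assembly: x264's SATD for sizes above 4x4 is a sum of independent 4x4 Hadamard transforms, so a 4x8 block can be rearranged into an 8x4 one and fed to the 8x4 kernel. The helper names below are invented for the sketch, the transform is unnormalized, and the real code packs the halves in registers with punpcklqdq instead of going through a temporary buffer.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Unnormalized 4x4 SATD: 2D Hadamard transform of the difference block,
 * then the sum of absolute transform coefficients. */
static int satd_4x4( const uint8_t *p1, int s1, const uint8_t *p2, int s2 )
{
    int d[4][4], t[4][4], sum = 0;
    for( int y = 0; y < 4; y++ )
        for( int x = 0; x < 4; x++ )
            d[y][x] = p1[y*s1+x] - p2[y*s2+x];
    for( int y = 0; y < 4; y++ )
    {   /* horizontal 4-point Hadamard */
        int a = d[y][0]+d[y][1], b = d[y][0]-d[y][1];
        int c = d[y][2]+d[y][3], e = d[y][2]-d[y][3];
        t[y][0] = a+c; t[y][1] = a-c; t[y][2] = b+e; t[y][3] = b-e;
    }
    for( int x = 0; x < 4; x++ )
    {   /* vertical 4-point Hadamard, accumulating |coeff| */
        int a = t[0][x]+t[1][x], b = t[0][x]-t[1][x];
        int c = t[2][x]+t[3][x], e = t[2][x]-t[3][x];
        sum += abs(a+c) + abs(a-c) + abs(b+e) + abs(b-e);
    }
    return sum;
}

/* 8x4 SATD is the two side-by-side 4x4 SATDs. */
static int satd_8x4( const uint8_t *p1, int s1, const uint8_t *p2, int s2 )
{
    return satd_4x4( p1, s1, p2, s2 ) + satd_4x4( p1+4, s1, p2+4, s2 );
}

/* 4x8 SATD via the 8x4 kernel: copy the lower 4x4 half of each block to the
 * right of its upper half; the per-4x4 sums are unchanged by the move. */
static int satd_4x8_via_8x4( const uint8_t *p1, int s1, const uint8_t *p2, int s2 )
{
    uint8_t b1[8*4], b2[8*4];
    for( int y = 0; y < 4; y++ )
    {
        memcpy( b1 + 8*y,     p1 + y*s1,     4 );
        memcpy( b1 + 8*y + 4, p1 + (y+4)*s1, 4 );
        memcpy( b2 + 8*y,     p2 + y*s2,     4 );
        memcpy( b2 + 8*y + 4, p2 + (y+4)*s2, 4 );
    }
    return satd_8x4( b1, 8, b2, 8 );
}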
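
The pinsrd width-4 SSD idea (the SSD_QUARTER macro in the diff), rendered as an SSE4.1 intrinsics sketch of the 4x4 case: pinsrd packs two 4-pixel rows into one XMM register, so a single pmaddwd squares and pairwise-sums two rows at once. Function and helper names here are invented; the real code keeps the running sum in a register across quarters rather than reducing immediately.

#include <stdint.h>
#include <string.h>
#include <smmintrin.h>  /* SSE4.1 */

static inline uint32_t load32( const uint8_t *p )
{
    uint32_t v;
    memcpy( &v, p, 4 );
    return v;
}

static int ssd_4x4_sse4( const uint8_t *p1, int s1, const uint8_t *p2, int s2 )
{
    __m128i zero = _mm_setzero_si128();
    /* rows 0 and 2 of each block share a register, as do rows 1 and 3 */
    __m128i a02 = _mm_insert_epi32( _mm_cvtsi32_si128( (int)load32( p1      ) ), (int)load32( p1 + 2*s1 ), 1 );
    __m128i b02 = _mm_insert_epi32( _mm_cvtsi32_si128( (int)load32( p2      ) ), (int)load32( p2 + 2*s2 ), 1 );
    __m128i a13 = _mm_insert_epi32( _mm_cvtsi32_si128( (int)load32( p1 + s1 ) ), (int)load32( p1 + 3*s1 ), 1 );
    __m128i b13 = _mm_insert_epi32( _mm_cvtsi32_si128( (int)load32( p2 + s2 ) ), (int)load32( p2 + 3*s2 ), 1 );
    /* widen to 16-bit, subtract, then square-and-sum pairs with pmaddwd */
    __m128i d0 = _mm_sub_epi16( _mm_unpacklo_epi8( a02, zero ), _mm_unpacklo_epi8( b02, zero ) );
    __m128i d1 = _mm_sub_epi16( _mm_unpacklo_epi8( a13, zero ), _mm_unpacklo_epi8( b13, zero ) );
    __m128i sum = _mm_add_epi32( _mm_madd_epi16( d0, d0 ), _mm_madd_epi16( d1, d1 ) );
    /* horizontal reduction of the four 32-bit partial sums */
    sum = _mm_add_epi32( sum, _mm_srli_si128( sum, 8 ) );
    sum = _mm_add_epi32( sum, _mm_srli_si128( sum, 4 ) );
    return _mm_cvtsi128_si32( sum );
}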
#ifdef HAVE_MMX
extern int x264_cpu_cpuid_test( void );
-extern void x264_cpu_mask_misalign_sse( void );
extern uint32_t x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
uint32_t x264_cpu_detect( void )
uint32_t x264_cpu_detect( void );
int x264_cpu_num_processors( void );
void x264_emms( void );
+void x264_cpu_mask_misalign_sse( void );
/* kluge:
* gcc can't give variables any greater alignment than the stack frame has.
scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\
scores[3] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix3, i_stride );\
}
-#define SATD_X_DECL5( cpu )\
+#define SATD_X_DECL6( cpu )\
SATD_X( 16x16, cpu )\
SATD_X( 16x8, cpu )\
SATD_X( 8x16, cpu )\
SATD_X( 8x8, cpu )\
-SATD_X( 8x4, cpu )
+SATD_X( 8x4, cpu )\
+SATD_X( 4x8, cpu )
#define SATD_X_DECL7( cpu )\
-SATD_X_DECL5( cpu )\
-SATD_X( 4x8, cpu )\
+SATD_X_DECL6( cpu )\
SATD_X( 4x4, cpu )
SATD_X_DECL7()
#ifdef HAVE_MMX
SATD_X_DECL7( _mmxext )
-SATD_X_DECL5( _sse2 )
+SATD_X_DECL6( _sse2 )
SATD_X_DECL7( _ssse3 )
-SATD_X_DECL5( _ssse3_phadd )
+SATD_X_DECL6( _ssse3_phadd )
#endif
/****************************************************************************
#define INIT5_NAME( name1, name2, cpu ) \
INIT4_NAME( name1, name2, cpu ) \
pixf->name1[PIXEL_8x4] = x264_pixel_##name2##_8x4##cpu;
-#define INIT7_NAME( name1, name2, cpu ) \
+#define INIT6_NAME( name1, name2, cpu ) \
INIT5_NAME( name1, name2, cpu ) \
- pixf->name1[PIXEL_4x8] = x264_pixel_##name2##_4x8##cpu;\
+ pixf->name1[PIXEL_4x8] = x264_pixel_##name2##_4x8##cpu;
+#define INIT7_NAME( name1, name2, cpu ) \
+ INIT6_NAME( name1, name2, cpu ) \
pixf->name1[PIXEL_4x4] = x264_pixel_##name2##_4x4##cpu;
#define INIT2( name, cpu ) INIT2_NAME( name, name, cpu )
#define INIT4( name, cpu ) INIT4_NAME( name, name, cpu )
#define INIT5( name, cpu ) INIT5_NAME( name, name, cpu )
+#define INIT6( name, cpu ) INIT6_NAME( name, name, cpu )
#define INIT7( name, cpu ) INIT7_NAME( name, name, cpu )
#define INIT_ADS( cpu ) \
if( cpu&X264_CPU_SSE2 )
{
INIT5( ssd, _sse2 );
- INIT5( satd, _sse2 );
- INIT5( satd_x3, _sse2 );
- INIT5( satd_x4, _sse2 );
+ if( cpu&X264_CPU_SSE2_IS_FAST )
+ {
+ INIT6( satd, _sse2 );
+ INIT6( satd_x3, _sse2 );
+ INIT6( satd_x4, _sse2 );
+ }
+ else
+ {
+ INIT5( satd, _sse2 );
+ INIT5( satd_x3, _sse2 );
+ INIT5( satd_x4, _sse2 );
+ }
INIT2_NAME( sad_aligned, sad, _sse2_aligned );
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2;
}
if( cpu&X264_CPU_PHADD_IS_FAST )
{
- INIT5( satd, _ssse3_phadd );
- INIT5( satd_x3, _ssse3_phadd );
- INIT5( satd_x4, _ssse3_phadd );
+ INIT6( satd, _ssse3_phadd );
+ INIT6( satd_x3, _ssse3_phadd );
+ INIT6( satd_x4, _ssse3_phadd );
}
}
+
+ if( cpu&X264_CPU_SSE4 )
+ {
+ pixf->ssd[PIXEL_4x8] = x264_pixel_ssd_4x8_sse4;
+ pixf->ssd[PIXEL_4x4] = x264_pixel_ssd_4x4_sse4;
+ }
#endif //HAVE_MMX
#ifdef ARCH_PPC
movdq2q mm7, xmm7
%else
movhlps xmm3, xmm7
- movlhps xmm7, xmm7
+ punpcklqdq xmm7, xmm7
movdq2q mm7, xmm3
%endif
punpckldq xmm2, xmm3
punpckldq xmm4, xmm5
punpckldq xmm6, xmm7
- movlhps xmm0, xmm2
- movlhps xmm4, xmm6
+ punpcklqdq xmm0, xmm2
+ punpcklqdq xmm4, xmm6
movdqa xmm7, [pb_sub4frame GLOBAL]
pshufb xmm0, xmm7
pshufb xmm4, xmm7
%macro SPLATW 1
%ifidn m0, xmm0
pshuflw %1, %1, 0
- movlhps %1, %1
+ punpcklqdq %1, %1
%else
pshufw %1, %1, 0
%endif
%macro SPLATW 2
%if mmsize==16
pshuflw %1, %2, 0
- movlhps %1, %1
+ punpcklqdq %1, %1
%else
pshufw %1, %2, 0
%endif
paddd m0, m3
%endmacro
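+; SSD of one 4x4 quarter. %1/%3 are offsets applied to the first block's
+; pointer (r0) and %2/%4 to the second's (r2); pinsrd packs two rows into
+; each register so one pmaddwd squares and sums two rows at once. %5 chooses
+; between accumulating into m0 and starting a new sum; %6 advances the
+; pointers past this quarter for a following call. m7 must be zero for the
+; byte-to-word unpacks.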
+%macro SSD_QUARTER 6
+ movd m1, [r0+%1]
+ movd m2, [r2+%2]
+ movd m3, [r0+%3]
+ movd m4, [r2+%4]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ pinsrd m1, [r0+%1], 1
+ pinsrd m2, [r2+%2], 1
+ pinsrd m3, [r0+%3], 1
+ pinsrd m4, [r2+%4], 1
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ punpcklbw m4, m7
+ psubw m1, m2
+ psubw m3, m4
+ pmaddwd m1, m1
+ pmaddwd m3, m3
+
+%if %6
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+%endif
+%if %5
+ paddd m0, m1
+%else
+ SWAP m0, m1
+%endif
+ paddd m0, m3
+%endmacro
+
;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SSD 3
cglobal x264_pixel_ssd_%1x%2_%3, 4,4
+%if %1 >= mmsize
pxor m7, m7
+%endif
%assign i 0
%rep %2/2
%if %1 > mmsize
SSD 8, 8, sse2
SSD 8, 4, sse2
+cglobal x264_pixel_ssd_4x8_sse4, 4,4
+ SSD_QUARTER 0, 0, r1, r3, 0, 1
+ SSD_QUARTER 0, 0, r1, r3, 1, 0
+ HADDD m0, m1
+ movd eax, m0
+ RET
+
+cglobal x264_pixel_ssd_4x4_sse4, 4,4
+ SSD_QUARTER 0, 0, r1, r3, 0, 0
+ HADDD m0, m1
+ movd eax, m0
+ RET
+
;=============================================================================
; variance
HADAMARD4x4_SUM %1
%endmacro
-%macro SATD_8x4_SSE2 2
- LOAD_DIFF_8x4P m0, m1, m2, m3, m4, m5
-%if %1
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
-%endif
+%macro SATD_8x4_SSE2 1
HADAMARD4_1D m0, m1, m2, m3
-%ifidn %2, ssse3_phadd
+%ifidn %1, ssse3_phadd
HADAMARD4_ROW_PHADD 0, 1, 2, 3, 4
%else
TRANSPOSE2x4x4W 0, 1, 2, 3, 4
call x264_pixel_satd_8x4_internal_mmxext
SATD_END_MMX
-%macro SATD_W4 1
-INIT_MMX
-cglobal x264_pixel_satd_4x8_%1, 4,6
+cglobal x264_pixel_satd_4x8_mmxext, 4,6
SATD_START_MMX
SATD_4x4_MMX m0, 0, 1
SATD_4x4_MMX m1, 0, 0
paddw m0, m1
SATD_END_MMX
+%macro SATD_W4 1
+INIT_MMX
cglobal x264_pixel_satd_4x4_%1, 4,6
SATD_START_MMX
SATD_4x4_MMX m0, 0, 0
%macro SATDS_SSE2 1
INIT_XMM
cglobal x264_pixel_satd_8x8_internal_%1
- SATD_8x4_SSE2 1, %1
+ LOAD_DIFF_8x4P m0, m1, m2, m3, m4, m5
+ SATD_8x4_SSE2 %1
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
x264_pixel_satd_8x4_internal_%1:
- SATD_8x4_SSE2 0, %1
+ LOAD_DIFF_8x4P m0, m1, m2, m3, m4, m5
+x264_pixel_satd_4x8_internal_%1:
+ SAVE_MM_PERMUTATION satd_4x8_internal
+ SATD_8x4_SSE2 %1
ret
cglobal x264_pixel_satd_16x16_%1, 4,6
call x264_pixel_satd_8x4_internal_%1
SATD_END_SSE2
+cglobal x264_pixel_satd_4x8_%1, 4,6
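+ ; 4x8: load eight 4-wide difference rows (movh is temporarily movd), then
+ ; pair rows of the upper and lower 4x4 halves with punpcklqdq so the 8x4
+ ; transform processes both halves of the block at once.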
+ INIT_XMM
+ LOAD_MM_PERMUTATION satd_4x8_internal
+ %define movh movd
+ SATD_START_SSE2
+ LOAD_DIFF m0, m7, m6, [r0], [r2]
+ LOAD_DIFF m1, m7, m6, [r0+r1], [r2+r3]
+ LOAD_DIFF m2, m7, m6, [r0+2*r1], [r2+2*r3]
+ LOAD_DIFF m3, m7, m6, [r0+r4], [r2+r5]
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ LOAD_DIFF m4, m7, m6, [r0], [r2]
+ LOAD_DIFF m5, m7, m6, [r0+r1], [r2+r3]
+ punpcklqdq m0, m4
+ punpcklqdq m1, m5
+ LOAD_DIFF m4, m7, m6, [r0+2*r1], [r2+2*r3]
+ LOAD_DIFF m5, m7, m6, [r0+r4], [r2+r5]
+ punpcklqdq m2, m4
+ punpcklqdq m3, m5
+ %define movh movq
+ call x264_pixel_satd_4x8_internal_%1
+ SATD_END_SSE2
+
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
DECL_X4( sad, sse3 )
DECL_X1( ssd, mmx )
DECL_X1( ssd, sse2 )
+DECL_X1( ssd, sse4 )
DECL_X1( satd, mmxext )
DECL_X1( satd, sse2 )
DECL_X1( satd, ssse3 )
punpcklbw %1, %1
%if mmsize==16
pshuflw %1, %1, 0xff
- movlhps %1, %1
+ punpcklqdq %1, %1
%else
pshufw %1, %1, 0xff
%endif
for( k=0; k<j && benchs[i].vers[k].pointer != b->pointer; k++ );
if( k<j ) continue;
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
+ b->cpu&X264_CPU_SSE4 ? "sse4" :
b->cpu&X264_CPU_PHADD_IS_FAST ? "phadd" :
b->cpu&X264_CPU_SSSE3 ? "ssse3" :
b->cpu&X264_CPU_SSE3 ? "sse3" :
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_PHADD_IS_FAST, "PHADD" );
}
+ if( x264_cpu_detect() & X264_CPU_SSE4 )
+ {
+ cpu1 &= ~X264_CPU_CACHELINE_64;
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
+ }
#elif ARCH_PPC
if( x264_cpu_detect() & X264_CPU_ALTIVEC )
{