}
#endif
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmxext;
+ pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmxext;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmxext;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmxext;
}
INIT2( sad_x4, _sse2 );
INIT_ADS( _sse2 );
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
-
+ pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2;
#ifdef ARCH_X86
if( cpu&X264_CPU_CACHELINE_64 )
{
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
+ pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_ssse3;
#ifdef ARCH_X86_64
int (*ads[7])( int enc_dc[4], uint16_t *sums, int delta,
uint16_t *cost_mvx, int16_t *mvs, int width, int thresh );
- /* calculate satd of V, H, and DC modes.
+ /* calculate satd or sad of V, H, and DC modes.
* may be NULL, in which case just use pred+satd instead. */
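+ /* (illustrative sketch, not part of the API: when one of these is NULL the
+ * caller falls back to something like
+ * for( i = 0; i < 3; i++ ) { predict[i]( fdec ); res[i] = mbcmp( fdec, FDEC_STRIDE, fenc, FENC_STRIDE ); }
+ * with the matching predict_* and mbcmp functions.) */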
- void (*intra_satd_x3_16x16)( uint8_t *fenc, uint8_t *fdec, int res[3] );
- void (*intra_satd_x3_8x8c)( uint8_t *fenc, uint8_t *fdec, int res[3] );
- void (*intra_satd_x3_4x4)( uint8_t *fenc, uint8_t *fdec, int res[3] );
- void (*intra_sa8d_x3_8x8)( uint8_t *fenc, uint8_t edge[33], int res[3] );
+ void (*intra_mbcmp_x3_16x16)( uint8_t *fenc, uint8_t *fdec , int res[3] );
+ void (*intra_satd_x3_16x16) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
+ void (*intra_sad_x3_16x16) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
+ void (*intra_satd_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
+ void (*intra_satd_x3_4x4) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
+ void (*intra_sa8d_x3_8x8) ( uint8_t *fenc, uint8_t edge[33], int res[3] );
} x264_pixel_function_t;
void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
int x264_pixel_var_8x8_mmxext ( uint8_t *pix, int i_stride, uint32_t *sad );
int x264_pixel_var_8x8_sse2 ( uint8_t *pix, int i_stride, uint32_t *sad );
-void x264_intra_satd_x3_4x4_mmxext( uint8_t *, uint8_t *, int * );
-void x264_intra_satd_x3_4x4_ssse3( uint8_t *, uint8_t *, int * );
-void x264_intra_satd_x3_8x8c_mmxext( uint8_t *, uint8_t *, int * );
-void x264_intra_satd_x3_8x8c_ssse3( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_4x4_mmxext ( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_4x4_ssse3 ( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_16x16_mmxext( uint8_t *, uint8_t *, int * );
-void x264_intra_satd_x3_16x16_ssse3( uint8_t *, uint8_t *, int * );
-void x264_intra_sa8d_x3_8x8_mmxext( uint8_t *, uint8_t *, int * );
-void x264_intra_sa8d_x3_8x8_sse2( uint8_t *, uint8_t *, int * );
-void x264_intra_sa8d_x3_8x8_ssse3( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_16x16_ssse3 ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_16x16_mmxext ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_16x16_sse2 ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_16x16_ssse3 ( uint8_t *, uint8_t *, int * );
+void x264_intra_sa8d_x3_8x8_mmxext ( uint8_t *, uint8_t *, int * );
+void x264_intra_sa8d_x3_8x8_sse2 ( uint8_t *, uint8_t *, int * );
+void x264_intra_sa8d_x3_8x8_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *, int16_t [2][8], int * );
-void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *, int16_t [2][8], int * );
-void x264_intra_sa8d_x3_8x8_core_ssse3( uint8_t *, int16_t [2][8], int * );
+void x264_intra_sa8d_x3_8x8_core_sse2 ( uint8_t *, int16_t [2][8], int * );
+void x264_intra_sa8d_x3_8x8_core_ssse3 ( uint8_t *, int16_t [2][8], int * );
void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
const uint8_t *pix2, int stride2, int sums[2][4] );
;*****************************************************************************
%include "x86inc.asm"
+%include "x86util.asm"
SECTION_RODATA
+pb_3: times 16 db 3
sw_64: dd 64
SECTION .text
+;-----------------------------------------------------------------------------
+; void intra_sad_x3_16x16 ( uint8_t *fenc, uint8_t *fdec, int res[3] );
+;-----------------------------------------------------------------------------
+
+;m7: DC prediction   m6: H prediction   m5: V prediction
+;m4: DC pred score   m3: H pred score   m2: V pred score
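+;
+; Rough reference of what this routine computes (illustrative only): with
+; top[] the 16 pixels above and left[] the 16 pixels to the left of the
+; block in fdec,
+;   dc     = ( sum(top) + sum(left) + 16 ) >> 5
+;   res[0] = sum over x,y of |fenc[y][x] - top[x]|     (V)
+;   res[1] = sum over x,y of |fenc[y][x] - left[y]|    (H)
+;   res[2] = sum over x,y of |fenc[y][x] - dc|         (DC)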
+%macro INTRA_SAD16 1
+cglobal x264_intra_sad_x3_16x16_%1,3,5
+ pxor mm0, mm0
+ pxor mm1, mm1
+ psadbw mm0, [r1-FDEC_STRIDE+0]
+ psadbw mm1, [r1-FDEC_STRIDE+8]
+ paddw mm0, mm1
+ movd r3d, mm0
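+ ; r3d = sum of the 16 pixels above the block; the loop below adds the 16 left pixels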
+%ifidn %1, ssse3
+ mova m1, [pb_3 GLOBAL]
+%endif
+%assign n 0
+%rep 16
+ movzx r4d, byte [r1-1+FDEC_STRIDE*n]
+ add r3d, r4d
+%assign n n+1
+%endrep
+ add r3d, 16
+ shr r3d, 5
+ imul r3d, 0x01010101
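+ ; r3d = rounded DC value, (sum of the 32 neighbours + 16) >> 5, replicated in all 4 bytes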
+ movd m7, r3d
+ mova m5, [r1-FDEC_STRIDE]
+%if mmsize==16
+ pshufd m7, m7, 0
+%else
+ mova m1, [r1-FDEC_STRIDE+8]
+ punpckldq m7, m7
+%endif
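+ ; m7 = DC prediction; m5 = V prediction (top row; m1 holds its high half for mmx)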
+ pxor m4, m4
+ pxor m3, m3
+ pxor m2, m2
+ mov r3d, 15*FENC_STRIDE
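+ ; one fenc row per iteration, bottom to top; the corresponding left pixel is
+ ; at r1+r3*2-1 because FDEC_STRIDE is twice FENC_STRIDE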
+.vloop:
+ SPLATB m6, r1+r3*2-1, m1
+ mova m0, [r0+r3]
+ psadbw m0, m7
+ paddw m4, m0
+ mova m0, [r0+r3]
+ psadbw m0, m5
+ paddw m2, m0
+%if mmsize==8
+ mova m0, [r0+r3]
+ psadbw m0, m6
+ paddw m3, m0
+ mova m0, [r0+r3+8]
+ psadbw m0, m7
+ paddw m4, m0
+ mova m0, [r0+r3+8]
+ psadbw m0, m1
+ paddw m2, m0
+ psadbw m6, [r0+r3+8]
+ paddw m3, m6
+%else
+ psadbw m6, [r0+r3]
+ paddw m3, m6
+%endif
+ add r3d, -FENC_STRIDE
+ jge .vloop
+%if mmsize==16
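+ ; each accumulator holds two 64-bit psadbw partial sums; interleave V and H,
+ ; fold the high halves into the low, then store res[0]=V, res[1]=H (res[2]=DC below)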
+ pslldq m3, 4
+ por m3, m2
+ movhlps m1, m3
+ paddw m3, m1
+ movq [r2+0], m3
+ movhlps m1, m4
+ paddw m4, m1
+%else
+ movd [r2+0], m2
+ movd [r2+4], m3
+%endif
+ movd [r2+8], m4
+ RET
+%endmacro
+
+INIT_MMX
+%define SPLATB SPLATB_MMX
+INTRA_SAD16 mmxext
+INIT_XMM
+INTRA_SAD16 sse2
+%define SPLATB SPLATB_SSSE3
+INTRA_SAD16 ssse3
+
+
+
;=============================================================================
; SAD x3/x4 MMX
;=============================================================================
ABS2 %3, %4, %5, %6
%endmacro
+%macro SPLATB_MMX 3
+ movd %1, [%2-3] ;to avoid crossing a cacheline
+ punpcklbw %1, %1
+%if mmsize==16
+ pshuflw %1, %1, 0xff
+ movlhps %1, %1
+%else
+ pshufw %1, %1, 0xff
+%endif
+%endmacro
+
+%macro SPLATB_SSSE3 3
+ movd %1, [%2-3]
+ pshufb %1, %3
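+ ; %3 = pb_3, so pshufb replicates byte 3 of the dword, i.e. the byte at [%2]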
+%endmacro
+
%macro PALIGNR_MMX 4
%ifnidn %4, %2
mova %4, %2
packuswb %1, %1
movh %4, %1
%endmacro
+
int i, idx;
int i_max;
int predict_mode[9];
- int b_merged_satd = h->pixf.intra_satd_x3_16x16 && h->pixf.mbcmp[0] == h->pixf.satd[0];
+ int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16;
/*---------------- Try all mode and calculate their score ---------------*/
if( b_merged_satd && i_max == 4 )
{
- h->pixf.intra_satd_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
+ h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
h->predict_16x16[I_PRED_16x16_P]( p_dst );
a->i_satd_i16x16_dir[I_PRED_16x16_P] =
h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
int satd = !h->mb.b_lossless && h->param.analyse.i_subpel_refine > 1;
memcpy( h->pixf.mbcmp, satd ? h->pixf.satd : h->pixf.sad_aligned, sizeof(h->pixf.mbcmp) );
memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) );
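+ /* keep the merged intra x3 comparison consistent with mbcmp (satd vs sad) */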
+ h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16;
satd &= h->param.analyse.i_me_method == X264_ME_TESA;
memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) );
memcpy( h->pixf.fpelcmp_x3, satd ? h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) );
TEST_PIXEL_VAR( PIXEL_8x8 );
report( "pixel var :" );
-#define TEST_INTRA_SATD( name, pred, satd, i8x8, ... ) \
+#define TEST_INTRA_MBCMP( name, pred, satd, i8x8, ... ) \
if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
{ \
int res_c[3], res_asm[3]; \
memcpy( buf3, buf2, 1024 ); \
for( i=0; i<3; i++ ) \
{ \
- pred[i]( buf3+40, ##__VA_ARGS__ ); \
- res_c[i] = pixel_c.satd( buf1+40, 16, buf3+40, 32 ); \
+ pred[i]( buf3+48, ##__VA_ARGS__ ); \
+ res_c[i] = pixel_c.satd( buf1+48, 16, buf3+48, 32 ); \
} \
- call_a( pixel_asm.name, buf1+40, i8x8 ? edge : buf3+40, res_asm ); \
+ call_a( pixel_asm.name, buf1+48, i8x8 ? edge : buf3+48, res_asm ); \
if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
{ \
ok = 0; \
}
ok = 1; used_asm = 0;
- TEST_INTRA_SATD( intra_satd_x3_16x16, predict_16x16, satd[PIXEL_16x16], 0 );
- TEST_INTRA_SATD( intra_satd_x3_8x8c, predict_8x8c, satd[PIXEL_8x8], 0 );
- TEST_INTRA_SATD( intra_satd_x3_4x4, predict_4x4, satd[PIXEL_4x4], 0 );
- TEST_INTRA_SATD( intra_sa8d_x3_8x8, predict_8x8, sa8d[PIXEL_8x8], 1, edge );
+ TEST_INTRA_MBCMP( intra_satd_x3_16x16, predict_16x16, satd[PIXEL_16x16], 0 );
+ TEST_INTRA_MBCMP( intra_satd_x3_8x8c , predict_8x8c , satd[PIXEL_8x8] , 0 );
+ TEST_INTRA_MBCMP( intra_satd_x3_4x4 , predict_4x4 , satd[PIXEL_4x4] , 0 );
+ TEST_INTRA_MBCMP( intra_sa8d_x3_8x8 , predict_8x8 , sa8d[PIXEL_8x8] , 1, edge );
report( "intra satd_x3 :" );
+ TEST_INTRA_MBCMP( intra_sad_x3_16x16 , predict_16x16, sad [PIXEL_16x16], 0 );
+ report( "intra sad_x3 :" );
if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core ||
pixel_asm.ssim_end4 != pixel_ref.ssim_end4 )