From 4d84a45d7e505e4929a0110e047aa29a752e3253 Mon Sep 17 00:00:00 2001 From: Fiona Glaser Date: Wed, 3 Sep 2008 15:32:16 -0700 Subject: [PATCH] Add merged SAD for i16x16 analysis Roughly 30% faster i16x16 analysis under subme=1 --- common/pixel.c | 4 +- common/pixel.h | 12 +++--- common/x86/pixel.h | 23 ++++++----- common/x86/sad-a.asm | 92 ++++++++++++++++++++++++++++++++++++++++++ common/x86/x86util.asm | 17 ++++++++ encoder/analyse.c | 4 +- encoder/encoder.c | 1 + tools/checkasm.c | 18 +++++---- 8 files changed, 145 insertions(+), 26 deletions(-) diff --git a/common/pixel.c b/common/pixel.c index 3e28d857..86388fa0 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -619,6 +619,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) } #endif pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmxext; + pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmxext; pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmxext; pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmxext; } @@ -630,7 +631,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT2( sad_x4, _sse2 ); INIT_ADS( _sse2 ); pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2; - + pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2; #ifdef ARCH_X86 if( cpu&X264_CPU_CACHELINE_64 ) { @@ -673,6 +674,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3; pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3; + pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3; pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3; pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_ssse3; #ifdef ARCH_X86_64 diff --git a/common/pixel.h b/common/pixel.h index 127f89dd..0ed1ef4c 100644 --- a/common/pixel.h +++ b/common/pixel.h @@ -91,12 +91,14 @@ typedef struct int (*ads[7])( int enc_dc[4], uint16_t *sums, int delta, uint16_t *cost_mvx, int16_t *mvs, int width, int thresh ); - /* calculate satd of V, H, and DC modes. + /* calculate satd or sad of V, H, and DC modes. * may be NULL, in which case just use pred+satd instead. */ - void (*intra_satd_x3_16x16)( uint8_t *fenc, uint8_t *fdec, int res[3] ); - void (*intra_satd_x3_8x8c)( uint8_t *fenc, uint8_t *fdec, int res[3] ); - void (*intra_satd_x3_4x4)( uint8_t *fenc, uint8_t *fdec, int res[3] ); - void (*intra_sa8d_x3_8x8)( uint8_t *fenc, uint8_t edge[33], int res[3] ); + void (*intra_mbcmp_x3_16x16)( uint8_t *fenc, uint8_t *fdec , int res[3] ); + void (*intra_satd_x3_16x16) ( uint8_t *fenc, uint8_t *fdec , int res[3] ); + void (*intra_sad_x3_16x16) ( uint8_t *fenc, uint8_t *fdec , int res[3] ); + void (*intra_satd_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] ); + void (*intra_satd_x3_4x4) ( uint8_t *fenc, uint8_t *fdec , int res[3] ); + void (*intra_sa8d_x3_8x8) ( uint8_t *fenc, uint8_t edge[33], int res[3] ); } x264_pixel_function_t; void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ); diff --git a/common/x86/pixel.h b/common/x86/pixel.h index cbb37dda..f0df2dd2 100644 --- a/common/x86/pixel.h +++ b/common/x86/pixel.h @@ -74,18 +74,21 @@ int x264_pixel_var_16x16_sse2 ( uint8_t *pix, int i_stride, uint32_t *sad ); int x264_pixel_var_8x8_mmxext ( uint8_t *pix, int i_stride, uint32_t *sad ); int x264_pixel_var_8x8_sse2 ( uint8_t *pix, int i_stride, uint32_t *sad ); -void x264_intra_satd_x3_4x4_mmxext( uint8_t *, uint8_t *, int * ); -void x264_intra_satd_x3_4x4_ssse3( uint8_t *, uint8_t *, int * ); -void x264_intra_satd_x3_8x8c_mmxext( uint8_t *, uint8_t *, int * ); -void x264_intra_satd_x3_8x8c_ssse3( uint8_t *, uint8_t *, int * ); +void x264_intra_satd_x3_4x4_mmxext ( uint8_t *, uint8_t *, int * ); +void x264_intra_satd_x3_4x4_ssse3 ( uint8_t *, uint8_t *, int * ); +void x264_intra_satd_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * ); +void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * ); void x264_intra_satd_x3_16x16_mmxext( uint8_t *, uint8_t *, int * ); -void x264_intra_satd_x3_16x16_ssse3( uint8_t *, uint8_t *, int * ); -void x264_intra_sa8d_x3_8x8_mmxext( uint8_t *, uint8_t *, int * ); -void x264_intra_sa8d_x3_8x8_sse2( uint8_t *, uint8_t *, int * ); -void x264_intra_sa8d_x3_8x8_ssse3( uint8_t *, uint8_t *, int * ); +void x264_intra_satd_x3_16x16_ssse3 ( uint8_t *, uint8_t *, int * ); +void x264_intra_sad_x3_16x16_mmxext ( uint8_t *, uint8_t *, int * ); +void x264_intra_sad_x3_16x16_sse2 ( uint8_t *, uint8_t *, int * ); +void x264_intra_sad_x3_16x16_ssse3 ( uint8_t *, uint8_t *, int * ); +void x264_intra_sa8d_x3_8x8_mmxext ( uint8_t *, uint8_t *, int * ); +void x264_intra_sa8d_x3_8x8_sse2 ( uint8_t *, uint8_t *, int * ); +void x264_intra_sa8d_x3_8x8_ssse3 ( uint8_t *, uint8_t *, int * ); void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *, int16_t [2][8], int * ); -void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *, int16_t [2][8], int * ); -void x264_intra_sa8d_x3_8x8_core_ssse3( uint8_t *, int16_t [2][8], int * ); +void x264_intra_sa8d_x3_8x8_core_sse2 ( uint8_t *, int16_t [2][8], int * ); +void x264_intra_sa8d_x3_8x8_core_ssse3 ( uint8_t *, int16_t [2][8], int * ); void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1, const uint8_t *pix2, int stride2, int sums[2][4] ); diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm index 2c167221..49db7892 100644 --- a/common/x86/sad-a.asm +++ b/common/x86/sad-a.asm @@ -23,8 +23,10 @@ ;***************************************************************************** %include "x86inc.asm" +%include "x86util.asm" SECTION_RODATA +pb_3: times 16 db 3 sw_64: dd 64 SECTION .text @@ -221,6 +223,96 @@ SAD_W16 sse2_aligned +;----------------------------------------------------------------------------- +; void intra_sad_x3_16x16 ( uint8_t *fenc, uint8_t *fdec, int res[3] ); +;----------------------------------------------------------------------------- + +;xmm7: DC prediction xmm6: H prediction xmm5: V prediction +;xmm4: DC pred score xmm3: H pred score xmm2: V pred score +%macro INTRA_SAD16 1 +cglobal x264_intra_sad_x3_16x16_%1,3,5 + pxor mm0, mm0 + pxor mm1, mm1 + psadbw mm0, [r1-FDEC_STRIDE+0] + psadbw mm1, [r1-FDEC_STRIDE+8] + paddw mm0, mm1 + movd r3d, mm0 +%ifidn %1, ssse3 + mova m1, [pb_3 GLOBAL] +%endif +%assign n 0 +%rep 16 + movzx r4d, byte [r1-1+FDEC_STRIDE*n] + add r3d, r4d +%assign n n+1 +%endrep + add r3d, 16 + shr r3d, 5 + imul r3d, 0x01010101 + movd m7, r3d + mova m5, [r1-FDEC_STRIDE] +%if mmsize==16 + pshufd m7, m7, 0 +%else + mova m1, [r1-FDEC_STRIDE+8] + punpckldq m7, m7 +%endif + pxor m4, m4 + pxor m3, m3 + pxor m2, m2 + mov r3d, 15*FENC_STRIDE +.vloop: + SPLATB m6, r1+r3*2-1, m1 + mova m0, [r0+r3] + psadbw m0, m7 + paddw m4, m0 + mova m0, [r0+r3] + psadbw m0, m5 + paddw m2, m0 +%if mmsize==8 + mova m0, [r0+r3] + psadbw m0, m6 + paddw m3, m0 + mova m0, [r0+r3+8] + psadbw m0, m7 + paddw m4, m0 + mova m0, [r0+r3+8] + psadbw m0, m1 + paddw m2, m0 + psadbw m6, [r0+r3+8] + paddw m3, m6 +%else + psadbw m6, [r0+r3] + paddw m3, m6 +%endif + add r3d, -FENC_STRIDE + jge .vloop +%if mmsize==16 + pslldq m3, 4 + por m3, m2 + movhlps m1, m3 + paddw m3, m1 + movq [r2+0], m3 + movhlps m1, m4 + paddw m4, m1 +%else + movd [r2+0], m2 + movd [r2+4], m3 +%endif + movd [r2+8], m4 + RET +%endmacro + +INIT_MMX +%define SPLATB SPLATB_MMX +INTRA_SAD16 mmxext +INIT_XMM +INTRA_SAD16 sse2 +%define SPLATB SPLATB_SSSE3 +INTRA_SAD16 ssse3 + + + ;============================================================================= ; SAD x3/x4 MMX ;============================================================================= diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm index d4eec429..e61d1381 100644 --- a/common/x86/x86util.asm +++ b/common/x86/x86util.asm @@ -131,6 +131,22 @@ ABS2 %3, %4, %5, %6 %endmacro +%macro SPLATB_MMX 3 + movd %1, [%2-3] ;to avoid crossing a cacheline + punpcklbw %1, %1 +%if mmsize==16 + pshuflw %1, %1, 0xff + movlhps %1, %1 +%else + pshufw %1, %1, 0xff +%endif +%endmacro + +%macro SPLATB_SSSE3 3 + movd %1, [%2-3] + pshufb %1, %3 +%endmacro + %macro PALIGNR_MMX 4 %ifnidn %4, %2 mova %4, %2 @@ -221,3 +237,4 @@ packuswb %1, %1 movh %4, %1 %endmacro + diff --git a/encoder/analyse.c b/encoder/analyse.c index d9e77a32..10399690 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -544,7 +544,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ int i, idx; int i_max; int predict_mode[9]; - int b_merged_satd = h->pixf.intra_satd_x3_16x16 && h->pixf.mbcmp[0] == h->pixf.satd[0]; + int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16; /*---------------- Try all mode and calculate their score ---------------*/ @@ -553,7 +553,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ if( b_merged_satd && i_max == 4 ) { - h->pixf.intra_satd_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir ); + h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir ); h->predict_16x16[I_PRED_16x16_P]( p_dst ); a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ); diff --git a/encoder/encoder.c b/encoder/encoder.c index 584aef65..92bc699c 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -585,6 +585,7 @@ static void mbcmp_init( x264_t *h ) int satd = !h->mb.b_lossless && h->param.analyse.i_subpel_refine > 1; memcpy( h->pixf.mbcmp, satd ? h->pixf.satd : h->pixf.sad_aligned, sizeof(h->pixf.mbcmp) ); memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) ); + h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16; satd &= h->param.analyse.i_me_method == X264_ME_TESA; memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) ); memcpy( h->pixf.fpelcmp_x3, satd ? h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) ); diff --git a/tools/checkasm.c b/tools/checkasm.c index 9f892859..8700df2e 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -324,7 +324,7 @@ static int check_pixel( int cpu_ref, int cpu_new ) TEST_PIXEL_VAR( PIXEL_8x8 ); report( "pixel var :" ); -#define TEST_INTRA_SATD( name, pred, satd, i8x8, ... ) \ +#define TEST_INTRA_MBCMP( name, pred, satd, i8x8, ... ) \ if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \ { \ int res_c[3], res_asm[3]; \ @@ -333,10 +333,10 @@ static int check_pixel( int cpu_ref, int cpu_new ) memcpy( buf3, buf2, 1024 ); \ for( i=0; i<3; i++ ) \ { \ - pred[i]( buf3+40, ##__VA_ARGS__ ); \ - res_c[i] = pixel_c.satd( buf1+40, 16, buf3+40, 32 ); \ + pred[i]( buf3+48, ##__VA_ARGS__ ); \ + res_c[i] = pixel_c.satd( buf1+48, 16, buf3+48, 32 ); \ } \ - call_a( pixel_asm.name, buf1+40, i8x8 ? edge : buf3+40, res_asm ); \ + call_a( pixel_asm.name, buf1+48, i8x8 ? edge : buf3+48, res_asm ); \ if( memcmp(res_c, res_asm, sizeof(res_c)) ) \ { \ ok = 0; \ @@ -347,11 +347,13 @@ static int check_pixel( int cpu_ref, int cpu_new ) } ok = 1; used_asm = 0; - TEST_INTRA_SATD( intra_satd_x3_16x16, predict_16x16, satd[PIXEL_16x16], 0 ); - TEST_INTRA_SATD( intra_satd_x3_8x8c, predict_8x8c, satd[PIXEL_8x8], 0 ); - TEST_INTRA_SATD( intra_satd_x3_4x4, predict_4x4, satd[PIXEL_4x4], 0 ); - TEST_INTRA_SATD( intra_sa8d_x3_8x8, predict_8x8, sa8d[PIXEL_8x8], 1, edge ); + TEST_INTRA_MBCMP( intra_satd_x3_16x16, predict_16x16, satd[PIXEL_16x16], 0 ); + TEST_INTRA_MBCMP( intra_satd_x3_8x8c , predict_8x8c , satd[PIXEL_8x8] , 0 ); + TEST_INTRA_MBCMP( intra_satd_x3_4x4 , predict_4x4 , satd[PIXEL_4x4] , 0 ); + TEST_INTRA_MBCMP( intra_sa8d_x3_8x8 , predict_8x8 , sa8d[PIXEL_8x8] , 1, edge ); report( "intra satd_x3 :" ); + TEST_INTRA_MBCMP( intra_sad_x3_16x16 , predict_16x16, sad [PIXEL_16x16], 0 ); + report( "intra sad_x3 :" ); if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core || pixel_asm.ssim_end4 != pixel_ref.ssim_end4 ) -- 2.40.0