From 82aef940468385dbff6e32b77477a0c80124aca9 Mon Sep 17 00:00:00 2001 From: Fiona Glaser Date: Mon, 30 Mar 2009 04:07:50 -0700 Subject: [PATCH] intra_sad_x3_8x8c assembly Also fix intra_sad_x3_16x16's use of "n" as a %assign loop variable, which broke SWAP; the counter is renamed to "x". --- common/pixel.c | 4 +- common/pixel.h | 2 + common/x86/pixel.h | 2 + common/x86/sad-a.asm | 127 ++++++++++++++++++++++++++++++++++++++++++- encoder/analyse.c | 7 ++- encoder/encoder.c | 1 + tools/checkasm.c | 1 + 7 files changed, 137 insertions(+), 7 deletions(-) diff --git a/common/pixel.c b/common/pixel.c index 76d04e0d..24bf4307 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -662,8 +662,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) } #endif pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmxext; - pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmxext; + pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmxext; pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmxext; + pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_mmxext; pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmxext; } @@ -753,6 +754,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3; pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3; pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3; + pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3; pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_ssse3; #ifdef ARCH_X86_64 pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3; diff --git a/common/pixel.h b/common/pixel.h index a08879c9..f1c901e3 100644 --- a/common/pixel.h +++ b/common/pixel.h @@ -97,7 +97,9 @@ typedef struct void (*intra_mbcmp_x3_16x16)( uint8_t *fenc, uint8_t *fdec , int res[3] ); void (*intra_satd_x3_16x16) ( uint8_t *fenc, uint8_t *fdec , int res[3] ); void (*intra_sad_x3_16x16) ( uint8_t *fenc, uint8_t *fdec , int res[3] ); + void (*intra_mbcmp_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] ); void 
(*intra_satd_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] ); + void (*intra_sad_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] ); void (*intra_satd_x3_4x4) ( uint8_t *fenc, uint8_t *fdec , int res[3] ); void (*intra_sa8d_x3_8x8) ( uint8_t *fenc, uint8_t edge[33], int res[3] ); } x264_pixel_function_t; diff --git a/common/x86/pixel.h b/common/x86/pixel.h index 0bb7dfeb..1e04dcda 100644 --- a/common/x86/pixel.h +++ b/common/x86/pixel.h @@ -81,6 +81,8 @@ void x264_intra_satd_x3_4x4_mmxext ( uint8_t *, uint8_t *, int * ); void x264_intra_satd_x3_4x4_ssse3 ( uint8_t *, uint8_t *, int * ); void x264_intra_satd_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * ); void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * ); +void x264_intra_sad_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * ); +void x264_intra_sad_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * ); void x264_intra_satd_x3_16x16_mmxext( uint8_t *, uint8_t *, int * ); void x264_intra_satd_x3_16x16_ssse3 ( uint8_t *, uint8_t *, int * ); void x264_intra_sad_x3_16x16_mmxext ( uint8_t *, uint8_t *, int * ); diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm index c4f1cae9..2eb260b9 100644 --- a/common/x86/sad-a.asm +++ b/common/x86/sad-a.asm @@ -28,6 +28,8 @@ SECTION_RODATA pb_3: times 16 db 3 +pb_shuf8x8c0: db 0,0,0,0,2,2,2,2 +pb_shuf8x8c1: db 4,4,4,4,6,6,6,6 sw_64: dd 64 SECTION .text @@ -255,6 +257,125 @@ cglobal x264_pixel_sad_8x16_sse2, 4,4 SAD_END_SSE2 RET +;----------------------------------------------------------------------------- +; void intra_sad_x3_8x8c ( uint8_t *fenc, uint8_t *fdec, int res[3] ); +;----------------------------------------------------------------------------- + +%macro INTRA_SAD_HV_ITER 2 +%ifidn %2, ssse3 + movd m1, [r1 + FDEC_STRIDE*(%1-4) - 4] + movd m3, [r1 + FDEC_STRIDE*(%1-3) - 4] + pshufb m1, m7 + pshufb m3, m7 +%else + movq m1, [r1 + FDEC_STRIDE*(%1-4) - 8] + movq m3, [r1 + FDEC_STRIDE*(%1-3) - 8] + punpckhbw m1, m1 + punpckhbw m3, m3 + pshufw m1, m1, 0xff + 
pshufw m3, m3, 0xff +%endif + movq m4, [r0 + FENC_STRIDE*(%1+0)] + movq m5, [r0 + FENC_STRIDE*(%1+1)] + psadbw m1, m4 + psadbw m3, m5 + psadbw m4, m6 + psadbw m5, m6 + paddw m1, m3 + paddw m4, m5 +%if %1 + paddw m0, m1 + paddw m2, m4 +%else + SWAP 0,1 + SWAP 2,4 +%endif +%endmacro + +%macro INTRA_SAD_8x8C 1 +cglobal x264_intra_sad_x3_8x8c_%1, 3,3 + movq m6, [r1 - FDEC_STRIDE] + add r1, FDEC_STRIDE*4 +%ifidn %1,ssse3 + movq m7, [pb_3 GLOBAL] +%endif + INTRA_SAD_HV_ITER 0, %1 + INTRA_SAD_HV_ITER 2, %1 + INTRA_SAD_HV_ITER 4, %1 + INTRA_SAD_HV_ITER 6, %1 + movd [r2+4], m0 + movd [r2+8], m2 + pxor m7, m7 + movq m2, [r1 + FDEC_STRIDE*-4 - 8] + movq m4, [r1 + FDEC_STRIDE*-2 - 8] + movq m3, [r1 + FDEC_STRIDE* 0 - 8] + movq m5, [r1 + FDEC_STRIDE* 2 - 8] + punpckhbw m2, [r1 + FDEC_STRIDE*-3 - 8] + punpckhbw m4, [r1 + FDEC_STRIDE*-1 - 8] + punpckhbw m3, [r1 + FDEC_STRIDE* 1 - 8] + punpckhbw m5, [r1 + FDEC_STRIDE* 3 - 8] + punpckhbw m2, m4 + punpckhbw m3, m5 + psrlq m2, 32 + psrlq m3, 32 + psadbw m2, m7 ; s2 + psadbw m3, m7 ; s3 + movq m1, m6 + SWAP 0, 6 + punpckldq m0, m7 + punpckhdq m1, m7 + psadbw m0, m7 ; s0 + psadbw m1, m7 ; s1 + punpcklwd m0, m1 + punpcklwd m2, m3 + punpckldq m0, m2 ;s0 s1 s2 s3 + pshufw m3, m0, 11110110b ;s2,s1,s3,s3 + pshufw m0, m0, 01110100b ;s0,s1,s3,s1 + paddw m0, m3 + psrlw m0, 2 + pavgw m0, m7 ; s0+s2, s1, s3, s1+s3 +%ifidn %1, ssse3 + movq m1, m0 + pshufb m0, [pb_shuf8x8c0 GLOBAL] + pshufb m1, [pb_shuf8x8c1 GLOBAL] +%else + packuswb m0, m0 + punpcklbw m0, m0 + movq m1, m0 + punpcklbw m0, m0 ; 4x dc0 4x dc1 + punpckhbw m1, m1 ; 4x dc2 4x dc3 +%endif + movq m2, [r0+FENC_STRIDE*0] + movq m3, [r0+FENC_STRIDE*1] + movq m4, [r0+FENC_STRIDE*2] + movq m5, [r0+FENC_STRIDE*3] + movq m6, [r0+FENC_STRIDE*4] + movq m7, [r0+FENC_STRIDE*5] + psadbw m2, m0 + psadbw m3, m0 + psadbw m4, m0 + psadbw m5, m0 + movq m0, [r0+FENC_STRIDE*6] + psadbw m6, m1 + psadbw m7, m1 + psadbw m0, m1 + psadbw m1, [r0+FENC_STRIDE*7] + paddw m2, m3 + paddw m4, m5 + paddw m6, m7 + 
paddw m0, m1 + paddw m2, m4 + paddw m6, m0 + paddw m2, m6 + movd [r2], m2 + RET +%endmacro + +INIT_MMX +INTRA_SAD_8x8C mmxext +INTRA_SAD_8x8C ssse3 + + ;----------------------------------------------------------------------------- ; void intra_sad_x3_16x16 ( uint8_t *fenc, uint8_t *fdec, int res[3] ); ;----------------------------------------------------------------------------- @@ -272,11 +393,11 @@ cglobal x264_intra_sad_x3_16x16_%1,3,5,%2 %ifidn %1, ssse3 mova m1, [pb_3 GLOBAL] %endif -%assign n 0 +%assign x 0 %rep 16 - movzx r4d, byte [r1-1+FDEC_STRIDE*n] + movzx r4d, byte [r1-1+FDEC_STRIDE*x] add r3d, r4d -%assign n n+1 +%assign x x+1 %endrep add r3d, 16 shr r3d, 5 diff --git a/encoder/analyse.c b/encoder/analyse.c index d56c93e6..b1df50b5 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -539,6 +539,7 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a ) int i_max; int predict_mode[4]; + int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless; uint8_t *p_dstc[2], *p_srcc[2]; @@ -553,11 +554,11 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a ) predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max ); a->i_satd_i8x8chroma = COST_MAX; - if( i_max == 4 && h->pixf.intra_satd_x3_8x8c && h->pixf.mbcmp[0] == h->pixf.satd[0] ) + if( i_max == 4 && b_merged_satd ) { int satdu[4], satdv[4]; - h->pixf.intra_satd_x3_8x8c( p_srcc[0], p_dstc[0], satdu ); - h->pixf.intra_satd_x3_8x8c( p_srcc[1], p_dstc[1], satdv ); + h->pixf.intra_mbcmp_x3_8x8c( p_srcc[0], p_dstc[0], satdu ); + h->pixf.intra_mbcmp_x3_8x8c( p_srcc[1], p_dstc[1], satdv ); h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[0] ); h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[1] ); satdu[I_PRED_CHROMA_P] = diff --git a/encoder/encoder.c b/encoder/encoder.c index 55634560..30cc067a 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -605,6 +605,7 @@ static void mbcmp_init( x264_t *h ) memcpy( h->pixf.mbcmp, satd ? 
h->pixf.satd : h->pixf.sad_aligned, sizeof(h->pixf.mbcmp) ); memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) ); h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16; + h->pixf.intra_mbcmp_x3_8x8c = satd ? h->pixf.intra_satd_x3_8x8c : h->pixf.intra_sad_x3_8x8c; satd &= h->param.analyse.i_me_method == X264_ME_TESA; memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) ); memcpy( h->pixf.fpelcmp_x3, satd ? h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) ); diff --git a/tools/checkasm.c b/tools/checkasm.c index 31fd9758..e4a5f7c4 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -405,6 +405,7 @@ static int check_pixel( int cpu_ref, int cpu_new ) TEST_INTRA_MBCMP( intra_sa8d_x3_8x8 , predict_8x8 , sa8d[PIXEL_8x8] , 1, edge ); report( "intra satd_x3 :" ); TEST_INTRA_MBCMP( intra_sad_x3_16x16 , predict_16x16, sad [PIXEL_16x16], 0 ); + TEST_INTRA_MBCMP( intra_sad_x3_8x8c , predict_8x8c , sad [PIXEL_8x8] , 0 ); report( "intra sad_x3 :" ); if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core || -- 2.40.0