From: Fiona Glaser Date: Mon, 30 Mar 2009 23:37:46 +0000 (-0700) Subject: intra_sad_x3_4x4 assembly X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=104511d6e13a2d6628ba321fe7f0cb25ac545b6f;p=libx264 intra_sad_x3_4x4 assembly --- diff --git a/common/pixel.c b/common/pixel.c index 24bf4307..72b5e6fd 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -666,6 +666,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmxext; pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_mmxext; pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmxext; + pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmxext; } if( cpu&X264_CPU_SSE2 ) diff --git a/common/pixel.h b/common/pixel.h index f1c901e3..9ce368be 100644 --- a/common/pixel.h +++ b/common/pixel.h @@ -100,7 +100,9 @@ typedef struct void (*intra_mbcmp_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] ); void (*intra_satd_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] ); void (*intra_sad_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] ); + void (*intra_mbcmp_x3_4x4) ( uint8_t *fenc, uint8_t *fdec , int res[3] ); void (*intra_satd_x3_4x4) ( uint8_t *fenc, uint8_t *fdec , int res[3] ); + void (*intra_sad_x3_4x4) ( uint8_t *fenc, uint8_t *fdec , int res[3] ); void (*intra_sa8d_x3_8x8) ( uint8_t *fenc, uint8_t edge[33], int res[3] ); } x264_pixel_function_t; diff --git a/common/x86/pixel.h b/common/x86/pixel.h index 1e04dcda..dbc04fe2 100644 --- a/common/x86/pixel.h +++ b/common/x86/pixel.h @@ -79,6 +79,7 @@ DECL_PIXELS( uint64_t, hadamard_ac, sse4, ( uint8_t *pix, int i_stride )) void x264_intra_satd_x3_4x4_mmxext ( uint8_t *, uint8_t *, int * ); void x264_intra_satd_x3_4x4_ssse3 ( uint8_t *, uint8_t *, int * ); +void x264_intra_sad_x3_4x4_mmxext ( uint8_t *, uint8_t *, int * ); void x264_intra_satd_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * ); void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * ); void x264_intra_sad_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * ); diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm index 2eb260b9..ba1cbe48 100644 --- a/common/x86/sad-a.asm +++ b/common/x86/sad-a.asm @@ -257,6 +257,53 @@ cglobal x264_pixel_sad_8x16_sse2, 4,4 SAD_END_SSE2 RET +;----------------------------------------------------------------------------- +; void intra_sad_x3_4x4 ( uint8_t *fenc, uint8_t *fdec, int res[3] ); +;----------------------------------------------------------------------------- + +cglobal x264_intra_sad_x3_4x4_mmxext, 3,3 + pxor mm7, mm7 + movd mm0, [r1-FDEC_STRIDE] + movd mm1, [r0+FENC_STRIDE*0] + movd mm2, [r0+FENC_STRIDE*2] + punpckldq mm0, mm0 + punpckldq mm1, [r0+FENC_STRIDE*1] + punpckldq mm2, [r0+FENC_STRIDE*3] + movq mm6, mm0 + movq mm3, mm1 + psadbw mm3, mm0 + psadbw mm0, mm2 + paddw mm0, mm3 + movd [r2], mm0 ;V prediction cost + movd mm3, [r1+FDEC_STRIDE*0-4] + movd mm0, [r1+FDEC_STRIDE*1-4] + movd mm4, [r1+FDEC_STRIDE*2-4] + movd mm5, [r1+FDEC_STRIDE*3-4] + punpcklbw mm3, mm0 + punpcklbw mm4, mm5 + movq mm5, mm3 + punpckhwd mm5, mm4 + punpckhdq mm5, mm6 + psadbw mm5, mm7 + punpckhbw mm3, mm3 + punpckhbw mm4, mm4 + punpckhwd mm3, mm3 + punpckhwd mm4, mm4 + psraw mm5, 2 + pavgw mm5, mm7 + punpcklbw mm5, mm5 + pshufw mm5, mm5, 0x0 ;DC prediction + movq mm6, mm5 + psadbw mm5, mm1 + psadbw mm6, mm2 + psadbw mm1, mm3 + psadbw mm2, mm4 + paddw mm5, mm6 + paddw mm1, mm2 + movd [r2+8], mm5 ;DC prediction cost + movd [r2+4], mm1 ;H prediction cost + RET + ;----------------------------------------------------------------------------- ; void intra_sad_x3_8x8c ( uint8_t *fenc, uint8_t *fdec, int res[3] ); ;----------------------------------------------------------------------------- diff --git a/encoder/analyse.c b/encoder/analyse.c index b1df50b5..3bcde119 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -760,7 +760,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ int i_cost; int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 ); h->mb.i_cbp_luma = 0; - b_merged_satd = h->pixf.intra_satd_x3_4x4 && h->pixf.mbcmp[0] == h->pixf.satd[0]; + b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless; if( a->i_mbrd ) i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8; diff --git a/encoder/encoder.c b/encoder/encoder.c index 30cc067a..b181d1e7 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -606,6 +606,7 @@ static void mbcmp_init( x264_t *h ) memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) ); h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16; h->pixf.intra_mbcmp_x3_8x8c = satd ? h->pixf.intra_satd_x3_8x8c : h->pixf.intra_sad_x3_8x8c; + h->pixf.intra_mbcmp_x3_4x4 = satd ? h->pixf.intra_satd_x3_4x4 : h->pixf.intra_sad_x3_4x4; satd &= h->param.analyse.i_me_method == X264_ME_TESA; memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) ); memcpy( h->pixf.fpelcmp_x3, satd ? h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) ); diff --git a/tools/checkasm.c b/tools/checkasm.c index e4a5f7c4..85a65c23 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -406,6 +406,7 @@ static int check_pixel( int cpu_ref, int cpu_new ) report( "intra satd_x3 :" ); TEST_INTRA_MBCMP( intra_sad_x3_16x16 , predict_16x16, sad [PIXEL_16x16], 0 ); TEST_INTRA_MBCMP( intra_sad_x3_8x8c , predict_8x8c , sad [PIXEL_8x8] , 0 ); + TEST_INTRA_MBCMP( intra_sad_x3_4x4 , predict_4x4 , sad [PIXEL_4x4] , 0 ); report( "intra sad_x3 :" ); if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core ||