From: Fiona Glaser Date: Wed, 15 Jul 2009 19:43:35 +0000 (-0700) Subject: Cacheline-split SSSE3 chroma MC X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=a79dc7b5bc6e95508d8456681c30b57605a05fd0;p=libx264 Cacheline-split SSSE3 chroma MC ~70% faster chroma MC on 32-bit Conroe Also slightly faster SSSE3 intra_sad_8x8c --- diff --git a/common/frame.c b/common/frame.c index bd7ad45c..6a088ca8 100644 --- a/common/frame.c +++ b/common/frame.c @@ -50,7 +50,7 @@ x264_frame_t *x264_frame_new( x264_t *h ) frame->i_plane = 3; for( i = 0; i < 3; i++ ) { - frame->i_stride[i] = ALIGN( i_stride >> !!i, 16 ); + frame->i_stride[i] = ALIGN( i_stride >> !!i, align ); frame->i_width[i] = i_width >> !!i; frame->i_lines[i] = i_lines >> !!i; } diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm index eee2604d..69033b4f 100644 --- a/common/x86/mc-a.asm +++ b/common/x86/mc-a.asm @@ -25,8 +25,9 @@ %include "x86inc.asm" -SECTION_RODATA +SECTION_RODATA 32 +ch_shuffle: db 0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,0,0 pw_4: times 8 dw 4 pw_8: times 8 dw 8 pw_32: times 8 dw 32 @@ -869,8 +870,9 @@ MC_CHROMA mmxext INIT_XMM MC_CHROMA sse2, 8 +%macro MC_CHROMA_SSSE3 2 INIT_MMX -cglobal x264_mc_chroma_ssse3, 0,6,8 +cglobal x264_mc_chroma_ssse3%1, 0,6,%2 MC_CHROMA_START and r4d, 7 and r5d, 7 @@ -887,7 +889,7 @@ cglobal x264_mc_chroma_ssse3, 0,6,8 mova m5, [pw_32 GLOBAL] movd m6, r5d movd m7, r4d - movifnidn r0, r0mp + movifnidn r0, r0mp movifnidn r1d, r1m movifnidn r4d, r7m SPLATW m6, m6 @@ -925,23 +927,28 @@ cglobal x264_mc_chroma_ssse3, 0,6,8 INIT_XMM .width8: - mova m5, [pw_32 GLOBAL] movd m6, r5d movd m7, r4d - movifnidn r0, r0mp + movifnidn r0, r0mp movifnidn r1d, r1m movifnidn r4d, r7m SPLATW m6, m6 SPLATW m7, m7 +%ifidn %1, _cache64 + mov r5, r2 + and r5, 0x3f + cmp r5, 0x38 + jge .split +%endif + mova m5, [pw_32 GLOBAL] movh m0, [r2] movh m1, [r2+1] punpcklbw m0, m1 - add r2, r3 .loop8: - movh m1, [r2] - movh m2, [r2+1] - movh m3, [r2+r3] - movh m4, [r2+r3+1] + movh m1, [r2+1*r3] + movh m2, [r2+1*r3+1] + movh m3, [r2+2*r3] + movh m4, [r2+2*r3+1] punpcklbw m1, m2 punpcklbw m3, m4 lea r2, [r2+2*r3] @@ -965,6 +972,53 @@ INIT_XMM lea r0, [r0+2*r1] jg .loop8 REP_RET - +%ifidn %1, _cache64 +.split: + and r2, ~7 + and r5, 7 +%ifdef PIC + lea r11, [ch_shuffle GLOBAL] + movu m5, [r11 + r5*2] +%else + movu m5, [ch_shuffle + r5*2 GLOBAL] +%endif + movu m0, [r2] + pshufb m0, m5 +%ifdef ARCH_X86_64 + mova m8, [pw_32 GLOBAL] + %define round m8 +%else + %define round [pw_32 GLOBAL] +%endif +.splitloop8: + movu m1, [r2+r3] + pshufb m1, m5 + movu m3, [r2+2*r3] + pshufb m3, m5 + lea r2, [r2+2*r3] + mova m2, m1 + mova m4, m3 + pmaddubsw m0, m7 + pmaddubsw m1, m6 + pmaddubsw m2, m7 + pmaddubsw m3, m6 + paddw m0, round + paddw m2, round + paddw m1, m0 + paddw m3, m2 + mova m0, m4 + psrlw m1, 6 + psrlw m3, 6 + packuswb m1, m3 + movh [r0], m1 + movhps [r0+r1], m1 + sub r4d, 2 + lea r0, [r0+2*r1] + jg .splitloop8 + REP_RET +%endif ; mc_chroma 1d ssse3 is negligibly faster, and definitely not worth the extra code size +%endmacro +MC_CHROMA_SSSE3 , 8 +MC_CHROMA_SSSE3 _cache64, 9 diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index b3363e31..1241a232 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -59,6 +59,9 @@ extern void x264_mc_chroma_sse2( uint8_t *src, int i_src_stride, extern void x264_mc_chroma_ssse3( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int dx, int dy, int i_width, int i_height ); +extern void x264_mc_chroma_ssse3_cache64( uint8_t *src, int i_src_stride, + uint8_t *dst, int i_dst_stride, + int dx, int dy, int i_width, int i_height ); extern void x264_plane_copy_mmxext( uint8_t *, int, uint8_t *, int, int w, int h); extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n ); extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n ); @@ -340,6 +343,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->hpel_filter = x264_hpel_filter_ssse3; pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3; pf->mc_chroma = x264_mc_chroma_ssse3; + if( cpu&X264_CPU_CACHELINE_64 ) + pf->mc_chroma = x264_mc_chroma_ssse3_cache64; if( cpu&X264_CPU_SHUFFLE_IS_FAST ) pf->integral_init4v = x264_integral_init4v_ssse3; diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm index 843065c3..30920a6f 100644 --- a/common/x86/sad-a.asm +++ b/common/x86/sad-a.asm @@ -28,9 +28,8 @@ SECTION_RODATA pb_3: times 16 db 3 +pb_shuf8x8c: db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6 pw_8: times 4 dw 8 -pb_shuf8x8c0: db 0,0,0,0,2,2,2,2 -pb_shuf8x8c1: db 4,4,4,4,6,6,6,6 sw_64: dd 64 SECTION .text @@ -450,16 +449,32 @@ cglobal x264_intra_sad_x3_8x8c_%1, 3,3 psrlw m0, 2 pavgw m0, m7 ; s0+s2, s1, s3, s1+s3 %ifidn %1, ssse3 - movq m1, m0 - pshufb m0, [pb_shuf8x8c0 GLOBAL] - pshufb m1, [pb_shuf8x8c1 GLOBAL] + movq2dq xmm0, m0 + pshufb xmm0, [pb_shuf8x8c GLOBAL] + movq xmm1, [r0+FENC_STRIDE*0] + movq xmm2, [r0+FENC_STRIDE*1] + movq xmm3, [r0+FENC_STRIDE*2] + movq xmm4, [r0+FENC_STRIDE*3] + movhps xmm1, [r0+FENC_STRIDE*4] + movhps xmm2, [r0+FENC_STRIDE*5] + movhps xmm3, [r0+FENC_STRIDE*6] + movhps xmm4, [r0+FENC_STRIDE*7] + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + psadbw xmm4, xmm0 + paddw xmm1, xmm2 + paddw xmm1, xmm3 + paddw xmm1, xmm4 + movhlps xmm0, xmm1 + paddw xmm1, xmm0 + movd [r2], xmm1 %else packuswb m0, m0 punpcklbw m0, m0 movq m1, m0 punpcklbw m0, m0 ; 4x dc0 4x dc1 punpckhbw m1, m1 ; 4x dc2 4x dc3 -%endif movq m2, [r0+FENC_STRIDE*0] movq m3, [r0+FENC_STRIDE*1] movq m4, [r0+FENC_STRIDE*2] @@ -483,6 +498,7 @@ cglobal x264_intra_sad_x3_8x8c_%1, 3,3 paddw m6, m0 paddw m2, m6 movd [r2], m2 +%endif RET %endmacro diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm index fced5c6d..15990d58 100644 --- a/common/x86/x86inc.asm +++ b/common/x86/x86inc.asm @@ -37,14 +37,14 @@ ; Name of the .rodata section. ; Kludge: Something on OS X fails to align .rodata even given an align attribute, ; so use a different read-only section. -%macro SECTION_RODATA 0 +%macro SECTION_RODATA 0-1 16 %ifidn __OUTPUT_FORMAT__,macho64 - SECTION .text align=16 + SECTION .text align=%1 %elifidn __OUTPUT_FORMAT__,macho - SECTION .text align=16 + SECTION .text align=%1 fakegot: %else - SECTION .rodata align=16 + SECTION .rodata align=%1 %endif %endmacro diff --git a/tools/checkasm.c b/tools/checkasm.c index 9dae095d..ff2e445a 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -803,8 +803,8 @@ static int check_mc( int cpu_ref, int cpu_new ) used_asm = 1; \ memset(buf3, 0xCD, 1024); \ memset(buf4, 0xCD, 1024); \ - call_c( mc_c.mc_chroma, dst1, 16, src, 32, dx, dy, w, h ); \ - call_a( mc_a.mc_chroma, dst2, 16, src, 32, dx, dy, w, h ); \ + call_c( mc_c.mc_chroma, dst1, 16, src, 64, dx, dy, w, h ); \ + call_a( mc_a.mc_chroma, dst2, 16, src, 64, dx, dy, w, h ); \ /* mc_chroma width=2 may write garbage to the right of dst. ignore that. */\ for( j=0; j