From: Fiona Glaser
Date: Wed, 10 Jun 2009 17:11:00 +0000 (-0700)
Subject: SSSE3, faster SSE2/MMX integral_init4v
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=892dad35970375e99da6e047f677964b8eb69fc8;p=libx264

SSSE3, faster SSE2/MMX integral_init4v

The real reason I wrote this was an excuse to use shufpd.
---

diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index cebe59fd..9ed06fa4 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -796,32 +796,7 @@ cglobal x264_integral_init8h_sse4, 3,4
     jl .loop
     REP_RET
-%macro INTEGRAL_INIT 1
-;-----------------------------------------------------------------------------
-; void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride )
-;-----------------------------------------------------------------------------
-cglobal x264_integral_init4v_%1, 3,5
-    shl   r2, 1
-    add   r0, r2
-    add   r1, r2
-    lea   r3, [r0+r2*4]
-    lea   r4, [r0+r2*8]
-    neg   r2
-.loop:
-    movu  m0, [r0+r2+8]
-    mova  m2, [r0+r2]
-    movu  m1, [r4+r2+8]
-    paddw m0, m2
-    paddw m1, [r4+r2]
-    mova  m3, [r3+r2]
-    psubw m1, m0
-    psubw m3, m2
-    mova  [r0+r2], m1
-    mova  [r1+r2], m3
-    add   r2, mmsize
-    jl .loop
-    REP_RET
-
+%macro INTEGRAL_INIT_8V 1
 ;-----------------------------------------------------------------------------
 ; void x264_integral_init8v_mmx( uint16_t *sum8, int stride )
 ;-----------------------------------------------------------------------------
@@ -843,11 +818,85 @@ cglobal x264_integral_init8v_%1, 3,3
 %endmacro
 
 INIT_MMX
-INTEGRAL_INIT mmx
+INTEGRAL_INIT_8V mmx
 INIT_XMM
-INTEGRAL_INIT sse2
+INTEGRAL_INIT_8V sse2
+
+;-----------------------------------------------------------------------------
+; void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride )
+;-----------------------------------------------------------------------------
+INIT_MMX
+cglobal x264_integral_init4v_mmx, 3,5
+    shl   r2, 1
+    lea   r3, [r0+r2*4]
+    lea   r4, [r0+r2*8]
+    mova  m0, [r0+r2]
+    mova  m4, [r4+r2]
+.loop:
+    sub   r2, 8
+    mova  m1, m4
+    psubw m1, m0
+    mova  m4, [r4+r2]
+    mova  m0, [r0+r2]
+    paddw m1, m4
+    mova  m3, [r3+r2]
+    psubw m1, m0
+    psubw m3, m0
+    mova  [r0+r2], m1
+    mova  [r1+r2], m3
+    jge .loop
+    REP_RET
+
+INIT_XMM
+cglobal x264_integral_init4v_sse2, 3,5
+    shl   r2, 1
+    add   r0, r2
+    add   r1, r2
+    lea   r3, [r0+r2*4]
+    lea   r4, [r0+r2*8]
+    neg   r2
+.loop:
+    mova  m0, [r0+r2]
+    mova  m1, [r4+r2]
+    mova  m2, m0
+    mova  m4, m1
+    shufpd m0, [r0+r2+16], 1
+    shufpd m1, [r4+r2+16], 1
+    paddw m0, m2
+    paddw m1, m4
+    mova  m3, [r3+r2]
+    psubw m1, m0
+    psubw m3, m2
+    mova  [r0+r2], m1
+    mova  [r1+r2], m3
+    add   r2, 16
+    jl .loop
+    REP_RET
+
+cglobal x264_integral_init4v_ssse3, 3,5
+    shl   r2, 1
+    add   r0, r2
+    add   r1, r2
+    lea   r3, [r0+r2*4]
+    lea   r4, [r0+r2*8]
+    neg   r2
+.loop:
+    mova  m2, [r0+r2]
+    mova  m0, [r0+r2+16]
+    mova  m4, [r4+r2]
+    mova  m1, [r4+r2+16]
+    palignr m0, m2, 8
+    palignr m1, m4, 8
+    paddw m0, m2
+    paddw m1, m4
+    mova  m3, [r3+r2]
+    psubw m1, m0
+    psubw m3, m2
+    mova  [r0+r2], m1
+    mova  [r1+r2], m3
+    add   r2, 16
+    jl .loop
+    REP_RET
 
 %macro FILT8x4 7
     mova      %3, [r0+%7]
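
For reference, here is a scalar sketch of what the rewritten loops compute. It is inferred from the asm's addressing, not copied from x264's C fallback in common/mc.c, so the function name and edge bounds are illustrative. One call produces one output row: sum4 gets 4x4 box sums (vertically-integrated rows four apart), and sum8 is overwritten in place with 8x8 box sums (rows eight apart, plus the horizontally adjacent 4-wide sum at +4 words, which is exactly the 8-byte shift that movu, shufpd, and palignr each produce).

    /* Hypothetical scalar equivalent of integral_init4v (sketch only;
     * real bounds/edge handling may differ). */
    static void integral_init4v_c( uint16_t *sum8, uint16_t *sum4, int stride )
    {
        /* 4x4 box sums: difference of integral rows 4 apart */
        for( int x = 0; x < stride; x++ )
            sum4[x] = sum8[x+4*stride] - sum8[x];
        /* 8x8 box sums, stored back over sum8: rows 8 apart, plus the
         * 4-wide sum 4 columns (8 bytes) to the right */
        for( int x = 0; x < stride; x++ )
            sum8[x] = sum8[x+8*stride] + sum8[x+8*stride+4]
                    - sum8[x]          - sum8[x+4];
    }

The asm fuses both loops into a single pass; the MMX version additionally walks r2 downward so that each iteration can carry the previous iteration's aligned loads in registers, since MMX has no cheap 8-byte-shift shuffle to build the +8 vector from.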
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 2027d3ed..b3363e31 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -70,6 +70,7 @@ extern void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride
 extern void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int stride );
 extern void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
 extern void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
+extern void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );
 #define LOWRES(cpu) \
 extern void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
                                                int src_stride, int dst_stride, int width, int height );
@@ -340,6 +341,9 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
     pf->mc_chroma = x264_mc_chroma_ssse3;
 
+    if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+        pf->integral_init4v = x264_integral_init4v_ssse3;
+
     if( !(cpu&X264_CPU_SSE4) )
         return;
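
The heart of the change is replacing the old macro's unaligned movu loads at +8 bytes with aligned loads plus a shuffle. A rough intrinsics illustration of the three equivalent ways to form that shifted vector follows; the function names are invented for the example, and x264 itself uses the hand-written asm above, not intrinsics.

    #include <emmintrin.h>   /* SSE2 */
    #include <tmmintrin.h>   /* SSSE3 */

    /* All three return the 16 bytes starting 8 bytes past p, where
     * cur = load(p) and next = load(p+16) are aligned loads. */
    static __m128i shift8_movu( const uint8_t *p )             /* old code */
    {
        return _mm_loadu_si128( (const __m128i*)(p+8) );
    }
    static __m128i shift8_shufpd( __m128i cur, __m128i next )  /* SSE2 path */
    {
        /* shufpd cur, next, 1 -> { high 64 bits of cur, low 64 bits of next } */
        return _mm_castpd_si128( _mm_shuffle_pd(
                   _mm_castsi128_pd(cur), _mm_castsi128_pd(next), 1 ) );
    }
    static __m128i shift8_palignr( __m128i cur, __m128i next ) /* SSSE3 path */
    {
        /* palignr next, cur, 8: the same byte rotation in one shuffle */
        return _mm_alignr_epi8( next, cur, 8 );
    }

The dispatch gates the SSSE3 version on X264_CPU_SHUFFLE_IS_FAST, presumably because palignr only wins on cores with fast 128-bit shuffle units; on other CPUs the SSE2 shufpd version assigned earlier in the dispatch chain stays in use.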