From a8703c8933f51715c25118eb83487072a548934e Mon Sep 17 00:00:00 2001 From: Laurent Aimar Date: Tue, 29 Jun 2004 22:41:42 +0000 Subject: [PATCH] * all: fixed ss2 runtime selection. git-svn-id: svn://svn.videolan.org/x264/trunk@11 df754926-b1dd-0310-bc7b-ec298dee348c --- Jamfile | 2 +- core/i386/mc-c.c | 399 +++++++++++++++++++++++++++-------------------- core/i386/mc.asm | 125 +++++++++++---- core/i386/mc.h | 1 + core/mc.c | 9 +- 5 files changed, 333 insertions(+), 203 deletions(-) diff --git a/Jamfile b/Jamfile index 3266e836..7084c524 100644 --- a/Jamfile +++ b/Jamfile @@ -35,7 +35,7 @@ SOURCES_ALTIVEC = core/ppc/mc.c core/ppc/pixel.c ; SOURCES_X264 = $(SOURCES_C) ; if $(OS) = LINUX { - DEFINES += ARCH_X86 HAVE_MMXEXT HAVE_MALLOC_H ; + DEFINES += ARCH_X86 HAVE_MMXEXT HAVE_SSE2 HAVE_MALLOC_H ; SOURCES_X264 += $(SOURCES_MMX) ; SOURCES_X264 += $(SOURCES_X86) ; ASFLAGS = -f elf ; diff --git a/core/i386/mc-c.c b/core/i386/mc-c.c index bab42647..bec61db4 100644 --- a/core/i386/mc-c.c +++ b/core/i386/mc-c.c @@ -181,107 +181,87 @@ static inline int x264_tapfilter1( uint8_t *pix ) return pix[-2] - 5*pix[-1] + 20*(pix[0] + pix[1]) - 5*pix[ 2] + pix[ 3]; } -#if 0 -static inline void pixel_avg_w4( uint8_t *dst, int i_dst_stride, - uint8_t *src1, int i_src1_stride, - uint8_t *src2, int i_src2_stride, - int i_height ) -{ - int x, y; - for( y = 0; y < i_height; y++ ) - { - for( x = 0; x < 4; x++ ) - { - dst[x] = ( src1[x] + src2[x] + 1 ) >> 1; - } - dst += i_dst_stride; - src1 += i_src1_stride; - src2 += i_src2_stride; - } +typedef void (*pf_mc_t)(uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ); + +/* NASM functions */ +extern void x264_pixel_avg_w4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); +extern void x264_pixel_avg_w8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); +extern void x264_pixel_avg_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); +extern void x264_pixel_avg_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); + +/* Macro to define NxM functions */ +/* mc I+H */ +#define MC_IH( name, cpu, width, height, off ) \ +static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \ +{ \ + DECLARE_ALIGNED( uint8_t, tmp[width*height], width ); \ + \ + mc_hh_w##width( src, i_src_stride, tmp, width, i_height ); \ + x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \ + src+(off), i_src_stride, \ + tmp, width, i_height ); \ } -static inline void pixel_avg_w8( uint8_t *dst, int i_dst_stride, - uint8_t *src1, int i_src1_stride, - uint8_t *src2, int i_src2_stride, - int i_height ) -{ - int y; - for( y = 0; y < i_height; y++ ) - { - asm volatile( - "movq (%1), %%mm0\n" - "movq (%2), %%mm1\n" - "pavgb %%mm1, %%mm0\n" - "movq %%mm0, (%0)\n" - : : "r"(dst), "r"(src1), "r"(src2) - ); - dst += i_dst_stride; - src1 += i_src1_stride; - src2 += i_src2_stride; - } +/* mc I+V */ +#define MC_IV( name, cpu, width, height, off ) \ +static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \ +{ \ + DECLARE_ALIGNED( uint8_t, tmp[width*height], width ); \ + \ + mc_hv_w##width( src, i_src_stride, tmp, width, i_height ); \ + x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \ + src+(off), i_src_stride, \ + tmp, width, i_height ); \ } -static inline void pixel_avg_w16( uint8_t *dst, int i_dst_stride, - uint8_t *src1, int i_src1_stride, - uint8_t *src2, int i_src2_stride, - int i_height ) -{ - int y; - for( y = 0; y < i_height; y++ ) - { - asm volatile( - "movq (%1), %%mm0\n" - "movq 8(%1), %%mm2\n" - "movq (%2), %%mm1\n" - "movq 8(%2), %%mm3\n" - - "pavgb %%mm1, %%mm0\n" - "movq %%mm0, (%0)\n" - "pavgb %%mm3, %%mm2\n" - "movq %%mm2, 8(%0)\n" - : : "r"(dst), "r"(src1), "r"(src2) - ); - dst += i_dst_stride; - src1 += i_src1_stride; - src2 += i_src2_stride; - } +/* mc H+V */ +#define MC_HV( name, cpu, width, height, off1, off2 ) \ +static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \ +{ \ + DECLARE_ALIGNED( uint8_t, tmp1[width*height], width ); \ + DECLARE_ALIGNED( uint8_t, tmp2[width*height], width ); \ + \ + mc_hv_w##width( src+(off1), i_src_stride, tmp1, width, i_height ); \ + mc_hh_w##width( src+(off2), i_src_stride, tmp2, width, i_height ); \ + x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \ + tmp1, width, tmp2, width, \ + i_height ); \ +} + +/* mc C+H */ +#define MC_CH( name, cpu, width, height, off ) \ +static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \ +{ \ + DECLARE_ALIGNED( uint8_t, tmp1[width*height], width ); \ + DECLARE_ALIGNED( uint8_t, tmp2[width*height], width ); \ + \ + mc_hc_w##width( src, i_src_stride, tmp1, width, i_height ); \ + mc_hh_w##width( src+(off), i_src_stride, tmp2, width, i_height ); \ + x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \ + tmp1, width, tmp2, width, \ + i_height ); \ +} + +/* mc C+V */ +#define MC_CV( name, cpu, width, height, off ) \ +static void name##_w##width##_##cpu( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) \ +{ \ + DECLARE_ALIGNED( uint8_t, tmp1[width*height], width ); \ + DECLARE_ALIGNED( uint8_t, tmp2[width*height], width ); \ + \ + mc_hc_w##width( src, i_src_stride, tmp1, width, i_height ); \ + mc_hv_w##width( src+(off), i_src_stride, tmp2, width, i_height ); \ + x264_pixel_avg_w##width##_##cpu( dst, i_dst_stride, \ + tmp1, width, tmp2, width, \ + i_height ); \ } -#else -extern void pixel_avg_w4( uint8_t *dst, int i_dst_stride, - uint8_t *src1, int i_src1_stride, - uint8_t *src2, int i_src2_stride, - int i_height ); -extern void pixel_avg_w8( uint8_t *dst, int i_dst_stride, - uint8_t *src1, int i_src1_stride, - uint8_t *src2, int i_src2_stride, - int i_height ); -extern void pixel_avg_w16( uint8_t *dst, int i_dst_stride, - uint8_t *src1, int i_src1_stride, - uint8_t *src2, int i_src2_stride, - int i_height ); -#endif -typedef void (*pf_mc_t)(uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ); /***************************************************************************** * MC with width == 4 (height <= 8) *****************************************************************************/ -#if 0 -static void mc_copy_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) -{ - int y; - for( y = 0; y < i_height; y++ ) - { - memcpy( dst, src, 4 ); - - src += i_src_stride; - dst += i_dst_stride; - } -} -#else -extern void mc_copy_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ); -#endif +extern void x264_mc_copy_w4_mmxext( uint8_t *, int, uint8_t *, int, int ); static inline void mc_hh_w4( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height ) { @@ -384,7 +364,24 @@ static inline void mc_hc_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i } } -/* mc I+H */ +MC_IH( mc_xy10, mmxext, 4, 8, 0 ) +MC_IH( mc_xy30, mmxext, 4, 8, 1 ) + +MC_IV( mc_xy01, mmxext, 4, 8, 0 ) +MC_IV( mc_xy03, mmxext, 4, 8, i_src_stride ) + +MC_HV( mc_xy11, mmxext, 4, 8, 0, 0 ) +MC_HV( mc_xy31, mmxext, 4, 8, 1, 0 ) +MC_HV( mc_xy13, mmxext, 4, 8, 0, i_src_stride ) +MC_HV( mc_xy33, mmxext, 4, 8, 1, i_src_stride ) + +MC_CH( mc_xy21, mmxext, 4, 8, 0 ) +MC_CH( mc_xy23, mmxext, 4, 8, i_src_stride ) + +MC_CV( mc_xy12, mmxext, 4, 8, 0 ) +MC_CV( mc_xy32, mmxext, 4, 8, 1 ) + +#if 0 static void mc_xy10_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) { uint8_t tmp[4*8]; @@ -397,7 +394,7 @@ static void mc_xy30_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_ mc_hh_w4( src, i_src_stride, tmp, 4, i_height ); pixel_avg_w4( dst, i_dst_stride, src+1, i_src_stride, tmp, 4, i_height ); } -/* mc I+V */ + static void mc_xy01_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) { uint8_t tmp[4*8]; @@ -410,7 +407,7 @@ static void mc_xy03_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_ mc_hv_w4( src, i_src_stride, tmp, 4, i_height ); pixel_avg_w4( dst, i_dst_stride, src+i_src_stride, i_src_stride, tmp, 4, i_height ); } -/* H+V */ + static void mc_xy11_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) { uint8_t tmp1[4*8]; @@ -447,6 +444,7 @@ static void mc_xy33_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_ mc_hh_w4( src+i_src_stride, i_src_stride, tmp2, 4, i_height ); pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height ); } + static void mc_xy21_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) { uint8_t tmp1[4*8]; @@ -456,54 +454,40 @@ static void mc_xy21_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_ mc_hh_w4( src, i_src_stride, tmp2, 4, i_height ); pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height ); } -static void mc_xy12_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +static void mc_xy23_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) { uint8_t tmp1[4*8]; uint8_t tmp2[4*8]; - mc_hc_w4( src, i_src_stride, tmp1, 4, i_height ); - mc_hv_w4( src, i_src_stride, tmp2, 4, i_height ); + mc_hc_w4( src, i_src_stride, tmp1, 4, i_height ); + mc_hh_w4( src+i_src_stride, i_src_stride, tmp2, 4, i_height ); pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height ); } -static void mc_xy32_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) + +static void mc_xy12_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) { uint8_t tmp1[4*8]; uint8_t tmp2[4*8]; - mc_hc_w4( src, i_src_stride, tmp1, 4, i_height ); - mc_hv_w4( src+1, i_src_stride, tmp2, 4, i_height ); + mc_hc_w4( src, i_src_stride, tmp1, 4, i_height ); + mc_hv_w4( src, i_src_stride, tmp2, 4, i_height ); pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height ); } -static void mc_xy23_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +static void mc_xy32_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) { uint8_t tmp1[4*8]; uint8_t tmp2[4*8]; - mc_hc_w4( src, i_src_stride, tmp1, 4, i_height ); - mc_hh_w4( src+i_src_stride, i_src_stride, tmp2, 4, i_height ); + mc_hc_w4( src, i_src_stride, tmp1, 4, i_height ); + mc_hv_w4( src+1, i_src_stride, tmp2, 4, i_height ); pixel_avg_w4( dst, i_dst_stride, tmp1, 4, tmp2, 4, i_height ); } - +#endif /***************************************************************************** * MC with width == 8 (height <= 16) *****************************************************************************/ -#if 0 -static void mc_copy_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) -{ - int y; - - for( y = 0; y < i_height; y++ ) - { - memcpy( dst, src, 8 ); - - src += i_src_stride; - dst += i_dst_stride; - } -} -#else -extern void mc_copy_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ); -#endif +extern void x264_mc_copy_w8_mmxext( uint8_t *, int, uint8_t *, int, int ); static inline void mc_hh_w8( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height ) { @@ -670,6 +654,24 @@ static inline void mc_hc_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i } } +MC_IH( mc_xy10, mmxext, 8, 16, 0 ) +MC_IH( mc_xy30, mmxext, 8, 16, 1 ) + +MC_IV( mc_xy01, mmxext, 8, 16, 0 ) +MC_IV( mc_xy03, mmxext, 8, 16, i_src_stride ) + +MC_HV( mc_xy11, mmxext, 8, 16, 0, 0 ) +MC_HV( mc_xy31, mmxext, 8, 16, 1, 0 ) +MC_HV( mc_xy13, mmxext, 8, 16, 0, i_src_stride ) +MC_HV( mc_xy33, mmxext, 8, 16, 1, i_src_stride ) + +MC_CH( mc_xy21, mmxext, 8, 16, 0 ) +MC_CH( mc_xy23, mmxext, 8, 16, i_src_stride ) + +MC_CV( mc_xy12, mmxext, 8, 16, 0 ) +MC_CV( mc_xy32, mmxext, 8, 16, 1 ) + +#if 0 /* mc I+H */ static void mc_xy10_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) { @@ -769,27 +771,15 @@ static void mc_xy23_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_ mc_hh_w8( src+i_src_stride, i_src_stride, tmp2, 8, i_height ); pixel_avg_w8( dst, i_dst_stride, tmp1, 8, tmp2, 8, i_height ); } - +#endif /***************************************************************************** * MC with width == 16 (height <= 16) *****************************************************************************/ -#if 0 -static void mc_copy_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) -{ - int y; - for( y = 0; y < i_height; y++ ) - { - memcpy( dst, src, 16 ); +extern void x264_mc_copy_w16_mmxext( uint8_t *, int, uint8_t *, int, int ); +extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int ); - src += i_src_stride; - dst += i_dst_stride; - } -} -#else -extern void mc_copy_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ); -#endif static inline void mc_hh_w16( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height ) { mc_hh_w4( &src[ 0], i_src, &dst[ 0], i_dst, i_height ); @@ -809,6 +799,44 @@ static inline void mc_hc_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int mc_hc_w8( &src[8], i_src_stride, &dst[8], i_dst_stride, i_height ); } +/* MMX avg/copy */ +MC_IH( mc_xy10, mmxext, 16, 16, 0 ) +MC_IH( mc_xy30, mmxext, 16, 16, 1 ) + +MC_IV( mc_xy01, mmxext, 16, 16, 0 ) +MC_IV( mc_xy03, mmxext, 16, 16, i_src_stride ) + +MC_HV( mc_xy11, mmxext, 16, 16, 0, 0 ) +MC_HV( mc_xy31, mmxext, 16, 16, 1, 0 ) +MC_HV( mc_xy13, mmxext, 16, 16, 0, i_src_stride ) +MC_HV( mc_xy33, mmxext, 16, 16, 1, i_src_stride ) + +MC_CH( mc_xy21, mmxext, 16, 16, 0 ) +MC_CH( mc_xy23, mmxext, 16, 16, i_src_stride ) + +MC_CV( mc_xy12, mmxext, 16, 16, 0 ) +MC_CV( mc_xy32, mmxext, 16, 16, 1 ) + +/* SSE2 avg/copy */ +MC_IH( mc_xy10, sse2, 16, 16, 0 ) +MC_IH( mc_xy30, sse2, 16, 16, 1 ) + +MC_IV( mc_xy01, sse2, 16, 16, 0 ) +MC_IV( mc_xy03, sse2, 16, 16, i_src_stride ) + +MC_HV( mc_xy11, sse2, 16, 16, 0, 0 ) +MC_HV( mc_xy31, sse2, 16, 16, 1, 0 ) +MC_HV( mc_xy13, sse2, 16, 16, 0, i_src_stride ) +MC_HV( mc_xy33, sse2, 16, 16, 1, i_src_stride ) + +MC_CH( mc_xy21, sse2, 16, 16, 0 ) +MC_CH( mc_xy23, sse2, 16, 16, i_src_stride ) + +MC_CV( mc_xy12, sse2, 16, 16, 0 ) +MC_CV( mc_xy32, sse2, 16, 16, 1 ) + + +#if 0 /* mc I+H */ static void mc_xy10_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) { @@ -908,55 +936,92 @@ static void mc_xy23_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst mc_hh_w16( src+i_src_stride, i_src_stride, tmp2, 16, i_height ); pixel_avg_w16( dst, i_dst_stride, tmp1, 16, tmp2, 16, i_height ); } +#endif -static void motion_compensation_luma( uint8_t *src, int i_src_stride, - uint8_t *dst, int i_dst_stride, - int mvx,int mvy, - int i_width, int i_height ) +#define MOTION_COMPENSATION_LUMA \ + src += (mvy >> 2) * i_src_stride + (mvx >> 2); \ + if( i_width == 4 ) \ + { \ + pf_mc[0][mvy&0x03][mvx&0x03]( src, i_src_stride, dst, i_dst_stride, i_height ); \ + } \ + else if( i_width == 8 ) \ + { \ + pf_mc[1][mvy&0x03][mvx&0x03]( src, i_src_stride, dst, i_dst_stride, i_height ); \ + } \ + else if( i_width == 16 ) \ + { \ + pf_mc[2][mvy&0x03][mvx&0x03]( src, i_src_stride, dst, i_dst_stride, i_height ); \ + } \ + else \ + { \ + fprintf( stderr, "Error: motion_compensation_luma called with invalid width" ); \ + } + +static void motion_compensation_luma_mmxext( uint8_t *src, int i_src_stride, + uint8_t *dst, int i_dst_stride, + int mvx,int mvy, + int i_width, int i_height ) { static const pf_mc_t pf_mc[3][4][4] = /*XXX [dqy][dqx] */ { { - { mc_copy_w4, mc_xy10_w4, mc_hh_w4, mc_xy30_w4 }, - { mc_xy01_w4, mc_xy11_w4, mc_xy21_w4, mc_xy31_w4 }, - { mc_hv_w4, mc_xy12_w4, mc_hc_w4, mc_xy32_w4 }, - { mc_xy03_w4, mc_xy13_w4, mc_xy23_w4, mc_xy33_w4 }, + { x264_mc_copy_w4_mmxext, mc_xy10_w4_mmxext, mc_hh_w4, mc_xy30_w4_mmxext }, + { mc_xy01_w4_mmxext, mc_xy11_w4_mmxext, mc_xy21_w4_mmxext, mc_xy31_w4_mmxext }, + { mc_hv_w4, mc_xy12_w4_mmxext, mc_hc_w4, mc_xy32_w4_mmxext }, + { mc_xy03_w4_mmxext, mc_xy13_w4_mmxext, mc_xy23_w4_mmxext, mc_xy33_w4_mmxext }, }, { - { mc_copy_w8, mc_xy10_w8, mc_hh_w8, mc_xy30_w8 }, - { mc_xy01_w8, mc_xy11_w8, mc_xy21_w8, mc_xy31_w8 }, - { mc_hv_w8, mc_xy12_w8, mc_hc_w8, mc_xy32_w8 }, - { mc_xy03_w8, mc_xy13_w8, mc_xy23_w8, mc_xy33_w8 }, + { x264_mc_copy_w8_mmxext, mc_xy10_w8_mmxext, mc_hh_w8, mc_xy30_w8_mmxext }, + { mc_xy01_w8_mmxext, mc_xy11_w8_mmxext, mc_xy21_w8_mmxext, mc_xy31_w8_mmxext }, + { mc_hv_w8, mc_xy12_w8_mmxext, mc_hc_w8, mc_xy32_w8_mmxext }, + { mc_xy03_w8_mmxext, mc_xy13_w8_mmxext, mc_xy23_w8_mmxext, mc_xy33_w8_mmxext }, }, { - { mc_copy_w16, mc_xy10_w16, mc_hh_w16, mc_xy30_w16 }, - { mc_xy01_w16, mc_xy11_w16, mc_xy21_w16, mc_xy31_w16 }, - { mc_hv_w16, mc_xy12_w16, mc_hc_w16, mc_xy32_w16 }, - { mc_xy03_w16, mc_xy13_w16, mc_xy23_w16, mc_xy33_w16 }, + { x264_mc_copy_w16_mmxext, mc_xy10_w16_mmxext, mc_hh_w16, mc_xy30_w16_mmxext }, + { mc_xy01_w16_mmxext, mc_xy11_w16_mmxext, mc_xy21_w16_mmxext, mc_xy31_w16_mmxext }, + { mc_hv_w16, mc_xy12_w16_mmxext, mc_hc_w16, mc_xy32_w16_mmxext }, + { mc_xy03_w16_mmxext, mc_xy13_w16_mmxext, mc_xy23_w16_mmxext, mc_xy33_w16_mmxext }, } }; - src += (mvy >> 2) * i_src_stride + (mvx >> 2); - if( i_width == 4 ) - { - pf_mc[0][mvy&0x03][mvx&0x03]( src, i_src_stride, dst, i_dst_stride, i_height ); - } - else if( i_width == 8 ) - { - pf_mc[1][mvy&0x03][mvx&0x03]( src, i_src_stride, dst, i_dst_stride, i_height ); - } - else if( i_width == 16 ) - { - pf_mc[2][mvy&0x03][mvx&0x03]( src, i_src_stride, dst, i_dst_stride, i_height ); - } - else + MOTION_COMPENSATION_LUMA +} + +static void motion_compensation_luma_sse2( uint8_t *src, int i_src_stride, + uint8_t *dst, int i_dst_stride, + int mvx,int mvy, + int i_width, int i_height ) +{ + static const pf_mc_t pf_mc[3][4][4] = /*XXX [dqy][dqx] */ { - fprintf( stderr, "Error: motion_compensation_luma called with invalid width" ); - } + { + { x264_mc_copy_w4_mmxext, mc_xy10_w4_mmxext, mc_hh_w4, mc_xy30_w4_mmxext }, + { mc_xy01_w4_mmxext, mc_xy11_w4_mmxext, mc_xy21_w4_mmxext, mc_xy31_w4_mmxext }, + { mc_hv_w4, mc_xy12_w4_mmxext, mc_hc_w4, mc_xy32_w4_mmxext }, + { mc_xy03_w4_mmxext, mc_xy13_w4_mmxext, mc_xy23_w4_mmxext, mc_xy33_w4_mmxext }, + }, + { + { x264_mc_copy_w8_mmxext, mc_xy10_w8_mmxext, mc_hh_w8, mc_xy30_w8_mmxext }, + { mc_xy01_w8_mmxext, mc_xy11_w8_mmxext, mc_xy21_w8_mmxext, mc_xy31_w8_mmxext }, + { mc_hv_w8, mc_xy12_w8_mmxext, mc_hc_w8, mc_xy32_w8_mmxext }, + { mc_xy03_w8_mmxext, mc_xy13_w8_mmxext, mc_xy23_w8_mmxext, mc_xy33_w8_mmxext }, + }, + { + { x264_mc_copy_w16_sse2, mc_xy10_w16_sse2, mc_hh_w16, mc_xy30_w16_sse2 }, + { mc_xy01_w16_sse2, mc_xy11_w16_sse2, mc_xy21_w16_sse2, mc_xy31_w16_sse2 }, + { mc_hv_w16, mc_xy12_w16_sse2, mc_hc_w16, mc_xy32_w16_sse2 }, + { mc_xy03_w16_sse2, mc_xy13_w16_sse2, mc_xy23_w16_sse2, mc_xy33_w16_sse2 }, + } + }; + MOTION_COMPENSATION_LUMA } void x264_mc_mmxext_init( x264_mc_function_t pf[2] ) { - pf[MC_LUMA] = motion_compensation_luma; + pf[MC_LUMA] = motion_compensation_luma_mmxext; +} +void x264_mc_sse2_init( x264_mc_function_t pf[2] ) +{ + pf[MC_LUMA] = motion_compensation_luma_sse2; } diff --git a/core/i386/mc.asm b/core/i386/mc.asm index a932e159..9ee4191a 100644 --- a/core/i386/mc.asm +++ b/core/i386/mc.asm @@ -67,16 +67,25 @@ ALIGN 16 SECTION .text -cglobal pixel_avg_w4 +cglobal x264_pixel_avg_w4_mmxext +cglobal x264_pixel_avg_w8_mmxext +cglobal x264_pixel_avg_w16_mmxext +cglobal x264_pixel_avg_w16_sse2 + +cglobal x264_mc_copy_w4_mmxext +cglobal x264_mc_copy_w8_mmxext +cglobal x264_mc_copy_w16_mmxext +cglobal x264_mc_copy_w16_sse2 + ALIGN 16 ;----------------------------------------------------------------------------- -; void pixel_avg_w4( uint8_t *dst, int i_dst_stride, -; uint8_t *src1, int i_src1_stride, -; uint8_t *src2, int i_src2_stride, -; int i_height ); +; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int i_dst_stride, +; uint8_t *src1, int i_src1_stride, +; uint8_t *src2, int i_src2_stride, +; int i_height ); ;----------------------------------------------------------------------------- -pixel_avg_w4: +x264_pixel_avg_w4_mmxext: push ebp push ebx push esi @@ -111,16 +120,15 @@ ALIGN 4 ret -cglobal pixel_avg_w8 ALIGN 16 ;----------------------------------------------------------------------------- -; void pixel_avg_w8( uint8_t *dst, int i_dst_stride, -; uint8_t *src1, int i_src1_stride, -; uint8_t *src2, int i_src2_stride, -; int i_height ); +; void x264_pixel_avg_w8_mmxext( uint8_t *dst, int i_dst_stride, +; uint8_t *src1, int i_src1_stride, +; uint8_t *src2, int i_src2_stride, +; int i_height ); ;----------------------------------------------------------------------------- -pixel_avg_w8: +x264_pixel_avg_w8_mmxext: push ebp push ebx push esi @@ -151,16 +159,15 @@ ALIGN 4 ret -cglobal pixel_avg_w16 ALIGN 16 ;----------------------------------------------------------------------------- -; void pixel_avg_w16( uint8_t *dst, int i_dst_stride, -; uint8_t *src1, int i_src1_stride, -; uint8_t *src2, int i_src2_stride, -; int i_height ); +; void x264_pixel_avg_w16_mmxext( uint8_t *dst, int i_dst_stride, +; uint8_t *src1, int i_src1_stride, +; uint8_t *src2, int i_src2_stride, +; int i_height ); ;----------------------------------------------------------------------------- -pixel_avg_w16: +x264_pixel_avg_w16_mmxext: push ebp push ebx push esi @@ -175,18 +182,50 @@ pixel_avg_w16: mov ebp, [esp+44] ; i_height ALIGN 4 .height_loop -%ifndef HAVE_SSE2 movq mm0, [ebx ] movq mm1, [ebx+8] pavgb mm0, [ecx ] pavgb mm1, [ecx+8] movq [edi ], mm0 movq [edi+8], mm1 -%else + dec ebp + lea ebx, [ebx+eax] + lea ecx, [ecx+edx] + lea edi, [edi+esi] + jne .height_loop + + pop edi + pop esi + pop ebx + pop ebp + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_pixel_avg_w16_sse2( uint8_t *dst, int i_dst_stride, +; uint8_t *src1, int i_src1_stride, +; uint8_t *src2, int i_src2_stride, +; int i_height ); +;----------------------------------------------------------------------------- +x264_pixel_avg_w16_sse2: + push ebp + push ebx + push esi + push edi + + mov edi, [esp+20] ; dst + mov ebx, [esp+28] ; src1 + mov ecx, [esp+36] ; src2 + mov esi, [esp+24] ; i_dst_stride + mov eax, [esp+32] ; i_src1_stride + mov edx, [esp+40] ; i_src2_stride + mov ebp, [esp+44] ; i_height +ALIGN 4 +.height_loop movdqu xmm0, [ebx] pavgb xmm0, [ecx] movdqu [edi], xmm0 -%endif + dec ebp lea ebx, [ebx+eax] lea ecx, [ecx+edx] @@ -200,13 +239,13 @@ ALIGN 4 ret -cglobal mc_copy_w4 ALIGN 16 ;----------------------------------------------------------------------------- -; void mc_copy_w4( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +; void x264_mc_copy_w4_mmxext( uint8_t *src, int i_src_stride, +; uint8_t *dst, int i_dst_stride, int i_height ) ;----------------------------------------------------------------------------- -mc_copy_w4: +x264_mc_copy_w4_mmxext: push ebx push esi push edi @@ -237,9 +276,10 @@ cglobal mc_copy_w8 ALIGN 16 ;----------------------------------------------------------------------------- -; void mc_copy_w8( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +; void x264_mc_copy_w8_mmxext( uint8_t *src, int i_src_stride, +; uint8_t *dst, int i_dst_stride, int i_height ) ;----------------------------------------------------------------------------- -mc_copy_w8: +x264_mc_copy_w8_mmxext: push ebx push esi push edi @@ -276,9 +316,10 @@ cglobal mc_copy_w16 ALIGN 16 ;----------------------------------------------------------------------------- -; void mc_copy_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +; void x264_mc_copy_w16_mmxext( uint8_t *src, int i_src_stride, +; uint8_t *dst, int i_dst_stride, int i_height ) ;----------------------------------------------------------------------------- -mc_copy_w16: +x264_mc_copy_w16_mmxext: push ebx push esi push edi @@ -288,9 +329,9 @@ mc_copy_w16: mov ebx, [esp+20] ; i_src_stride mov edx, [esp+28] ; i_dst_stride mov ecx, [esp+32] ; i_height + ALIGN 4 .height_loop -%ifndef HAVE_SSE2 movq mm0, [esi] movq mm1, [esi+8] movq [edi], mm0 @@ -313,7 +354,30 @@ ALIGN 4 lea edi, [edi+edx*2] sub ecx, byte 4 jnz .height_loop -%else + + pop edi + pop esi + pop ebx + ret + + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_mc_copy_w16_sse2( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height ) +;----------------------------------------------------------------------------- +x264_mc_copy_w16_sse2: + push ebx + push esi + push edi + + mov esi, [esp+16] ; src + mov edi, [esp+24] ; dst + mov ebx, [esp+20] ; i_src_stride + mov edx, [esp+28] ; i_dst_stride + mov ecx, [esp+32] ; i_height + +ALIGN 4 +.height_loop movdqu xmm0, [esi] movdqu xmm1, [esi+ebx] movdqu [edi], xmm0 @@ -323,7 +387,6 @@ ALIGN 4 lea esi, [esi+ebx*2] lea edi, [edi+edx*2] jnz .height_loop -%endif pop edi pop esi diff --git a/core/i386/mc.h b/core/i386/mc.h index c3e906fc..8cfc0a4f 100644 --- a/core/i386/mc.h +++ b/core/i386/mc.h @@ -25,5 +25,6 @@ #define _I386_MC_H 1 void x264_mc_mmxext_init( x264_mc_function_t pf[2] ); +void x264_mc_sse2_init( x264_mc_function_t pf[2] ); #endif diff --git a/core/mc.c b/core/mc.c index e7ff7541..2c3fd792 100644 --- a/core/mc.c +++ b/core/mc.c @@ -306,15 +306,16 @@ void x264_mc_init( int cpu, x264_mc_function_t pf[2] ) #ifdef HAVE_MMXEXT if( cpu&X264_CPU_MMXEXT ) - { x264_mc_mmxext_init( pf ); - } #endif +#ifdef HAVE_SSE2 + if( cpu&X264_CPU_SSE2 ) + x264_mc_sse2_init( pf ); +#endif + #ifdef HAVE_ALTIVEC if( cpu&X264_CPU_ALTIVEC ) - { x264_mc_altivec_init( pf ); - } #endif } -- 2.40.0