From: Loren Merritt Date: Fri, 17 Mar 2006 21:36:27 +0000 (+0000) Subject: some mmxext functions really only required mmx. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=fdb64099b4da93ffa70af98aad85cc7c6fc564d0;p=libx264 some mmxext functions really only required mmx. git-svn-id: svn://svn.videolan.org/x264/trunk@470 df754926-b1dd-0310-bc7b-ec298dee348c --- diff --git a/common/amd64/dct-a.asm b/common/amd64/dct-a.asm index 1bb352fa..ca9c0adf 100644 --- a/common/amd64/dct-a.asm +++ b/common/amd64/dct-a.asm @@ -157,13 +157,13 @@ pw_32: times 8 dw 32 SECTION .text -cglobal x264_dct4x4dc_mmxext +cglobal x264_dct4x4dc_mmx ALIGN 16 ;----------------------------------------------------------------------------- -; void dct4x4dc( int16_t d[4][4] ) +; void x264_dct4x4dc_mmx( int16_t d[4][4] ) ;----------------------------------------------------------------------------- -x264_dct4x4dc_mmxext: +x264_dct4x4dc_mmx: movq mm0, [parm1q+ 0] movq mm1, [parm1q+ 8] movq mm2, [parm1q+16] @@ -192,13 +192,13 @@ x264_dct4x4dc_mmxext: movq [parm1q+24],mm4 ret -cglobal x264_idct4x4dc_mmxext +cglobal x264_idct4x4dc_mmx ALIGN 16 ;----------------------------------------------------------------------------- -; void x264_idct4x4dc_mmxext( int16_t d[4][4] ) +; void x264_idct4x4dc_mmx( int16_t d[4][4] ) ;----------------------------------------------------------------------------- -x264_idct4x4dc_mmxext: +x264_idct4x4dc_mmx: movq mm0, [parm1q+ 0] movq mm1, [parm1q+ 8] movq mm2, [parm1q+16] @@ -218,13 +218,13 @@ x264_idct4x4dc_mmxext: movq [parm1q+24], mm4 ret -cglobal x264_sub4x4_dct_mmxext +cglobal x264_sub4x4_dct_mmx ALIGN 16 ;----------------------------------------------------------------------------- -; void x264_sub4x4_dct_mmxext( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ) +; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ) ;----------------------------------------------------------------------------- -x264_sub4x4_dct_mmxext: +x264_sub4x4_dct_mmx: firstpush rbx pushreg rbx endprolog @@ -272,13 +272,13 @@ x264_sub4x4_dct_mmxext: ret endfunc -cglobal x264_add4x4_idct_mmxext +cglobal x264_add4x4_idct_mmx ALIGN 16 ;----------------------------------------------------------------------------- -; void x264_add4x4_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4] ) +; void x264_add4x4_idct_mmx( uint8_t *p_dst, int i_dst, int16_t dct[4][4] ) ;----------------------------------------------------------------------------- -x264_add4x4_idct_mmxext: +x264_add4x4_idct_mmx: ; Load dct coeffs movq mm0, [parm3q+ 0] ; dct movq mm1, [parm3q+ 8] diff --git a/common/amd64/mc-a.asm b/common/amd64/mc-a.asm index c6f2c820..b7022320 100644 --- a/common/amd64/mc-a.asm +++ b/common/amd64/mc-a.asm @@ -65,9 +65,9 @@ cglobal x264_pixel_avg_weight_4x4_mmxext cglobal x264_pixel_avg_weight_w8_mmxext cglobal x264_pixel_avg_weight_w16_mmxext -cglobal x264_mc_copy_w4_mmxext -cglobal x264_mc_copy_w8_mmxext -cglobal x264_mc_copy_w16_mmxext +cglobal x264_mc_copy_w4_mmx +cglobal x264_mc_copy_w8_mmx +cglobal x264_mc_copy_w16_mmx cglobal x264_mc_copy_w16_sse2 cglobal x264_mc_chroma_mmxext @@ -288,10 +288,10 @@ x264_pixel_avg_weight_4x4_mmxext: ALIGN 16 ;----------------------------------------------------------------------------- -; void x264_mc_copy_w4_mmxext( uint8_t *dst, int i_dst_stride, -; uint8_t *src, int i_src_stride, int i_height ) +; void x264_mc_copy_w4_mmx( uint8_t *dst, int i_dst_stride, +; uint8_t *src, int i_src_stride, int i_height ) ;----------------------------------------------------------------------------- -x264_mc_copy_w4_mmxext: +x264_mc_copy_w4_mmx: mov eax, parm5d ; i_height ALIGN 4 @@ -310,10 +310,10 @@ ALIGN 4 ALIGN 16 ;----------------------------------------------------------------------------- -; void x264_mc_copy_w8_mmxext( uint8_t *dst, int i_dst_stride, -; uint8_t *src, int i_src_stride, int i_height ) +; void x264_mc_copy_w8_mmx( uint8_t *dst, int i_dst_stride, +; uint8_t *src, int i_src_stride, int i_height ) ;----------------------------------------------------------------------------- -x264_mc_copy_w8_mmxext: +x264_mc_copy_w8_mmx: mov eax, parm5d ; i_height lea r10, [parm4q+parm4q*2] ; 3 * i_src_stride @@ -339,10 +339,10 @@ ALIGN 4 ALIGN 16 ;----------------------------------------------------------------------------- -; void x264_mc_copy_w16_mmxext( uint8_t *dst, int i_dst_stride, -; uint8_t *src, int i_src_stride, int i_height ) +; void x264_mc_copy_w16_mmx( uint8_t *dst, int i_dst_stride, +; uint8_t *src, int i_src_stride, int i_height ) ;----------------------------------------------------------------------------- -x264_mc_copy_w16_mmxext: +x264_mc_copy_w16_mmx: mov eax, parm5d ; i_height lea r10, [parm4q+parm4q*2] ; 3 * i_src_stride diff --git a/common/amd64/mc-a2.asm b/common/amd64/mc-a2.asm index 67d4cfcb..96f32fd4 100644 --- a/common/amd64/mc-a2.asm +++ b/common/amd64/mc-a2.asm @@ -276,9 +276,7 @@ x264_center_filter_mmxext : jnz .loopcx2 add r10, r11 ; dst2 += dst2_stride - dec r15 ; height - test r15, r15 jnz .loopcy lea rsp, [rbp] @@ -326,7 +324,6 @@ x264_horizontal_filter_mmxext : loophy: - dec rcx xor rax, rax loophx: @@ -365,7 +362,7 @@ loophx: add rdx, r11 ; src_pitch add r9, r10 ; dst_pitch - test rcx, rcx + dec rcx jnz loophy ret diff --git a/common/amd64/pixel-a.asm b/common/amd64/pixel-a.asm index da5b074a..d107a37f 100644 --- a/common/amd64/pixel-a.asm +++ b/common/amd64/pixel-a.asm @@ -266,13 +266,13 @@ cglobal x264_pixel_sad_pde_16x16_mmxext cglobal x264_pixel_sad_pde_16x8_mmxext cglobal x264_pixel_sad_pde_8x16_mmxext -cglobal x264_pixel_ssd_16x16_mmxext -cglobal x264_pixel_ssd_16x8_mmxext -cglobal x264_pixel_ssd_8x16_mmxext -cglobal x264_pixel_ssd_8x8_mmxext -cglobal x264_pixel_ssd_8x4_mmxext -cglobal x264_pixel_ssd_4x8_mmxext -cglobal x264_pixel_ssd_4x4_mmxext +cglobal x264_pixel_ssd_16x16_mmx +cglobal x264_pixel_ssd_16x8_mmx +cglobal x264_pixel_ssd_8x16_mmx +cglobal x264_pixel_ssd_8x8_mmx +cglobal x264_pixel_ssd_8x4_mmx +cglobal x264_pixel_ssd_4x8_mmx +cglobal x264_pixel_ssd_4x4_mmx cglobal x264_pixel_satd_4x4_mmxext cglobal x264_pixel_satd_4x8_mmxext @@ -470,22 +470,22 @@ x264_pixel_sad_pde_8x16_mmxext: ALIGN 16 ;----------------------------------------------------------------------------- -; int x264_pixel_ssd_16x16_mmxext (uint8_t *, int, uint8_t *, int ) +; int x264_pixel_ssd_16x16_mmx (uint8_t *, int, uint8_t *, int ) ;----------------------------------------------------------------------------- -x264_pixel_ssd_16x16_mmxext: +x264_pixel_ssd_16x16_mmx: SSD_START SSD_INC_8x16P SSD_INC_8x16P SSD_END ALIGN 16 -x264_pixel_ssd_16x8_mmxext: +x264_pixel_ssd_16x8_mmx: SSD_START SSD_INC_8x16P SSD_END ALIGN 16 -x264_pixel_ssd_8x16_mmxext: +x264_pixel_ssd_8x16_mmx: SSD_START SSD_INC_4x8P SSD_INC_4x8P @@ -494,27 +494,27 @@ x264_pixel_ssd_8x16_mmxext: SSD_END ALIGN 16 -x264_pixel_ssd_8x8_mmxext: +x264_pixel_ssd_8x8_mmx: SSD_START SSD_INC_4x8P SSD_INC_4x8P SSD_END ALIGN 16 -x264_pixel_ssd_8x4_mmxext: +x264_pixel_ssd_8x4_mmx: SSD_START SSD_INC_4x8P SSD_END ALIGN 16 -x264_pixel_ssd_4x8_mmxext: +x264_pixel_ssd_4x8_mmx: SSD_START SSD_INC_4x4P SSD_INC_4x4P SSD_END ALIGN 16 -x264_pixel_ssd_4x4_mmxext: +x264_pixel_ssd_4x4_mmx: SSD_START SSD_INC_4x4P SSD_END diff --git a/common/dct.c b/common/dct.c index 6212e323..a85e8a9e 100644 --- a/common/dct.c +++ b/common/dct.c @@ -397,23 +397,20 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) dctf->idct2x2dc = dct2x2dc; #ifdef HAVE_MMXEXT - if( cpu&X264_CPU_MMXEXT ) + if( cpu&X264_CPU_MMX ) { - dctf->sub4x4_dct = x264_sub4x4_dct_mmxext; - dctf->sub8x8_dct = x264_sub8x8_dct_mmxext; - dctf->sub16x16_dct = x264_sub16x16_dct_mmxext; + dctf->sub4x4_dct = x264_sub4x4_dct_mmx; + dctf->sub8x8_dct = x264_sub8x8_dct_mmx; + dctf->sub16x16_dct = x264_sub16x16_dct_mmx; - dctf->add4x4_idct = x264_add4x4_idct_mmxext; - dctf->add8x8_idct = x264_add8x8_idct_mmxext; - dctf->add16x16_idct = x264_add16x16_idct_mmxext; + dctf->add4x4_idct = x264_add4x4_idct_mmx; + dctf->add8x8_idct = x264_add8x8_idct_mmx; + dctf->add16x16_idct = x264_add16x16_idct_mmx; - dctf->dct4x4dc = x264_dct4x4dc_mmxext; - dctf->idct4x4dc = x264_idct4x4dc_mmxext; - } + dctf->dct4x4dc = x264_dct4x4dc_mmx; + dctf->idct4x4dc = x264_idct4x4dc_mmx; #ifndef ARCH_X86_64 - if( cpu&X264_CPU_MMX ) - { dctf->sub8x8_dct8 = x264_sub8x8_dct8_mmx; dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx; @@ -433,6 +430,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) dctf->add16x16_idct8= x264_add16x16_idct8_sse2; } #endif + /* FIXME altivec dct is not transposed yet #ifdef ARCH_PPC if( cpu&X264_CPU_ALTIVEC ) diff --git a/common/i386/dct-a.asm b/common/i386/dct-a.asm index 504e1336..dfafff96 100644 --- a/common/i386/dct-a.asm +++ b/common/i386/dct-a.asm @@ -143,13 +143,13 @@ x264_mmx_p2n2p1p1: dw 2, -2, 1, 1 SECTION .text -cglobal x264_dct4x4dc_mmxext +cglobal x264_dct4x4dc_mmx ALIGN 16 ;----------------------------------------------------------------------------- -; void __cdecl dct4x4dc( int16_t d[4][4] ) +; void __cdecl x264_dct4x4dc_mmx( int16_t d[4][4] ) ;----------------------------------------------------------------------------- -x264_dct4x4dc_mmxext: +x264_dct4x4dc_mmx: mov eax, [esp+ 4] movq mm0, [eax+ 0] movq mm1, [eax+ 8] @@ -183,13 +183,13 @@ x264_dct4x4dc_mmxext: picpop ebx ret -cglobal x264_idct4x4dc_mmxext +cglobal x264_idct4x4dc_mmx ALIGN 16 ;----------------------------------------------------------------------------- -; void __cdecl x264_idct4x4dc_mmxext( int16_t d[4][4] ) +; void __cdecl x264_idct4x4dc_mmx( int16_t d[4][4] ) ;----------------------------------------------------------------------------- -x264_idct4x4dc_mmxext: +x264_idct4x4dc_mmx: mov eax, [esp+ 4] movq mm0, [eax+ 0] movq mm1, [eax+ 8] @@ -210,13 +210,13 @@ x264_idct4x4dc_mmxext: movq [eax+24], mm4 ret -cglobal x264_sub4x4_dct_mmxext +cglobal x264_sub4x4_dct_mmx ALIGN 16 ;----------------------------------------------------------------------------- -; void __cdecl x264_sub4x4_dct_mmxext( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ) +; void __cdecl x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ) ;----------------------------------------------------------------------------- -x264_sub4x4_dct_mmxext: +x264_sub4x4_dct_mmx: push ebx mov eax, [esp+12] ; pix1 mov ebx, [esp+16] ; i_pix1 @@ -255,13 +255,13 @@ x264_sub4x4_dct_mmxext: pop ebx ret -cglobal x264_add4x4_idct_mmxext +cglobal x264_add4x4_idct_mmx ALIGN 16 ;----------------------------------------------------------------------------- -; void __cdecl x264_add4x4_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4] ) +; void __cdecl x264_add4x4_idct_mmx( uint8_t *p_dst, int i_dst, int16_t dct[4][4] ) ;----------------------------------------------------------------------------- -x264_add4x4_idct_mmxext: +x264_add4x4_idct_mmx: ; Load dct coeffs mov eax, [esp+12] ; dct movq mm0, [eax+ 0] diff --git a/common/i386/dct-c.c b/common/i386/dct-c.c index 3ca08b13..28b2fbed 100644 --- a/common/i386/dct-c.c +++ b/common/i386/dct-c.c @@ -34,20 +34,20 @@ #include "dct.h" -void x264_sub8x8_dct_mmxext( int16_t dct[4][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ) +void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ) { - x264_sub4x4_dct_mmxext( dct[0], &pix1[0], i_pix1, &pix2[0], i_pix2 ); - x264_sub4x4_dct_mmxext( dct[1], &pix1[4], i_pix1, &pix2[4], i_pix2 ); - x264_sub4x4_dct_mmxext( dct[2], &pix1[4*i_pix1+0], i_pix1, &pix2[4*i_pix2+0], i_pix2 ); - x264_sub4x4_dct_mmxext( dct[3], &pix1[4*i_pix1+4], i_pix1, &pix2[4*i_pix2+4], i_pix2 ); + x264_sub4x4_dct_mmx( dct[0], &pix1[0], i_pix1, &pix2[0], i_pix2 ); + x264_sub4x4_dct_mmx( dct[1], &pix1[4], i_pix1, &pix2[4], i_pix2 ); + x264_sub4x4_dct_mmx( dct[2], &pix1[4*i_pix1+0], i_pix1, &pix2[4*i_pix2+0], i_pix2 ); + x264_sub4x4_dct_mmx( dct[3], &pix1[4*i_pix1+4], i_pix1, &pix2[4*i_pix2+4], i_pix2 ); } -void x264_sub16x16_dct_mmxext( int16_t dct[16][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ) +void x264_sub16x16_dct_mmx( int16_t dct[16][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ) { - x264_sub8x8_dct_mmxext( &dct[ 0], &pix1[0], i_pix1, &pix2[0], i_pix2 ); - x264_sub8x8_dct_mmxext( &dct[ 4], &pix1[8], i_pix1, &pix2[8], i_pix2 ); - x264_sub8x8_dct_mmxext( &dct[ 8], &pix1[8*i_pix1], i_pix1, &pix2[8*i_pix2], i_pix2 ); - x264_sub8x8_dct_mmxext( &dct[12], &pix1[8*i_pix1+8], i_pix1, &pix2[8*i_pix2+8], i_pix2 ); + x264_sub8x8_dct_mmx( &dct[ 0], &pix1[0], i_pix1, &pix2[0], i_pix2 ); + x264_sub8x8_dct_mmx( &dct[ 4], &pix1[8], i_pix1, &pix2[8], i_pix2 ); + x264_sub8x8_dct_mmx( &dct[ 8], &pix1[8*i_pix1], i_pix1, &pix2[8*i_pix2], i_pix2 ); + x264_sub8x8_dct_mmx( &dct[12], &pix1[8*i_pix1+8], i_pix1, &pix2[8*i_pix2+8], i_pix2 ); } @@ -56,20 +56,20 @@ void x264_sub16x16_dct_mmxext( int16_t dct[16][4][4], uint8_t *pix1, int i_pix1, * addXxX_idct: ****************************************************************************/ -void x264_add8x8_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] ) +void x264_add8x8_idct_mmx( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] ) { - x264_add4x4_idct_mmxext( p_dst, i_dst, dct[0] ); - x264_add4x4_idct_mmxext( &p_dst[4], i_dst, dct[1] ); - x264_add4x4_idct_mmxext( &p_dst[4*i_dst+0], i_dst, dct[2] ); - x264_add4x4_idct_mmxext( &p_dst[4*i_dst+4], i_dst, dct[3] ); + x264_add4x4_idct_mmx( p_dst, i_dst, dct[0] ); + x264_add4x4_idct_mmx( &p_dst[4], i_dst, dct[1] ); + x264_add4x4_idct_mmx( &p_dst[4*i_dst+0], i_dst, dct[2] ); + x264_add4x4_idct_mmx( &p_dst[4*i_dst+4], i_dst, dct[3] ); } -void x264_add16x16_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] ) +void x264_add16x16_idct_mmx( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] ) { - x264_add8x8_idct_mmxext( &p_dst[0], i_dst, &dct[0] ); - x264_add8x8_idct_mmxext( &p_dst[8], i_dst, &dct[4] ); - x264_add8x8_idct_mmxext( &p_dst[8*i_dst], i_dst, &dct[8] ); - x264_add8x8_idct_mmxext( &p_dst[8*i_dst+8], i_dst, &dct[12] ); + x264_add8x8_idct_mmx( &p_dst[0], i_dst, &dct[0] ); + x264_add8x8_idct_mmx( &p_dst[8], i_dst, &dct[4] ); + x264_add8x8_idct_mmx( &p_dst[8*i_dst], i_dst, &dct[8] ); + x264_add8x8_idct_mmx( &p_dst[8*i_dst+8], i_dst, &dct[12] ); } /*********************** diff --git a/common/i386/dct.h b/common/i386/dct.h index f89b23ba..d7974167 100644 --- a/common/i386/dct.h +++ b/common/i386/dct.h @@ -24,16 +24,16 @@ #ifndef _I386_DCT_H #define _I386_DCT_H 1 -void x264_sub4x4_dct_mmxext( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ); -void x264_sub8x8_dct_mmxext( int16_t dct[4][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ); -void x264_sub16x16_dct_mmxext( int16_t dct[16][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ); +void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ); +void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ); +void x264_sub16x16_dct_mmx( int16_t dct[16][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ); -void x264_add4x4_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4] ); -void x264_add8x8_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] ); -void x264_add16x16_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] ); +void x264_add4x4_idct_mmx( uint8_t *p_dst, int i_dst, int16_t dct[4][4] ); +void x264_add8x8_idct_mmx( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] ); +void x264_add16x16_idct_mmx( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] ); -void x264_dct4x4dc_mmxext( int16_t d[4][4] ); -void x264_idct4x4dc_mmxext( int16_t d[4][4] ); +void x264_dct4x4dc_mmx( int16_t d[4][4] ); +void x264_idct4x4dc_mmx( int16_t d[4][4] ); void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ); void x264_sub16x16_dct8_mmx( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ); diff --git a/common/i386/mc-a.asm b/common/i386/mc-a.asm index 9f58bf86..6193dab3 100644 --- a/common/i386/mc-a.asm +++ b/common/i386/mc-a.asm @@ -65,9 +65,9 @@ cglobal x264_pixel_avg_weight_4x4_mmxext cglobal x264_pixel_avg_weight_w8_mmxext cglobal x264_pixel_avg_weight_w16_mmxext -cglobal x264_mc_copy_w4_mmxext -cglobal x264_mc_copy_w8_mmxext -cglobal x264_mc_copy_w16_mmxext +cglobal x264_mc_copy_w4_mmx +cglobal x264_mc_copy_w8_mmx +cglobal x264_mc_copy_w16_mmx cglobal x264_mc_copy_w16_sse2 cglobal x264_mc_chroma_mmxext @@ -345,10 +345,10 @@ x264_pixel_avg_weight_4x4_mmxext: ALIGN 16 ;----------------------------------------------------------------------------- -; void x264_mc_copy_w4_mmxext( uint8_t *src, int i_src_stride, -; uint8_t *dst, int i_dst_stride, int i_height ) +; void x264_mc_copy_w4_mmx( uint8_t *src, int i_src_stride, +; uint8_t *dst, int i_dst_stride, int i_height ) ;----------------------------------------------------------------------------- -x264_mc_copy_w4_mmxext: +x264_mc_copy_w4_mmx: push ebx push esi push edi @@ -377,10 +377,10 @@ ALIGN 4 ALIGN 16 ;----------------------------------------------------------------------------- -; void x264_mc_copy_w8_mmxext( uint8_t *src, int i_src_stride, -; uint8_t *dst, int i_dst_stride, int i_height ) +; void x264_mc_copy_w8_mmx( uint8_t *src, int i_src_stride, +; uint8_t *dst, int i_dst_stride, int i_height ) ;----------------------------------------------------------------------------- -x264_mc_copy_w8_mmxext: +x264_mc_copy_w8_mmx: push ebx push esi push edi @@ -415,10 +415,10 @@ ALIGN 4 ALIGN 16 ;----------------------------------------------------------------------------- -; void x264_mc_copy_w16_mmxext( uint8_t *src, int i_src_stride, -; uint8_t *dst, int i_dst_stride, int i_height ) +; void x264_mc_copy_w16_mmx( uint8_t *src, int i_src_stride, +; uint8_t *dst, int i_dst_stride, int i_height ) ;----------------------------------------------------------------------------- -x264_mc_copy_w16_mmxext: +x264_mc_copy_w16_mmx: push ebx push esi push edi diff --git a/common/i386/mc-a2.asm b/common/i386/mc-a2.asm index 49ba566a..3c2020d5 100644 --- a/common/i386/mc-a2.asm +++ b/common/i386/mc-a2.asm @@ -287,10 +287,7 @@ loopcx2: add edi, [picesp + tdstp2] mov [picesp + tdst2], edi - mov ebp, [picesp + theight] - dec ebp - test ebp, ebp - mov [picesp + theight], ebp + dec dword [picesp + theight] jnz loopcy picpop ebx @@ -332,7 +329,6 @@ x264_horizontal_filter_mmxext : loophy: - dec ecx xor eax, eax loophx: @@ -371,7 +367,7 @@ loophx: add esi, [esp + 24] ; src_pitch add edi, [esp + 16] ; dst_pitch - test ecx, ecx + dec ecx jnz loophy pop esi diff --git a/common/i386/mc-c.c b/common/i386/mc-c.c index e261a0be..c7e8a4a5 100644 --- a/common/i386/mc-c.c +++ b/common/i386/mc-c.c @@ -35,9 +35,9 @@ extern void x264_pixel_avg_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int ); extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int ); extern void x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int ); -extern void x264_mc_copy_w4_mmxext( uint8_t *, int, uint8_t *, int, int ); -extern void x264_mc_copy_w8_mmxext( uint8_t *, int, uint8_t *, int, int ); -extern void x264_mc_copy_w16_mmxext( uint8_t *, int, uint8_t *, int, int ); +extern void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int ); +extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int ); +extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int ); extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int ); #define AVG(W,H) \ @@ -73,13 +73,13 @@ static void (* const x264_pixel_avg_wtab_mmxext[5])( uint8_t *, int, uint8_t *, NULL, x264_pixel_avg_w16_mmxext }; -static void (* const x264_mc_copy_wtab_mmxext[5])( uint8_t *, int, uint8_t *, int, int ) = +static void (* const x264_mc_copy_wtab_mmx[5])( uint8_t *, int, uint8_t *, int, int ) = { NULL, - x264_mc_copy_w4_mmxext, - x264_mc_copy_w8_mmxext, + x264_mc_copy_w4_mmx, + x264_mc_copy_w8_mmx, NULL, - x264_mc_copy_w16_mmxext + x264_mc_copy_w16_mmx }; static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2}; @@ -102,7 +102,7 @@ void mc_luma_mmx( uint8_t *src[4], int i_src_stride, } else { - x264_mc_copy_wtab_mmxext[i_width>>2]( + x264_mc_copy_wtab_mmx[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, i_height ); } } @@ -155,9 +155,9 @@ void x264_mc_mmxext_init( x264_mc_functions_t *pf ) pf->avg_weight[PIXEL_4x4] = x264_pixel_avg_weight_4x4_mmxext; // avg_weight_4x8 is rare and 4x2 is not used - pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmxext; - pf->copy[PIXEL_8x8] = x264_mc_copy_w8_mmxext; - pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmxext; + pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx; + pf->copy[PIXEL_8x8] = x264_mc_copy_w8_mmx; + pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx; } void x264_mc_sse2_init( x264_mc_functions_t *pf ) { diff --git a/common/i386/pixel-a.asm b/common/i386/pixel-a.asm index f9df7619..8b635470 100644 --- a/common/i386/pixel-a.asm +++ b/common/i386/pixel-a.asm @@ -266,13 +266,13 @@ cglobal x264_pixel_sad_pde_16x16_mmxext cglobal x264_pixel_sad_pde_16x8_mmxext cglobal x264_pixel_sad_pde_8x16_mmxext -cglobal x264_pixel_ssd_16x16_mmxext -cglobal x264_pixel_ssd_16x8_mmxext -cglobal x264_pixel_ssd_8x16_mmxext -cglobal x264_pixel_ssd_8x8_mmxext -cglobal x264_pixel_ssd_8x4_mmxext -cglobal x264_pixel_ssd_4x8_mmxext -cglobal x264_pixel_ssd_4x4_mmxext +cglobal x264_pixel_ssd_16x16_mmx +cglobal x264_pixel_ssd_16x8_mmx +cglobal x264_pixel_ssd_8x16_mmx +cglobal x264_pixel_ssd_8x8_mmx +cglobal x264_pixel_ssd_8x4_mmx +cglobal x264_pixel_ssd_4x8_mmx +cglobal x264_pixel_ssd_4x4_mmx cglobal x264_pixel_satd_4x4_mmxext cglobal x264_pixel_satd_4x8_mmxext @@ -473,22 +473,22 @@ x264_pixel_sad_pde_8x16_mmxext: ALIGN 16 ;----------------------------------------------------------------------------- -; int __cdecl x264_pixel_ssd_16x16_mmxext (uint8_t *, int, uint8_t *, int ) +; int __cdecl x264_pixel_ssd_16x16_mmx (uint8_t *, int, uint8_t *, int ) ;----------------------------------------------------------------------------- -x264_pixel_ssd_16x16_mmxext: +x264_pixel_ssd_16x16_mmx: SSD_START SSD_INC_8x16P SSD_INC_8x16P SSD_END ALIGN 16 -x264_pixel_ssd_16x8_mmxext: +x264_pixel_ssd_16x8_mmx: SSD_START SSD_INC_8x16P SSD_END ALIGN 16 -x264_pixel_ssd_8x16_mmxext: +x264_pixel_ssd_8x16_mmx: SSD_START SSD_INC_4x8P SSD_INC_4x8P @@ -497,27 +497,27 @@ x264_pixel_ssd_8x16_mmxext: SSD_END ALIGN 16 -x264_pixel_ssd_8x8_mmxext: +x264_pixel_ssd_8x8_mmx: SSD_START SSD_INC_4x8P SSD_INC_4x8P SSD_END ALIGN 16 -x264_pixel_ssd_8x4_mmxext: +x264_pixel_ssd_8x4_mmx: SSD_START SSD_INC_4x8P SSD_END ALIGN 16 -x264_pixel_ssd_4x8_mmxext: +x264_pixel_ssd_4x8_mmx: SSD_START SSD_INC_4x4P SSD_INC_4x4P SSD_END ALIGN 16 -x264_pixel_ssd_4x4_mmxext: +x264_pixel_ssd_4x4_mmx: SSD_START SSD_INC_4x4P SSD_END diff --git a/common/i386/pixel.h b/common/i386/pixel.h index 94778a6c..df7ea616 100644 --- a/common/i386/pixel.h +++ b/common/i386/pixel.h @@ -36,13 +36,13 @@ int x264_pixel_sad_pde_16x16_mmxext( uint8_t *, int, uint8_t *, int, int ); int x264_pixel_sad_pde_16x8_mmxext( uint8_t *, int, uint8_t *, int, int ); int x264_pixel_sad_pde_8x16_mmxext( uint8_t *, int, uint8_t *, int, int ); -int x264_pixel_ssd_16x16_mmxext( uint8_t *, int, uint8_t *, int ); -int x264_pixel_ssd_16x8_mmxext( uint8_t *, int, uint8_t *, int ); -int x264_pixel_ssd_8x16_mmxext( uint8_t *, int, uint8_t *, int ); -int x264_pixel_ssd_8x8_mmxext( uint8_t *, int, uint8_t *, int ); -int x264_pixel_ssd_8x4_mmxext( uint8_t *, int, uint8_t *, int ); -int x264_pixel_ssd_4x8_mmxext( uint8_t *, int, uint8_t *, int ); -int x264_pixel_ssd_4x4_mmxext( uint8_t *, int, uint8_t *, int ); +int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int ); +int x264_pixel_ssd_16x8_mmx( uint8_t *, int, uint8_t *, int ); +int x264_pixel_ssd_8x16_mmx( uint8_t *, int, uint8_t *, int ); +int x264_pixel_ssd_8x8_mmx( uint8_t *, int, uint8_t *, int ); +int x264_pixel_ssd_8x4_mmx( uint8_t *, int, uint8_t *, int ); +int x264_pixel_ssd_4x8_mmx( uint8_t *, int, uint8_t *, int ); +int x264_pixel_ssd_4x4_mmx( uint8_t *, int, uint8_t *, int ); int x264_pixel_satd_16x16_mmxext( uint8_t *, int, uint8_t *, int ); int x264_pixel_satd_16x8_mmxext( uint8_t *, int, uint8_t *, int ); diff --git a/common/pixel.c b/common/pixel.c index 46c1d514..9eb4f933 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -328,6 +328,17 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->sa8d[PIXEL_8x8] = pixel_sa8d_8x8; #ifdef HAVE_MMXEXT + if( cpu&X264_CPU_MMX ) + { + pixf->ssd[PIXEL_16x16] = x264_pixel_ssd_16x16_mmx; + pixf->ssd[PIXEL_16x8] = x264_pixel_ssd_16x8_mmx; + pixf->ssd[PIXEL_8x16] = x264_pixel_ssd_8x16_mmx; + pixf->ssd[PIXEL_8x8] = x264_pixel_ssd_8x8_mmx; + pixf->ssd[PIXEL_8x4] = x264_pixel_ssd_8x4_mmx; + pixf->ssd[PIXEL_4x8] = x264_pixel_ssd_4x8_mmx; + pixf->ssd[PIXEL_4x4] = x264_pixel_ssd_4x4_mmx; + } + if( cpu&X264_CPU_MMXEXT ) { pixf->sad[PIXEL_16x16] = x264_pixel_sad_16x16_mmxext; @@ -342,14 +353,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->sad_pde[PIXEL_16x8 ] = x264_pixel_sad_pde_16x8_mmxext; pixf->sad_pde[PIXEL_8x16 ] = x264_pixel_sad_pde_8x16_mmxext; - pixf->ssd[PIXEL_16x16] = x264_pixel_ssd_16x16_mmxext; - pixf->ssd[PIXEL_16x8] = x264_pixel_ssd_16x8_mmxext; - pixf->ssd[PIXEL_8x16] = x264_pixel_ssd_8x16_mmxext; - pixf->ssd[PIXEL_8x8] = x264_pixel_ssd_8x8_mmxext; - pixf->ssd[PIXEL_8x4] = x264_pixel_ssd_8x4_mmxext; - pixf->ssd[PIXEL_4x8] = x264_pixel_ssd_4x8_mmxext; - pixf->ssd[PIXEL_4x4] = x264_pixel_ssd_4x4_mmxext; - pixf->satd[PIXEL_16x16]= x264_pixel_satd_16x16_mmxext; pixf->satd[PIXEL_16x8] = x264_pixel_satd_16x8_mmxext; pixf->satd[PIXEL_8x16] = x264_pixel_satd_8x16_mmxext; @@ -388,10 +391,10 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) } #endif #ifdef ARCH_UltraSparc - pixf->sad[PIXEL_8x8] = x264_pixel_sad_8x8_vis; - pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_vis; - pixf->sad[PIXEL_16x8] = x264_pixel_sad_16x8_vis; - pixf->sad[PIXEL_16x16] = x264_pixel_sad_16x16_vis; + pixf->sad[PIXEL_8x8] = x264_pixel_sad_8x8_vis; + pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_vis; + pixf->sad[PIXEL_16x8] = x264_pixel_sad_16x8_vis; + pixf->sad[PIXEL_16x16] = x264_pixel_sad_16x16_vis; #endif }