From e3a07e098f96dfc2dbde8da6cad77ed012d4397e Mon Sep 17 00:00:00 2001
From: Loren Merritt
Date: Wed, 4 Apr 2007 00:48:55 +0000
Subject: [PATCH] cosmetics in asm macros

git-svn-id: svn://svn.videolan.org/x264/trunk@640 df754926-b1dd-0310-bc7b-ec298dee348c
---
 common/amd64/amd64inc.asm   |   2 +
 common/amd64/cpu-a.asm      |  13 +---
 common/amd64/dct-a.asm      |  36 ++---
 common/amd64/deblock-a.asm  |  24 ++-----
 common/amd64/mc-a.asm       |  66 +++++---------
 common/amd64/mc-a2.asm      |  10 +--
 common/amd64/pixel-a.asm    | 104 +++++-----------------
 common/amd64/pixel-sse2.asm |  66 +++++---------
 common/amd64/predict-a.asm  |  89 ++++++--------------
 common/amd64/quant-a.asm    |  74 ++++++----------
 common/i386/cpu-a.asm       |  13 +---
 common/i386/dct-a.asm       |  40 +++------
 common/i386/deblock-a.asm   |  24 ++-----
 common/i386/i386inc.asm     |   2 +
 common/i386/mc-a.asm        |  64 +++++--------
 common/i386/mc-a2.asm       |  10 +--
 common/i386/pixel-a.asm     | 121 +++++-----------------
 common/i386/pixel-sse2.asm  |  55 ++++-------
 common/i386/predict-a.asm   |  66 ++++--------
 common/i386/quant-a.asm     |  61 +++++-------
 20 files changed, 201 insertions(+), 739 deletions(-)

diff --git a/common/amd64/amd64inc.asm b/common/amd64/amd64inc.asm
index e9409965..78f8ad9e 100644
--- a/common/amd64/amd64inc.asm
+++ b/common/amd64/amd64inc.asm
@@ -37,6 +37,8 @@ BITS 64
     %ifdef WIN64
         %define %1 pad %1
     %endif
+    align 16
+    %1:
 %endmacro
 
 %macro pad 1
diff --git a/common/amd64/cpu-a.asm b/common/amd64/cpu-a.asm
index be1ab323..7137a4c6 100644
--- a/common/amd64/cpu-a.asm
+++ b/common/amd64/cpu-a.asm
@@ -35,15 +35,10 @@ BITS 64
 
 SECTION .text
 
-cglobal x264_cpu_cpuid_test
-cglobal x264_cpu_cpuid
-cglobal x264_emms
-
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int x264_cpu_cpuid_test( void ) return 0 if unsupported
 ;-----------------------------------------------------------------------------
-x264_cpu_cpuid_test:
+cglobal x264_cpu_cpuid_test
     firstpush rbx
     pushreg rbx
     push rbp
@@ -69,11 +64,10 @@ x264_cpu_cpuid_test:
     ret
     endfunc
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
 ;-----------------------------------------------------------------------------
-x264_cpu_cpuid:
+cglobal x264_cpu_cpuid
     firstpush rbx
     pushreg rbx
     endprolog
@@ -97,11 +91,10 @@ x264_cpu_cpuid:
     ret
     endfunc
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_emms( void )
 ;-----------------------------------------------------------------------------
-x264_emms:
+cglobal x264_emms
     emms
     ret
 
diff --git a/common/amd64/dct-a.asm b/common/amd64/dct-a.asm
index de449cf0..5c859d2c 100644
--- a/common/amd64/dct-a.asm
+++ b/common/amd64/dct-a.asm
@@ -158,13 +158,10 @@ pw_32: times 8 dw 32
 
 SECTION .text
 
-cglobal x264_dct4x4dc_mmx
-
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_dct4x4dc_mmx( int16_t d[4][4] )
 ;-----------------------------------------------------------------------------
-x264_dct4x4dc_mmx:
+cglobal x264_dct4x4dc_mmx
     movq mm0, [parm1q+ 0]
     movq mm1, [parm1q+ 8]
     movq mm2, [parm1q+16]
@@ -193,13 +190,10 @@ x264_dct4x4dc_mmx:
     movq [parm1q+24],mm4
     ret
 
-cglobal x264_idct4x4dc_mmx
-
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_idct4x4dc_mmx( int16_t d[4][4] )
 ;-----------------------------------------------------------------------------
-x264_idct4x4dc_mmx:
+cglobal x264_idct4x4dc_mmx
     movq mm0, [parm1q+ 0]
     movq mm1, [parm1q+ 8]
     movq mm2, [parm1q+16]
@@ -219,13 +213,10 @@ x264_idct4x4dc_mmx:
     movq [parm1q+24], mm4
     ret
 
-cglobal x264_sub4x4_dct_mmx
-
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
 ;-----------------------------------------------------------------------------
-x264_sub4x4_dct_mmx:
+cglobal x264_sub4x4_dct_mmx
     MMX_ZERO mm7
 
     ; Load 4 lines
@@ -253,13 +244,10 @@ x264_sub4x4_dct_mmx:
     movq [parm1q+24], mm0
     ret
 
-cglobal x264_add4x4_idct_mmx
-
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
 ;-----------------------------------------------------------------------------
-x264_add4x4_idct_mmx:
+cglobal x264_add4x4_idct_mmx
     ; Load dct coeffs
     movq mm0, [parm2q+ 0] ; dct
     movq mm1, [parm2q+ 8]
@@ -347,13 +335,10 @@ x264_add4x4_idct_mmx:
     psubw %4, %1 ; %4=b5
 %endmacro
 
-cglobal x264_sub8x8_dct8_sse2
-
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
 ;-----------------------------------------------------------------------------
-x264_sub8x8_dct8_sse2:
+cglobal x264_sub8x8_dct8_sse2
     MMX_ZERO xmm9
 
     MMX_LOAD_DIFF_8P xmm0, xmm8, xmm9, [parm2q+0*FENC_STRIDE], [parm3q+0*FDEC_STRIDE]
@@ -433,13 +418,10 @@
     MMX_SUMSUB_BA %4, %5 ; %4=c3, %5=c4
 %endmacro
 
-cglobal x264_add8x8_idct8_sse2
-
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
 ;-----------------------------------------------------------------------------
-x264_add8x8_idct8_sse2:
+cglobal x264_add8x8_idct8_sse2
     movdqa xmm0, [parm2q+0x00]
     movdqa xmm1, [parm2q+0x10]
     movdqa xmm2, [parm2q+0x20]
@@ -472,9 +454,7 @@
 ; uint8_t *pix1, uint8_t *pix2 )
 ;-----------------------------------------------------------------------------
 %macro SUB_NxN_DCT 6
-ALIGN 16
 cglobal %1
-%1:
     call %2
     add parm1q, %3
     add parm2q, %4-%5*FENC_STRIDE
@@ -494,9 +474,7 @@
 ; void __cdecl x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
 ;-----------------------------------------------------------------------------
 %macro ADD_NxN_IDCT 6
-ALIGN 16
 cglobal %1
-%1:
     call %2
     add parm1q, %4-%5*FDEC_STRIDE
     add parm2q, %3
@@ -522,9 +500,7 @@ ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 8
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_zigzag_scan_4x4_field_sse2( int level[16], int16_t dct[4][4] )
 ;-----------------------------------------------------------------------------
-ALIGN 16
 cglobal x264_zigzag_scan_4x4_field_sse2
-x264_zigzag_scan_4x4_field_sse2:
     punpcklwd xmm0, [parm2q]
     punpckhwd xmm1, [parm2q]
     punpcklwd xmm2, [parm2q+16]
diff --git a/common/amd64/deblock-a.asm b/common/amd64/deblock-a.asm
index 85f391cb..8af8cd0c 100644
--- a/common/amd64/deblock-a.asm
+++ b/common/amd64/deblock-a.asm
@@ -30,12 +30,6 @@ pb_03: times 16 db 0x03
 pb_a1: times 16 db 0xa1
 
 SECTION .text
-cglobal x264_deblock_v_luma_sse2
-cglobal x264_deblock_h_luma_sse2
-cglobal x264_deblock_v_chroma_mmxext
-cglobal x264_deblock_h_chroma_mmxext
-cglobal x264_deblock_v_chroma_intra_mmxext
-cglobal x264_deblock_h_chroma_intra_mmxext
 
 ; expands to [base],...,[base+7*stride]
 %define PASS8ROWS(base, base3, stride, stride3) \
@@ -267,11 +261,10 @@
 
 SECTION .text
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-x264_deblock_v_luma_sse2:
+cglobal x264_deblock_v_luma_sse2
     ; rdi = pix
     movsxd rsi, esi ; stride
     dec edx ; alpha-1
@@ -317,11 +310,10 @@ x264_deblock_v_luma_sse2:
 
     ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-x264_deblock_h_luma_sse2:
+cglobal x264_deblock_h_luma_sse2
     movsxd r10, esi
     lea r11, [r10+r10*2]
     lea rax, [rdi-4]
@@ -383,11 +375,10 @@ x264_deblock_h_luma_sse2:
     add rdi, r9
 %endmacro
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-x264_deblock_v_chroma_mmxext:
+cglobal x264_deblock_v_chroma_mmxext
     CHROMA_V_START
 
     movq mm0, [rax]
@@ -406,11 +397,10 @@ x264_deblock_v_chroma_mmxext:
 
     ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-x264_deblock_h_chroma_mmxext:
+cglobal x264_deblock_h_chroma_mmxext
     CHROMA_H_START
 
     TRANSPOSE4x8_LOAD PASS8ROWS(rax, rdi, rsi, r9)
@@ -454,11 +444,10 @@ x264_deblock_h_chroma_mmxext:
     paddb mm2, mm6
 %endmacro
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-x264_deblock_v_chroma_intra_mmxext:
+cglobal x264_deblock_v_chroma_intra_mmxext
     CHROMA_V_START
 
     movq mm0, [rax]
@@ -472,11 +461,10 @@ x264_deblock_v_chroma_intra_mmxext:
     movq [rdi], mm2
     ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-x264_deblock_h_chroma_intra_mmxext:
+cglobal x264_deblock_h_chroma_intra_mmxext
     CHROMA_H_START
     TRANSPOSE4x8_LOAD PASS8ROWS(rax, rdi, rsi, r9)
     CHROMA_INTRA_BODY
diff --git a/common/amd64/mc-a.asm b/common/amd64/mc-a.asm
index 7e0bfa27..f1b4bea0 100644
--- a/common/amd64/mc-a.asm
+++ b/common/amd64/mc-a.asm
@@ -56,38 +56,17 @@ pw_64: times 4 dw 64
 
 SECTION .text
 
-cglobal x264_pixel_avg_w4_mmxext
-cglobal x264_pixel_avg_w8_mmxext
-cglobal x264_pixel_avg_w16_mmxext
-cglobal x264_pixel_avg_w20_mmxext
-cglobal x264_pixel_avg_w16_sse2
-
-cglobal x264_pixel_avg_weight_4x4_mmxext
-cglobal x264_pixel_avg_weight_w8_mmxext
-cglobal x264_pixel_avg_weight_w16_mmxext
-
-cglobal x264_mc_copy_w4_mmx
-cglobal x264_mc_copy_w8_mmx
-cglobal x264_mc_copy_w16_mmx
-cglobal x264_mc_copy_w16_sse2
-
-cglobal x264_mc_chroma_mmxext
-
-cglobal x264_prefetch_fenc_mmxext
-cglobal x264_prefetch_ref_mmxext
-
 ;=============================================================================
 ; pixel avg
 ;=============================================================================
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_pixel_avg_w4_mmxext( uint8_t *dst,  int i_dst_stride,
 ;                                uint8_t *src1, int i_src1_stride,
 ;                                uint8_t *src2, int i_src2_stride,
 ;                                int i_height );
 ;-----------------------------------------------------------------------------
-x264_pixel_avg_w4_mmxext:
+cglobal x264_pixel_avg_w4_mmxext
     mov r10, parm5q ; src2
     movsxd r11, parm6d ; i_src2_stride
     mov eax, parm7d ; i_height
@@ -109,14 +88,13 @@ ALIGN 4
 
 
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_pixel_avg_w8_mmxext( uint8_t *dst,  int i_dst_stride,
 ;                                uint8_t *src1, int i_src1_stride,
 ;                                uint8_t *src2, int i_src2_stride,
 ;                                int i_height );
 ;-----------------------------------------------------------------------------
-x264_pixel_avg_w8_mmxext:
+cglobal x264_pixel_avg_w8_mmxext
     mov r10, parm5q ; src2
     movsxd r11, parm6d ; i_src2_stride
     mov eax, parm7d ; i_height
@@ -136,14 +114,13 @@ ALIGN 4
     jg .height_loop
     rep ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_pixel_avg_w16_mmxext( uint8_t *dst,  int i_dst_stride,
 ;                                 uint8_t *src1, int i_src1_stride,
 ;                                 uint8_t *src2, int i_src2_stride,
 ;                                 int i_height );
 ;-----------------------------------------------------------------------------
-x264_pixel_avg_w16_mmxext:
+cglobal x264_pixel_avg_w16_mmxext
     mov r10, parm5q ; src2
     movsxd r11, parm6d ; i_src2_stride
     mov eax, parm7d ; i_height
@@ -163,14 +140,13 @@ ALIGN 4
     jg .height_loop
     rep ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_pixel_avg_w20_mmxext( uint8_t *dst,  int i_dst_stride,
 ;                                 uint8_t *src1, int i_src1_stride,
 ;                                 uint8_t *src2, int i_src2_stride,
 ;                                 int i_height );
 ;-----------------------------------------------------------------------------
-x264_pixel_avg_w20_mmxext:
+cglobal x264_pixel_avg_w20_mmxext
     mov r10, parm5q ; src2
     movsxd r11, parm6d ; i_src2_stride
     mov eax, parm7d ; i_height
@@ -193,14 +169,13 @@ ALIGN 4
     jg .height_loop
     rep ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_pixel_avg_w16_sse2( uint8_t *dst,  int i_dst_stride,
 ;                               uint8_t *src1, int i_src1_stride,
 ;                               uint8_t *src2, int i_src2_stride,
 ;                               int i_height );
 ;-----------------------------------------------------------------------------
-x264_pixel_avg_w16_sse2:
+cglobal x264_pixel_avg_w16_sse2
     mov r10, parm5q ; src2
     movsxd r11, parm6d ; i_src2_stride
     mov eax, parm7d ; i_height
@@ -260,11 +235,10 @@ ALIGN 4
 .height_loop
 %endmacro
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src, int, int i_weight, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_avg_weight_w16_mmxext:
+cglobal x264_pixel_avg_weight_w16_mmxext
     BIWEIGHT_START_MMX
 
     BIWEIGHT_4P_MMX [parm1q ], [parm3q ]
@@ -278,11 +252,10 @@ x264_pixel_avg_weight_w16_mmxext:
     jg .height_loop
     rep ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_avg_weight_w8_mmxext:
+cglobal x264_pixel_avg_weight_w8_mmxext
     BIWEIGHT_START_MMX
 
     BIWEIGHT_4P_MMX [parm1q ], [parm3q ]
@@ -294,11 +267,10 @@ x264_pixel_avg_weight_w8_mmxext:
     jg .height_loop
     rep ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_avg_weight_4x4_mmxext:
+cglobal x264_pixel_avg_weight_4x4_mmxext
     BIWEIGHT_START_MMX
     BIWEIGHT_4P_MMX [parm1q ], [parm3q ]
     BIWEIGHT_4P_MMX [parm1q+parm2q ], [parm3q+parm4q ]
@@ -314,12 +286,11 @@ x264_pixel_avg_weight_4x4_mmxext:
 ; pixel copy
 ;=============================================================================
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_mc_copy_w4_mmx( uint8_t *dst, int i_dst_stride,
 ;                           uint8_t *src, int i_src_stride, int i_height )
 ;-----------------------------------------------------------------------------
-x264_mc_copy_w4_mmx:
+cglobal x264_mc_copy_w4_mmx
     mov eax, parm5d ; i_height
 
 ALIGN 4
@@ -335,12 +306,11 @@ ALIGN 4
     jg .height_loop
     rep ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_mc_copy_w8_mmx( uint8_t *dst, int i_dst_stride,
 ;                           uint8_t *src, int i_src_stride, int i_height )
 ;-----------------------------------------------------------------------------
-x264_mc_copy_w8_mmx:
+cglobal x264_mc_copy_w8_mmx
     mov eax, parm5d ; i_height
 
     lea r10, [parm4q+parm4q*2] ; 3 * i_src_stride
@@ -363,12 +333,11 @@ ALIGN 4
     jg .height_loop
     rep ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_mc_copy_w16_mmx( uint8_t *dst, int i_dst_stride,
 ;                            uint8_t *src, int i_src_stride, int i_height )
 ;-----------------------------------------------------------------------------
-x264_mc_copy_w16_mmx:
+cglobal x264_mc_copy_w16_mmx
     mov eax, parm5d ; i_height
 
     lea r10, [parm4q+parm4q*2] ; 3 * i_src_stride
@@ -399,11 +368,10 @@ ALIGN 4
 
     rep ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_mc_copy_w16_sse2( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, int i_height )
 ;-----------------------------------------------------------------------------
-x264_mc_copy_w16_sse2:
+cglobal x264_mc_copy_w16_sse2
     mov eax, parm5d ; i_height
 
 ALIGN 4
@@ -424,15 +392,13 @@ ALIGN 4
 ;=============================================================================
 ; chroma MC
 ;=============================================================================
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
 ;                             uint8_t *dst, int i_dst_stride,
 ;                             int dx, int dy,
 ;                             int i_width, int i_height )
 ;-----------------------------------------------------------------------------
-
-x264_mc_chroma_mmxext:
+cglobal x264_mc_chroma_mmxext
     mov r10d, parm6d
     mov r11d, parm5d
     sar r10d, 3
@@ -590,8 +556,7 @@ ALIGN 4
 ; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
 ;                                 uint8_t *pix_uv, int stride_uv, int mb_x )
 ;-----------------------------------------------------------------------------
-ALIGN 16
-x264_prefetch_fenc_mmxext:
+cglobal x264_prefetch_fenc_mmxext
     mov eax, parm5d
     and eax, 3
     imul eax, parm2d
@@ -613,8 +578,7 @@ x264_prefetch_fenc_mmxext:
 ;-----------------------------------------------------------------------------
 ; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
 ;-----------------------------------------------------------------------------
-ALIGN 16
-x264_prefetch_ref_mmxext:
+cglobal x264_prefetch_ref_mmxext
     dec parm3d
     and parm3d, parm2d
     lea parm1q, [parm1q+parm3q*8+64]
diff --git a/common/amd64/mc-a2.asm b/common/amd64/mc-a2.asm
index 40c9a824..1152f5b8 100644
--- a/common/amd64/mc-a2.asm
+++ b/common/amd64/mc-a2.asm
@@ -94,16 +94,11 @@ pw_32: times 4 dw 32
 
 SECTION .text
 
-cglobal x264_hpel_filter_mmxext
-cglobal x264_plane_copy_mmxext
-
 ;-----------------------------------------------------------------------------
 ; void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
 ;                               int i_stride, int i_width, int i_height );
 ;-----------------------------------------------------------------------------
-
-ALIGN 16
-x264_hpel_filter_mmxext :
+cglobal x264_hpel_filter_mmxext
 
 %ifdef WIN64
     push rdi
@@ -276,8 +271,7 @@ ALIGN 16
 ; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst,
 ;                              uint8_t *src, int i_src, int w, int h)
 ;-----------------------------------------------------------------------------
-ALIGN 16
-x264_plane_copy_mmxext:
+cglobal x264_plane_copy_mmxext
     movsxd parm2q, parm2d
     movsxd parm4q, parm4d
     add parm5d, 3
diff --git a/common/amd64/pixel-a.asm b/common/amd64/pixel-a.asm
index 705596b7..67092f02 100644
--- a/common/amd64/pixel-a.asm
+++ b/common/amd64/pixel-a.asm
@@ -408,59 +408,6 @@ BITS 64
 
 SECTION .text
 
-cglobal x264_pixel_sad_16x16_mmxext
-cglobal x264_pixel_sad_16x8_mmxext
-cglobal x264_pixel_sad_8x16_mmxext
-cglobal x264_pixel_sad_8x8_mmxext
-cglobal x264_pixel_sad_8x4_mmxext
-cglobal x264_pixel_sad_4x8_mmxext
-cglobal x264_pixel_sad_4x4_mmxext
-
-cglobal x264_pixel_sad_x3_16x16_mmxext
-cglobal x264_pixel_sad_x3_16x8_mmxext
-cglobal x264_pixel_sad_x3_8x16_mmxext
-cglobal x264_pixel_sad_x3_8x8_mmxext
-cglobal x264_pixel_sad_x3_8x4_mmxext
-cglobal x264_pixel_sad_x3_4x8_mmxext
-cglobal x264_pixel_sad_x3_4x4_mmxext
-
-cglobal x264_pixel_sad_x4_16x16_mmxext
-cglobal x264_pixel_sad_x4_16x8_mmxext
-cglobal x264_pixel_sad_x4_8x16_mmxext
-cglobal x264_pixel_sad_x4_8x8_mmxext
-cglobal x264_pixel_sad_x4_8x4_mmxext
-cglobal x264_pixel_sad_x4_4x8_mmxext
-cglobal x264_pixel_sad_x4_4x4_mmxext
-
-cglobal x264_pixel_sad_pde_16x16_mmxext
-cglobal x264_pixel_sad_pde_16x8_mmxext
-cglobal x264_pixel_sad_pde_8x16_mmxext
-
-cglobal x264_pixel_ssd_16x16_mmx
-cglobal x264_pixel_ssd_16x8_mmx
-cglobal x264_pixel_ssd_8x16_mmx
-cglobal x264_pixel_ssd_8x8_mmx
-cglobal x264_pixel_ssd_8x4_mmx
-cglobal x264_pixel_ssd_4x8_mmx
-cglobal x264_pixel_ssd_4x4_mmx
-
-cglobal x264_pixel_satd_4x4_mmxext
-cglobal x264_pixel_satd_4x8_mmxext
-cglobal x264_pixel_satd_8x4_mmxext
-cglobal x264_pixel_satd_8x8_mmxext
-cglobal x264_pixel_satd_16x8_mmxext
-cglobal x264_pixel_satd_8x16_mmxext
-cglobal x264_pixel_satd_16x16_mmxext
-
-cglobal x264_intra_satd_x3_4x4_mmxext
-cglobal x264_intra_satd_x3_8x8c_mmxext
-cglobal x264_intra_satd_x3_16x16_mmxext
-
-cglobal x264_pixel_ads4_mmxext
-cglobal x264_pixel_ads2_mmxext
-cglobal x264_pixel_ads1_mmxext
-
-
 %macro SAD_START 0
     pxor mm0, mm0
 %endmacro
@@ -474,8 +421,7 @@
 ; int x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
 %macro SAD 2
-ALIGN 16
-x264_pixel_sad_%1x%2_mmxext:
+cglobal x264_pixel_sad_%1x%2_mmxext
     SAD_START
 %rep %2/2
     SAD_INC_2x%1P
@@ -496,8 +442,7 @@ SAD 4, 4
 ; uint8_t *pix2, int i_stride, int scores[3] )
 ;-----------------------------------------------------------------------------
 %macro SAD_X 3
-ALIGN 16
-x264_pixel_sad_x%1_%2x%3_mmxext:
+cglobal x264_pixel_sad_x%1_%2x%3_mmxext
     SAD_X%1_2x%2P 1
 %rep %3/2-1
     SAD_X%1_2x%2P 0
@@ -534,8 +479,7 @@ ALIGN 4
 ; int x264_pixel_sad_pde_16x16_mmxext (uint8_t *, int, uint8_t *, int, int )
 ;-----------------------------------------------------------------------------
 %macro SAD_PDE 2
-ALIGN 16
-x264_pixel_sad_pde_%1x%2_mmxext:
+cglobal x264_pixel_sad_pde_%1x%2_mmxext
     SAD_START
 %rep %2/4
     SAD_INC_2x%1P
@@ -577,8 +521,7 @@ SAD_PDE 8, 16
 ; int x264_pixel_ssd_16x16_mmx (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
 %macro SSD 2
-ALIGN 16
-x264_pixel_ssd_%1x%2_mmx:
+cglobal x264_pixel_ssd_%1x%2_mmx
     SSD_START
 %rep %2
     SSD_INC_1x%1P
@@ -611,42 +554,38 @@ SSD 4, 4
     ret
 %endmacro
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int x264_pixel_satd_4x4_mmxext (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_4x4_mmxext:
+cglobal x264_pixel_satd_4x4_mmxext
     SATD_START
     LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
     SATD_END
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int x264_pixel_satd_4x8_mmxext (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_4x8_mmxext:
+cglobal x264_pixel_satd_4x8_mmxext
     SATD_START
     LOAD_DIFF_HADAMARD_SUM mm0, 0, 1
     LOAD_DIFF_HADAMARD_SUM mm1, 0, 0
     paddw mm0, mm1
     SATD_END
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int x264_pixel_satd_8x4_mmxext (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_8x4_mmxext:
+cglobal x264_pixel_satd_8x4_mmxext
     SATD_START
     LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
     LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
     paddw mm0, mm1
     SATD_END
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int x264_pixel_satd_8x8_mmxext (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_8x8_mmxext:
+cglobal x264_pixel_satd_8x8_mmxext
     SATD_START
     LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
     LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
@@ -657,11 +596,10 @@ x264_pixel_satd_8x8_mmxext:
     paddw mm0, mm1
     SATD_END
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int x264_pixel_satd_16x8_mmxext (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_16x8_mmxext:
+cglobal x264_pixel_satd_16x8_mmxext
     SATD_START
     LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
     LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
@@ -681,11 +619,10 @@ x264_pixel_satd_16x8_mmxext:
     paddw mm0, mm1
     SATD_END
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int x264_pixel_satd_8x16_mmxext (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_8x16_mmxext:
+cglobal x264_pixel_satd_8x16_mmxext
     SATD_START
     LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
     LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
@@ -705,11 +642,10 @@ x264_pixel_satd_8x16_mmxext:
     paddw mm0, mm1
     SATD_END
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_16x16_mmxext:
+cglobal x264_pixel_satd_16x16_mmxext
     SATD_START
     LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
     LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
@@ -802,11 +738,10 @@ load_hadamard:
     %8 %3, %6
 %endmacro
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
 ;-----------------------------------------------------------------------------
-x264_intra_satd_x3_4x4_mmxext:
+cglobal x264_intra_satd_x3_4x4_mmxext
 %define top_1d  rsp-8  ; +8
 %define left_1d rsp-16 ; +8
     call load_hadamard
@@ -871,11 +806,10 @@ x264_intra_satd_x3_4x4_mmxext:
     movd [parm3q+8], mm5 ; i4x4_dc satd
     ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
 ;-----------------------------------------------------------------------------
-x264_intra_satd_x3_16x16_mmxext:
+cglobal x264_intra_satd_x3_16x16_mmxext
 %define sums    rsp-32 ; +24
 %define top_1d  rsp-64 ; +32
 %define left_1d rsp-96 ; +32
@@ -986,11 +920,10 @@ x264_intra_satd_x3_16x16_mmxext:
     movd [parm3q+0], mm0 ; i16x16_v satd
     ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
 ;-----------------------------------------------------------------------------
-x264_intra_satd_x3_8x8c_mmxext:
+cglobal x264_intra_satd_x3_8x8c_mmxext
 %define sums    rsp-32 ; +24
 %define top_1d  rsp-48 ; +16
 %define left_1d rsp-64 ; +16
@@ -1121,8 +1054,7 @@ x264_intra_satd_x3_8x8c_mmxext:
 ; void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
 ;                              uint16_t *res, int width )
 ;-----------------------------------------------------------------------------
-ALIGN 16
-x264_pixel_ads4_mmxext:
+cglobal x264_pixel_ads4_mmxext
     movq mm6, [parm1q]
     movq mm4, [parm1q+8]
     pshufw mm7, mm6, 0
@@ -1154,8 +1086,7 @@ x264_pixel_ads4_mmxext:
     nop
     ret
 
-ALIGN 16
-x264_pixel_ads2_mmxext:
+cglobal x264_pixel_ads2_mmxext
     movq mm6, [parm1q]
     pshufw mm7, mm6, 0
     pshufw mm6, mm6, 0xAA
@@ -1176,8 +1107,7 @@ x264_pixel_ads2_mmxext:
     nop
     ret
 
-ALIGN 16
-x264_pixel_ads1_mmxext:
+cglobal x264_pixel_ads1_mmxext
     pshufw mm7, [parm1q], 0
.loop:
     movq mm0, [parm2q]
diff --git a/common/amd64/pixel-sse2.asm b/common/amd64/pixel-sse2.asm
index 57a1a910..bc1888be 100644
--- a/common/amd64/pixel-sse2.asm
+++ b/common/amd64/pixel-sse2.asm
@@ -40,30 +40,6 @@ mask_ff: times 16 db 0xff
 
 SECTION .text
 
-
-cglobal x264_pixel_sad_16x16_sse2
-cglobal x264_pixel_sad_16x8_sse2
-cglobal x264_pixel_ssd_16x16_sse2
-cglobal x264_pixel_ssd_16x8_sse2
-cglobal x264_pixel_satd_8x4_sse2
-cglobal x264_pixel_satd_8x8_sse2
-cglobal x264_pixel_satd_16x8_sse2
-cglobal x264_pixel_satd_8x16_sse2
-cglobal x264_pixel_satd_16x16_sse2
-cglobal x264_pixel_satd_8x4_ssse3
-cglobal x264_pixel_satd_8x8_ssse3
-cglobal x264_pixel_satd_16x8_ssse3
-cglobal x264_pixel_satd_8x16_ssse3
-cglobal x264_pixel_satd_16x16_ssse3
-cglobal x264_pixel_sa8d_8x8_sse2
-cglobal x264_pixel_sa8d_16x16_sse2
-cglobal x264_pixel_sa8d_8x8_ssse3
-cglobal x264_pixel_sa8d_16x16_ssse3
-cglobal x264_intra_sa8d_x3_8x8_core_sse2
-cglobal x264_pixel_ssim_4x4x2_core_sse2
-cglobal x264_pixel_ssim_end4_sse2
-
-
 %macro HADDD 2 ; sum junk
     movhlps %2, %1
     paddd %1, %2
@@ -103,11 +79,10 @@
     ret
 %endmacro
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_sad_16x16_sse2:
+cglobal x264_pixel_sad_16x16_sse2
    movdqu xmm0, [rdx]
    movdqu xmm1, [rdx+rcx]
    lea rdx, [rdx+2*rcx]
@@ -171,11 +146,10 @@ x264_pixel_sad_16x16_sse2:
    paddw xmm0, xmm7
    SAD_END_SSE2
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_sad_16x8_sse2:
+cglobal x264_pixel_sad_16x8_sse2
    pxor xmm0, xmm0
    SAD_INC_4x16P_SSE2
    SAD_INC_4x16P_SSE2
@@ -227,22 +201,20 @@ x264_pixel_sad_16x8_sse2:
    ret
 %endmacro
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_ssd_16x16_sse2:
+cglobal x264_pixel_ssd_16x16_sse2
    SSD_START_SSE2
 %rep 8
    SSD_INC_2x16P_SSE2
 %endrep
    SSD_END_SSE2
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int x264_pixel_ssd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_ssd_16x8_sse2:
+cglobal x264_pixel_ssd_16x8_sse2
    SSD_START_SSE2
 %rep 4
    SSD_INC_2x16P_SSE2
@@ -420,11 +392,10 @@ x264_pixel_ssd_16x8_sse2:
 %endmacro
 
 %macro SATDS 1
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int x264_pixel_satd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_16x16_%1:
+cglobal x264_pixel_satd_16x16_%1
    SATD_START
    mov r8, rdi
    mov r9, rdx
@@ -440,11 +411,10 @@ x264_pixel_satd_16x16_%1:
    SATD_TWO_SSE2
    SATD_END
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int x264_pixel_satd_8x16_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_8x16_%1:
+cglobal x264_pixel_satd_8x16_%1
    SATD_START
    SATD_TWO_SSE2
    SATD_TWO_SSE2
@@ -452,11 +422,10 @@ x264_pixel_satd_8x16_%1:
    SATD_TWO_SSE2
    SATD_END
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int x264_pixel_satd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_16x8_%1:
+cglobal x264_pixel_satd_16x8_%1
    SATD_START
    mov r8, rdi
    mov r9, rdx
@@ -468,31 +437,28 @@ x264_pixel_satd_16x8_%1:
    SATD_TWO_SSE2
    SATD_END
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int x264_pixel_satd_8x8_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_8x8_%1:
+cglobal x264_pixel_satd_8x8_%1
    SATD_START
    SATD_TWO_SSE2
    SATD_TWO_SSE2
    SATD_END
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_8x4_%1:
+cglobal x264_pixel_satd_8x4_%1
    SATD_START
    SATD_TWO_SSE2
    SATD_END
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_sa8d_8x8_%1:
+cglobal x264_pixel_sa8d_8x8_%1
    lea r10, [3*parm2q]
    lea r11, [3*parm4q]
    LOAD_DIFF_4x8P xmm0, xmm1, xmm2, xmm3, xmm8, xmm8
@@ -515,12 +481,11 @@ x264_pixel_sa8d_8x8_%1:
    shr eax, 1
    ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
 ;; violates calling convention
-x264_pixel_sa8d_16x16_%1:
+cglobal x264_pixel_sa8d_16x16_%1
    xor r8d, r8d
    call x264_pixel_sa8d_8x8_%1 ; pix[0]
    lea parm1q, [parm1q+4*parm2q]
@@ -551,11 +516,10 @@
 
 SATDS ssse3
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
 ;-----------------------------------------------------------------------------
-x264_intra_sa8d_x3_8x8_core_sse2:
+cglobal x264_intra_sa8d_x3_8x8_core_sse2
    ; 8x8 hadamard
    pxor xmm4, xmm4
    movq xmm0, [parm1q+0*FENC_STRIDE]
@@ -643,8 +607,7 @@ x264_intra_sa8d_x3_8x8_core_sse2:
 ; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
 ;                                       const uint8_t *pix2, int stride2, int sums[2][4] )
 ;-----------------------------------------------------------------------------
-ALIGN 16
-x264_pixel_ssim_4x4x2_core_sse2:
+cglobal x264_pixel_ssim_4x4x2_core_sse2
    pxor xmm0, xmm0
    pxor xmm1, xmm1
    pxor xmm2, xmm2
@@ -692,8 +655,7 @@ x264_pixel_ssim_4x4x2_core_sse2:
 ;-----------------------------------------------------------------------------
 ; float x264_pixel_ssim_end_sse2( int sum0[5][4], int sum1[5][4], int width )
 ;-----------------------------------------------------------------------------
-ALIGN 16
-x264_pixel_ssim_end4_sse2:
+cglobal x264_pixel_ssim_end4_sse2
    movdqa xmm0, [parm1q+ 0]
    movdqa xmm1, [parm1q+16]
    movdqa xmm2, [parm1q+32]
diff --git a/common/amd64/predict-a.asm b/common/amd64/predict-a.asm
index b2e7fb8f..96751a1e 100644
--- a/common/amd64/predict-a.asm
+++ b/common/amd64/predict-a.asm
@@ -83,26 +83,6 @@ pb_0s_ff:
 
 SECTION .text
 
-cglobal predict_4x4_ddl_mmxext
-cglobal predict_4x4_vl_mmxext
-cglobal predict_8x8_v_mmxext
-cglobal predict_8x8_dc_mmxext
-cglobal predict_8x8_dc_top_mmxext
-cglobal predict_8x8_dc_left_mmxext
-cglobal predict_8x8_ddl_mmxext
-cglobal predict_8x8_ddl_sse2
-cglobal predict_8x8_ddr_sse2
-cglobal predict_8x8_vl_sse2
-cglobal predict_8x8_vr_core_mmxext
-cglobal predict_8x8c_v_mmx
-cglobal predict_8x8c_dc_core_mmxext
-cglobal predict_8x8c_p_core_mmxext
-cglobal predict_16x16_p_core_mmxext
-cglobal predict_16x16_v_mmx
-cglobal predict_16x16_dc_core_mmxext
-cglobal predict_16x16_dc_top_mmxext
-
-
 ; dest, left, right, src, tmp
 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
 %macro PRED8x8_LOWPASS0 6
@@ -125,9 +105,7 @@
 ;-----------------------------------------------------------------------------
 ; void predict_4x4_ddl_mmxext( uint8_t *src )
 ;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_4x4_ddl_mmxext:
+cglobal predict_4x4_ddl_mmxext
    sub parm1q, FDEC_STRIDE
    movq mm3, [parm1q]
    movq mm1, [parm1q-1]
@@ -151,9 +129,7 @@ predict_4x4_ddl_mmxext:
 ;-----------------------------------------------------------------------------
 ; void predict_4x4_vl_mmxext( uint8_t *src )
 ;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_4x4_vl_mmxext:
+cglobal predict_4x4_vl_mmxext
    movq mm1, [parm1q-FDEC_STRIDE]
    movq mm3, mm1
    movq mm2, mm1
@@ -176,9 +152,7 @@ predict_4x4_vl_mmxext:
 ;-----------------------------------------------------------------------------
 ; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge )
 ;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_8x8_v_mmxext:
+cglobal predict_8x8_v_mmxext
    movq mm0, [parm2q+16]
    STORE8x8 mm0, mm0
    ret
@@ -186,9 +160,7 @@ predict_8x8_v_mmxext:
 ;-----------------------------------------------------------------------------
 ; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge );
 ;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_8x8_dc_mmxext:
+cglobal predict_8x8_dc_mmxext
    pxor mm0, mm0
    pxor mm1, mm1
    psadbw mm0, [parm2q+7]
@@ -204,9 +176,7 @@ predict_8x8_dc_mmxext:
 ;-----------------------------------------------------------------------------
 ; void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t *edge );
 ;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_8x8_dc_top_mmxext:
+cglobal predict_8x8_dc_top_mmxext
    pxor mm0, mm0
    psadbw mm0, [parm2q+16]
    paddw mm0, [pw_4 GLOBAL]
@@ -219,9 +189,7 @@ predict_8x8_dc_top_mmxext:
 ;-----------------------------------------------------------------------------
 ; void predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t *edge );
 ;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_8x8_dc_left_mmxext:
+cglobal predict_8x8_dc_left_mmxext
    pxor mm0, mm0
    psadbw mm0, [parm2q+7]
    paddw mm0, [pw_4 GLOBAL]
@@ -234,9 +202,7 @@ predict_8x8_dc_left_mmxext:
 ;-----------------------------------------------------------------------------
 ; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge )
 ;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_8x8_ddl_mmxext:
+cglobal predict_8x8_ddl_mmxext
    movq mm5, [parm2q+16]
    movq mm2, [parm2q+17]
    movq mm3, [parm2q+23]
@@ -268,9 +234,7 @@ predict_8x8_ddl_mmxext:
 ;-----------------------------------------------------------------------------
 ; void predict_8x8_ddl_sse2( uint8_t *src, uint8_t *edge )
 ;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_8x8_ddl_sse2:
+cglobal predict_8x8_ddl_sse2
    movdqa xmm3, [parm2q+16]
    movdqu xmm2, [parm2q+17]
    movdqa xmm1, xmm3
@@ -288,9 +252,7 @@ predict_8x8_ddl_sse2:
 ;-----------------------------------------------------------------------------
 ; void predict_8x8_ddr_sse2( uint8_t *src, uint8_t *edge )
 ;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_8x8_ddr_sse2:
+cglobal predict_8x8_ddr_sse2
    movdqu xmm3, [parm2q+8]
    movdqu xmm1, [parm2q+7]
    movdqa xmm2, xmm3
@@ -315,9 +277,7 @@ predict_8x8_ddr_sse2:
 ;-----------------------------------------------------------------------------
 ; void predict_8x8_vl_sse2( uint8_t *src, uint8_t *edge )
 ;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_8x8_vl_sse2:
+cglobal predict_8x8_vl_sse2
    movdqa xmm4, [parm2q+16]
    movdqa xmm2, xmm4
    movdqa xmm1, xmm4
@@ -358,8 +318,7 @@ predict_8x8_vl_sse2:
 ; 6 .....
 ; 7 ,,,,,
 
-ALIGN 16
-predict_8x8_vr_core_mmxext:
+cglobal predict_8x8_vr_core_mmxext
    movq mm2, [parm2q+16]
    movq mm3, [parm2q+15]
    movq mm1, [parm2q+14]
@@ -383,9 +342,7 @@ predict_8x8_vr_core_mmxext:
 ;-----------------------------------------------------------------------------
 ; void predict_8x8c_v_mmx( uint8_t *src )
 ;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_8x8c_v_mmx :
+cglobal predict_8x8c_v_mmx
    movq mm0, [parm1q - FDEC_STRIDE]
    STORE8x8 mm0, mm0
    ret
@@ -393,9 +350,7 @@ predict_8x8c_v_mmx :
 ;-----------------------------------------------------------------------------
 ; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 )
 ;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_8x8c_dc_core_mmxext:
+cglobal predict_8x8c_dc_core_mmxext
    movq mm0, [parm1q - FDEC_STRIDE]
    pxor mm1, mm1
    pxor mm2, mm2
@@ -427,9 +382,7 @@ predict_8x8c_dc_core_mmxext:
 ;-----------------------------------------------------------------------------
 ; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
 ;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_8x8c_p_core_mmxext:
+cglobal predict_8x8c_p_core_mmxext
    movd mm0, parm2d
    movd mm2, parm3d
    movd mm4, parm4d
@@ -464,9 +417,7 @@ ALIGN 4
 ;-----------------------------------------------------------------------------
 ; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c )
 ;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_16x16_p_core_mmxext:
+cglobal predict_16x16_p_core_mmxext
    movd mm0, parm2d
    movd mm2, parm3d
    movd mm4, parm4d
@@ -515,9 +466,7 @@ ALIGN 4
 ;-----------------------------------------------------------------------------
 ; void predict_16x16_v_mmx( uint8_t *src )
 ;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_16x16_v_mmx :
+cglobal predict_16x16_v_mmx
    sub parm1q, FDEC_STRIDE
    movq mm0, [parm1q]
    movq mm1, [parm1q + 8]
@@ -544,14 +493,12 @@ predict_16x16_v_mmx :
    STORE16x16 mm0, mm0
 %endmacro
 
-ALIGN 16
-predict_16x16_dc_core_mmxext:
+cglobal predict_16x16_dc_core_mmxext
    movd mm2, parm2d
    PRED16x16_DC mm2, 5
    ret
 
-ALIGN 16
-predict_16x16_dc_top_mmxext:
+cglobal predict_16x16_dc_top_mmxext
    PRED16x16_DC [pw_8 GLOBAL], 4
    ret
 
diff --git a/common/amd64/quant-a.asm b/common/amd64/quant-a.asm
index ba7058de..32ff0cda 100644
--- a/common/amd64/quant-a.asm
+++ b/common/amd64/quant-a.asm
@@ -40,28 +40,6 @@ pd_1: times 2 dd 1
 
 SECTION .text
 
-cglobal x264_quant_2x2_dc_core15_mmx
-cglobal x264_quant_4x4_dc_core15_mmx
-cglobal x264_quant_4x4_core15_mmx
-cglobal x264_quant_8x8_core15_mmx
-
-cglobal x264_quant_4x4_dc_core15_ssse3
-cglobal x264_quant_4x4_core15_ssse3
-cglobal x264_quant_8x8_core15_ssse3
-
-cglobal x264_quant_2x2_dc_core16_mmxext
-cglobal x264_quant_4x4_dc_core16_mmxext
-cglobal x264_quant_4x4_core16_mmxext
-cglobal x264_quant_8x8_core16_mmxext
-
-cglobal x264_quant_2x2_dc_core32_mmxext
-cglobal x264_quant_4x4_dc_core32_mmxext
-cglobal x264_quant_4x4_core32_mmxext
-cglobal x264_quant_8x8_core32_mmxext
-
-cglobal x264_dequant_4x4_mmx
-cglobal x264_dequant_8x8_mmx
-
 %macro MMX_QUANT_AC_START 0
 ;   mov rdi, rdi        ; &dct[0][0]
 ;   mov rsi, rsi        ; &quant_mf[0][0]
@@ -149,22 +127,20 @@
    movdqa %1, xmm0 ; store
 %endmacro
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2],
 ;     int const i_qmf, int const i_qbits, int const f );
 ;-----------------------------------------------------------------------------
-x264_quant_2x2_dc_core15_mmx:
+cglobal x264_quant_2x2_dc_core15_mmx
    MMX_QUANT15_DC_START
    MMX_QUANT15_1x4 [parm1q], mm5, mm6, mm7
    ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_quant_4x4_dc_core15_mmx( int16_t dct[4][4],
 ;     int const i_qmf, int const i_qbits, int const f );
 ;-----------------------------------------------------------------------------
-x264_quant_4x4_dc_core15_mmx:
+cglobal x264_quant_4x4_dc_core15_mmx
    MMX_QUANT15_DC_START
 
 %rep 4
@@ -174,12 +150,11 @@ x264_quant_4x4_dc_core15_mmx:
 
    ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_quant_4x4_core15_mmx( int16_t dct[4][4],
 ;     int const quant_mf[4][4], int const i_qbits, int const f );
 ;-----------------------------------------------------------------------------
-x264_quant_4x4_core15_mmx:
+cglobal x264_quant_4x4_core15_mmx
    MMX_QUANT_AC_START
 
 %rep 4
@@ -192,12 +167,11 @@ x264_quant_4x4_core15_mmx:
 
    ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_quant_8x8_core15_mmx( int16_t dct[8][8],
 ;     int const quant_mf[8][8], int const i_qbits, int const f );
 ;-----------------------------------------------------------------------------
-x264_quant_8x8_core15_mmx:
+cglobal x264_quant_8x8_core15_mmx
    MMX_QUANT_AC_START
 
 %rep 16
@@ -211,23 +185,21 @@ x264_quant_8x8_core15_mmx:
    ret
 
 %ifdef HAVE_SSE3
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_quant_4x4_dc_core15_ssse3( int16_t dct[4][4],
 ;     int const i_qmf, int const i_qbits, int const f );
 ;-----------------------------------------------------------------------------
-x264_quant_4x4_dc_core15_ssse3:
+cglobal x264_quant_4x4_dc_core15_ssse3
    SSE2_QUANT15_DC_START
    SSSE3_QUANT15_1x8 [parm1q], xmm5, xmm6, xmm7
    SSSE3_QUANT15_1x8 [parm1q+16], xmm5, xmm6, xmm7
    ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_quant_4x4_core15_ssse3( int16_t dct[4][4],
 ;     int const quant_mf[4][4], int const i_qbits, int const f );
 ;-----------------------------------------------------------------------------
-x264_quant_4x4_core15_ssse3:
+cglobal x264_quant_4x4_core15_ssse3
    SSE2_QUANT_AC_START
 %assign x 0
 %rep 2
@@ -238,12 +210,11 @@ x264_quant_4x4_core15_ssse3:
 %endrep
    ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_quant_8x8_core15_ssse3( int16_t dct[8][8],
 ;     int const quant_mf[8][8], int const i_qbits, int const f );
 ;-----------------------------------------------------------------------------
-x264_quant_8x8_core15_ssse3:
+cglobal x264_quant_8x8_core15_ssse3
    SSE2_QUANT_AC_START
 %assign x 0
 %rep 8
@@ -298,22 +269,20 @@
    movq %1, mm0 ; store
 %endmacro
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_quant_2x2_dc_core16_mmxext( int16_t dct[2][2],
 ;     int const i_qmf, int const i_qbits, int const f );
 ;-----------------------------------------------------------------------------
-x264_quant_2x2_dc_core16_mmxext:
+cglobal x264_quant_2x2_dc_core16_mmxext
    MMXEXT_QUANT16_DC_START
    MMXEXT_QUANT16_1x4 [parm1q], mm5, mm6, mm7
    ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_quant_4x4_dc_core16_mmxext( int16_t dct[4][4],
 ;     int const i_qmf, int const i_qbits, int const f );
 ;-----------------------------------------------------------------------------
-x264_quant_4x4_dc_core16_mmxext:
+cglobal x264_quant_4x4_dc_core16_mmxext
    MMXEXT_QUANT16_DC_START
 
 %rep 4
@@ -323,12 +292,11 @@ x264_quant_4x4_dc_core16_mmxext:
 
    ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_quant_4x4_core16_mmxext( int16_t dct[4][4],
 ;     int const quant_mf[4][4], int const i_qbits, int const f );
 ;-----------------------------------------------------------------------------
-x264_quant_4x4_core16_mmxext:
+cglobal x264_quant_4x4_core16_mmxext
    MMX_QUANT_AC_START
 
 %rep 4
@@ -342,12 +310,11 @@ x264_quant_4x4_core16_mmxext:
 
    ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_quant_8x8_core16_mmxext( int16_t dct[8][8],
 ;     int const quant_mf[8][8], int const i_qbits, int const f );
 ;-----------------------------------------------------------------------------
-x264_quant_8x8_core16_mmxext:
+cglobal x264_quant_8x8_core16_mmxext
    MMX_QUANT_AC_START
 
 %rep 16
@@ -409,22 +376,20 @@ x264_quant_8x8_core16_mmxext:
    movq %1, mm0 ; store
 %endmacro
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_quant_2x2_dc_core32_mmxext( int16_t dct[2][2],
 ;     int const i_qmf, int const i_qbits, int const f );
 ;-----------------------------------------------------------------------------
-x264_quant_2x2_dc_core32_mmxext:
+cglobal x264_quant_2x2_dc_core32_mmxext
    MMX_QUANT32_DC_START
    MMXEXT_QUANT32_1x4 [parm1q], mm5, mm5, mm6, mm7
    ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_quant_4x4_dc_core32_mmxext( int16_t dct[4][4],
 ;     int const i_qmf, int const i_qbits, int const f );
 ;-----------------------------------------------------------------------------
-x264_quant_4x4_dc_core32_mmxext:
+cglobal x264_quant_4x4_dc_core32_mmxext
    MMX_QUANT32_DC_START
 
 %rep 4
@@ -434,12 +399,11 @@ x264_quant_4x4_dc_core32_mmxext:
 
    ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_quant_4x4_core32_mmxext( int16_t dct[4][4],
 ;     int const quant_mf[4][4], int const i_qbits, int const f );
 ;-----------------------------------------------------------------------------
-x264_quant_4x4_core32_mmxext:
+cglobal x264_quant_4x4_core32_mmxext
    MMX_QUANT_AC_START
 
 %rep 4
@@ -450,12 +414,11 @@ x264_quant_4x4_core32_mmxext:
 
    ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_quant_8x8_core32_mmxext( int16_t dct[8][8],
 ;     int const quant_mf[8][8], int const i_qbits, int const f );
 ;-----------------------------------------------------------------------------
-x264_quant_8x8_core32_mmxext:
+cglobal x264_quant_8x8_core32_mmxext
    MMX_QUANT_AC_START
 
 %rep 16
@@ -517,10 +480,11 @@ x264_quant_8x8_core32_mmxext:
    movq %1, mm0
 %endmacro
 
+;-----------------------------------------------------------------------------
+; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+;-----------------------------------------------------------------------------
 %macro DEQUANT_WxH 3
-ALIGN 16
-;;; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
-%1:
+cglobal %1
 ;   mov rdi, rdi        ; dct
 ;   mov rsi, rsi        ; dequant_mf
 ;   mov edx, edx        ; i_qp
diff --git a/common/i386/cpu-a.asm b/common/i386/cpu-a.asm
index 1fa5f468..2b7b56f8 100644
--- a/common/i386/cpu-a.asm
+++ b/common/i386/cpu-a.asm
@@ -35,15 +35,10 @@ BITS 32
 
 SECTION .text
 
-cglobal x264_cpu_cpuid_test
-cglobal x264_cpu_cpuid
-cglobal x264_emms
-
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int __cdecl x264_cpu_cpuid_test( void ) return 0 if unsupported
 ;-----------------------------------------------------------------------------
-x264_cpu_cpuid_test:
+cglobal x264_cpu_cpuid_test
    pushfd
    push ebx
    push ebp
@@ -67,11 +62,10 @@ x264_cpu_cpuid_test:
    popfd
    ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int __cdecl x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
 ;-----------------------------------------------------------------------------
-x264_cpu_cpuid:
+cglobal x264_cpu_cpuid
    push ebp
    mov ebp, esp
@@ -100,11 +94,10 @@ x264_cpu_cpuid:
    pop ebp
    ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_emms( void )
 ;-----------------------------------------------------------------------------
-x264_emms:
+cglobal x264_emms
    emms
    ret
 
diff --git a/common/i386/dct-a.asm b/common/i386/dct-a.asm
index 9d12e7b2..8361c3fb 100644
--- a/common/i386/dct-a.asm
+++ b/common/i386/dct-a.asm
@@ -144,13 +144,10 @@ x264_mmx_p2n2p1p1: dw 2, -2, 1, 1
 
 SECTION .text
 
-cglobal x264_dct4x4dc_mmx
-
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_dct4x4dc_mmx( int16_t d[4][4] )
 ;-----------------------------------------------------------------------------
-x264_dct4x4dc_mmx:
+cglobal x264_dct4x4dc_mmx
    mov eax, [esp+ 4]
    movq mm0, [eax+ 0]
    movq mm1, [eax+ 8]
@@ -184,13 +181,10 @@ x264_dct4x4dc_mmx:
    picpop ebx
    ret
 
-cglobal x264_idct4x4dc_mmx
-
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_idct4x4dc_mmx( int16_t d[4][4] )
 ;-----------------------------------------------------------------------------
-x264_idct4x4dc_mmx:
+cglobal x264_idct4x4dc_mmx
    mov eax, [esp+ 4]
    movq mm0, [eax+ 0]
    movq mm1, [eax+ 8]
@@ -211,13 +205,10 @@ x264_idct4x4dc_mmx:
    movq [eax+24], mm4
    ret
 
-cglobal x264_sub4x4_dct_mmx
-
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
 ;-----------------------------------------------------------------------------
-x264_sub4x4_dct_mmx:
+cglobal x264_sub4x4_dct_mmx
    mov eax, [esp+ 8] ; pix1
    mov ecx, [esp+12] ; pix2
@@ -250,13 +241,10 @@ x264_sub4x4_dct_mmx:
    ret
 
-cglobal x264_add4x4_idct_mmx
-
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
 ;-----------------------------------------------------------------------------
-x264_add4x4_idct_mmx:
+cglobal x264_add4x4_idct_mmx
    ; Load dct coeffs
    mov eax, [esp+ 8] ; dct
    movq mm0, [eax+ 0]
@@ -321,10 +309,10 @@ x264_add4x4_idct_mmx:
    MMX_SUMSUB_BA %1, %2
 %endmacro
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, uint8_t *pix2 );
 ;-----------------------------------------------------------------------------
+ALIGN 16
 x264_pixel_sub_8x8_mmx:
 
    mov edx, [esp+ 4] ; diff
@@ -345,10 +333,10 @@ x264_pixel_sub_8x8_mmx:
 
    ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_ydct8_mmx( int16_t dest[8][8] );
 ;-----------------------------------------------------------------------------
+ALIGN 16
 x264_ydct8_mmx:
 
    mov eax, [esp+04] ; dest
@@ -430,10 +418,10 @@ x264_ydct8_mmx:
 
    ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_yidct8_mmx( int16_t dest[8][8] );
 ;-----------------------------------------------------------------------------
+ALIGN 16
 x264_yidct8_mmx:
 
    mov eax, [esp+04] ; dest
@@ -524,10 +512,10 @@ x264_yidct8_mmx:
 
    ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_pixel_add_8x8_mmx( uint8_t *dst, int16_t src[8][8] );
 ;-----------------------------------------------------------------------------
+ALIGN 16
 x264_pixel_add_8x8_mmx:
    mov eax, [esp+4] ; dst
    mov edx, [esp+8] ; src
@@ -553,10 +541,10 @@ x264_pixel_add_8x8_mmx:
 %endrep
    ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_transpose_8x8_mmx( int16_t src[8][8] );
 ;-----------------------------------------------------------------------------
+ALIGN 16
 x264_transpose_8x8_mmx:
    mov eax, [esp+4]
@@ -605,9 +593,7 @@ x264_transpose_8x8_mmx:
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
 ;-----------------------------------------------------------------------------
-ALIGN 16
 cglobal x264_sub8x8_dct8_mmx
-x264_sub8x8_dct8_mmx:
    push dword [esp+12]
    push dword [esp+12]
    push dword [esp+12]
@@ -620,9 +606,7 @@
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] )
 ;-----------------------------------------------------------------------------
-ALIGN 16
 cglobal x264_add8x8_idct8_mmx
-x264_add8x8_idct8_mmx:
    mov eax, [esp+8]
    add word [eax], 32
    push eax
@@ -637,9 +621,7 @@
 ;                                uint8_t *pix1, uint8_t *pix2 )
 ;-----------------------------------------------------------------------------
 %macro SUB_NxN_DCT 4
-ALIGN 16
 cglobal %1
-%1:
    mov edx, [esp+12]
    mov ecx, [esp+ 8]
    mov eax, [esp+ 4]
@@ -666,9 +648,7 @@
 ; void __cdecl x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
 ;-----------------------------------------------------------------------------
 %macro ADD_NxN_IDCT 4
-ALIGN 16
 cglobal %1
-%1:
    mov ecx, [esp+8]
    mov eax, [esp+4]
    add ecx, %3
@@ -699,9 +679,7 @@ ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx, 128, 8
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_zigzag_scan_4x4_field_mmx( int level[16], int16_t dct[4][4] )
 ;-----------------------------------------------------------------------------
-ALIGN 16
 cglobal x264_zigzag_scan_4x4_field_mmx
-x264_zigzag_scan_4x4_field_mmx:
    mov edx, [esp+8]
    mov ecx, [esp+4]
    punpcklwd mm0, [edx]
diff --git a/common/i386/deblock-a.asm b/common/i386/deblock-a.asm
index 658ec5db..518e61cb 100644
--- a/common/i386/deblock-a.asm
+++ b/common/i386/deblock-a.asm
@@ -30,12 +30,6 @@ pb_03: times 8 db 0x03
 pb_a1: times 8 db 0xa1
 
 SECTION .text
-cglobal x264_deblock_v8_luma_mmxext
-cglobal x264_deblock_h_luma_mmxext
-cglobal x264_deblock_v_chroma_mmxext
-cglobal x264_deblock_h_chroma_mmxext
-cglobal x264_deblock_v_chroma_intra_mmxext
-cglobal x264_deblock_h_chroma_intra_mmxext
 
 ; expands to [base],...,[base+7*stride]
 %define PASS8ROWS(base, base3, stride, stride3) \
@@ -231,11 +225,10 @@ cglobal x264_deblock_h_chroma_intra_mmxext
 
 SECTION .text
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-x264_deblock_v8_luma_mmxext:
+cglobal x264_deblock_v8_luma_mmxext
    picpush ebx
    picgetgot ebx
    push edi
@@ -298,11 +291,10 @@ x264_deblock_v8_luma_mmxext:
 
    ret
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-x264_deblock_h_luma_mmxext:
+cglobal x264_deblock_h_luma_mmxext
    push ebx
    push ebp
    mov eax, [esp+12] ; pix
@@ -396,11 +388,10 @@ x264_deblock_h_luma_mmxext:
    ret
 %endmacro
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-x264_deblock_v_chroma_mmxext:
+cglobal x264_deblock_v_chroma_mmxext
    CHROMA_V_START
    push ebx
    mov ebx, [esp+32] ; tc0
@@ -424,11 +415,10 @@ x264_deblock_v_chroma_mmxext:
 
    CHROMA_END
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-x264_deblock_h_chroma_mmxext:
+cglobal x264_deblock_h_chroma_mmxext
    CHROMA_H_START
    push ebx
    mov ebx, [esp+36] ; tc0
@@ -480,11 +470,10 @@ x264_deblock_h_chroma_mmxext:
    paddb mm2, mm6
 %endmacro
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-x264_deblock_v_chroma_intra_mmxext:
+cglobal x264_deblock_v_chroma_intra_mmxext
    CHROMA_V_START
    picpush ebx
    picgetgot ebx
@@ -498,11 +487,10 @@ x264_deblock_v_chroma_intra_mmxext:
    picpop ebx
    CHROMA_END
 
-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-x264_deblock_h_chroma_intra_mmxext:
+cglobal x264_deblock_h_chroma_intra_mmxext
    CHROMA_H_START
    picpush ebx
    picgetgot ebx
diff --git a/common/i386/i386inc.asm b/common/i386/i386inc.asm
index 3fd60a87..deda8591 100644
--- a/common/i386/i386inc.asm
+++ b/common/i386/i386inc.asm
@@ -34,6 +34,8 @@ BITS 32
     %else
         global %1
     %endif
+    align 16
+    %1:
 %endmacro
 
 ; Name of the .rodata section. On OS X we cannot use .rodata because NASM
diff --git a/common/i386/mc-a.asm b/common/i386/mc-a.asm
index 2e68b994..c689a8df 100644
--- a/common/i386/mc-a.asm
+++ b/common/i386/mc-a.asm
@@ -56,38 +56,17 @@ pw_64: times 4 dw 64

 SECTION .text
-cglobal x264_pixel_avg_w4_mmxext
-cglobal x264_pixel_avg_w8_mmxext
-cglobal x264_pixel_avg_w16_mmxext
-cglobal x264_pixel_avg_w20_mmxext
-cglobal x264_pixel_avg_w16_sse2
-
-cglobal x264_pixel_avg_weight_4x4_mmxext
-cglobal x264_pixel_avg_weight_w8_mmxext
-cglobal x264_pixel_avg_weight_w16_mmxext
-
-cglobal x264_mc_copy_w4_mmx
-cglobal x264_mc_copy_w8_mmx
-cglobal x264_mc_copy_w16_mmx
-cglobal x264_mc_copy_w16_sse2
-
-cglobal x264_mc_chroma_mmxext
-
-cglobal x264_prefetch_fenc_mmxext
-cglobal x264_prefetch_ref_mmxext
-
 ;=============================================================================
 ; pixel avg
 ;=============================================================================

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_pixel_avg_w4_mmxext( uint8_t *dst,  int i_dst_stride,
 ;                                uint8_t *src1, int i_src1_stride,
 ;                                uint8_t *src2, int i_src2_stride,
 ;                                int i_height );
 ;-----------------------------------------------------------------------------
-x264_pixel_avg_w4_mmxext:
+cglobal x264_pixel_avg_w4_mmxext
     push    ebp
     push    ebx
     push    esi
@@ -123,14 +102,13 @@ ALIGN 4


-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_pixel_avg_w8_mmxext( uint8_t *dst,  int i_dst_stride,
 ;                                uint8_t *src1, int i_src1_stride,
 ;                                uint8_t *src2, int i_src2_stride,
 ;                                int i_height );
 ;-----------------------------------------------------------------------------
-x264_pixel_avg_w8_mmxext:
+cglobal x264_pixel_avg_w8_mmxext
     push    ebp
     push    ebx
     push    esi
@@ -162,14 +140,13 @@ ALIGN 4


-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_pixel_avg_w16_mmxext( uint8_t *dst,  int i_dst_stride,
 ;                                 uint8_t *src1, int i_src1_stride,
 ;                                 uint8_t *src2, int i_src2_stride,
 ;                                 int i_height );
 ;-----------------------------------------------------------------------------
-x264_pixel_avg_w16_mmxext:
+cglobal x264_pixel_avg_w16_mmxext
     push    ebp
     push    ebx
     push    esi
@@ -204,14 +181,13 @@ ALIGN 4


-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_pixel_avg_w20_mmxext( uint8_t *dst,  int i_dst_stride,
 ;                                 uint8_t *src1, int i_src1_stride,
 ;                                 uint8_t *src2, int i_src2_stride,
 ;                                 int i_height );
 ;-----------------------------------------------------------------------------
-x264_pixel_avg_w20_mmxext:
+cglobal x264_pixel_avg_w20_mmxext
     push    ebp
     push    ebx
     push    esi
@@ -256,7 +232,7 @@ ALIGN 16
 ;                               uint8_t *src2, int i_src2_stride,
 ;                               int i_height );
 ;-----------------------------------------------------------------------------
-x264_pixel_avg_w16_sse2:
+cglobal x264_pixel_avg_w16_sse2
     push    ebp
     push    ebx
     push    esi
@@ -332,11 +308,10 @@ ALIGN 4
     ret
 %endmacro

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int __cdecl x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_avg_weight_w16_mmxext:
+cglobal x264_pixel_avg_weight_w16_mmxext
     BIWEIGHT_START_MMX
     mov     eax, [picesp+32]        ; i_height
 ALIGN 4
@@ -353,11 +328,10 @@ x264_pixel_avg_weight_w16_mmxext:
     jg      .height_loop
     BIWEIGHT_END_MMX

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int __cdecl x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_avg_weight_w8_mmxext:
+cglobal x264_pixel_avg_weight_w8_mmxext
     BIWEIGHT_START_MMX
     mov     eax, [picesp+32]
 ALIGN 4
@@ -374,11 +348,10 @@ x264_pixel_avg_weight_w8_mmxext:
     jg      .height_loop
     BIWEIGHT_END_MMX

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int __cdecl x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_avg_weight_4x4_mmxext:
+cglobal x264_pixel_avg_weight_4x4_mmxext
     BIWEIGHT_START_MMX
     BIWEIGHT_4P_MMX [edi     ], [edx     ]
     BIWEIGHT_4P_MMX [edi+esi ], [edx+ecx ]
@@ -394,12 +367,11 @@ x264_pixel_avg_weight_4x4_mmxext:
 ; pixel copy
 ;=============================================================================

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_mc_copy_w4_mmx( uint8_t *src, int i_src_stride,
 ;                           uint8_t *dst, int i_dst_stride, int i_height )
 ;-----------------------------------------------------------------------------
-x264_mc_copy_w4_mmx:
+cglobal x264_mc_copy_w4_mmx
     push    ebx
     push    esi
     push    edi
@@ -426,12 +398,11 @@ ALIGN 4
     pop     ebx
     ret

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_mc_copy_w8_mmx( uint8_t *src, int i_src_stride,
 ;                           uint8_t *dst, int i_dst_stride, int i_height )
 ;-----------------------------------------------------------------------------
-x264_mc_copy_w8_mmx:
+cglobal x264_mc_copy_w8_mmx
     push    ebx
     push    esi
     push    edi
@@ -464,12 +435,11 @@ ALIGN 4
     pop     ebx
     ret

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_mc_copy_w16_mmx( uint8_t *src, int i_src_stride,
 ;                            uint8_t *dst, int i_dst_stride, int i_height )
 ;-----------------------------------------------------------------------------
-x264_mc_copy_w16_mmx:
+cglobal x264_mc_copy_w16_mmx
     push    ebx
     push    esi
     push    edi
@@ -511,11 +481,10 @@ ALIGN 4
     ret

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_mc_copy_w16_sse2( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
 ;-----------------------------------------------------------------------------
-x264_mc_copy_w16_sse2:
+cglobal x264_mc_copy_w16_sse2
     push    ebx
     push    esi
     push    edi
@@ -549,7 +518,6 @@ ALIGN 4
 ; chroma MC
 ;=============================================================================

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
 ;                             uint8_t *dst, int i_dst_stride,
@@ -557,7 +525,7 @@ ALIGN 16
 ;                             int i_width, int i_height )
 ;-----------------------------------------------------------------------------
-x264_mc_chroma_mmxext:
+cglobal x264_mc_chroma_mmxext
     picpush ebx
     picgetgot ebx
     push    edi
@@ -656,8 +624,7 @@ ALIGN 4
 ; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
 ;                                 uint8_t *pix_uv, int stride_uv, int mb_x )
 ;-----------------------------------------------------------------------------
-ALIGN 16
-x264_prefetch_fenc_mmxext:
+cglobal x264_prefetch_fenc_mmxext
     mov     eax, [esp+20]
     mov     ecx, [esp+8]
     mov     edx, [esp+4]
@@ -683,8 +650,7 @@ x264_prefetch_fenc_mmxext:
 ;-----------------------------------------------------------------------------
 ; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
 ;-----------------------------------------------------------------------------
-ALIGN 16
-x264_prefetch_ref_mmxext:
+cglobal x264_prefetch_ref_mmxext
     mov     eax, [esp+12]
     mov     ecx, [esp+8]
     mov     edx, [esp+4]
diff --git a/common/i386/mc-a2.asm b/common/i386/mc-a2.asm
index b9eb7e50..211427b2 100644
--- a/common/i386/mc-a2.asm
+++ b/common/i386/mc-a2.asm
@@ -94,16 +94,11 @@ pw_32: times 4 dw 32

 SECTION .text
-cglobal x264_hpel_filter_mmxext
-cglobal x264_plane_copy_mmxext
-
 ;-----------------------------------------------------------------------------
 ; void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
 ;                               int i_stride, int i_width, int i_height );
 ;-----------------------------------------------------------------------------
-
-ALIGN 16
-x264_hpel_filter_mmxext :
+cglobal x264_hpel_filter_mmxext
     push    ebp
     mov     ebp, esp
     push    ebx
@@ -276,8 +271,7 @@ ALIGN 16
 ; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst,
 ;                              uint8_t *src, int i_src, int w, int h)
 ;-----------------------------------------------------------------------------
-ALIGN 16
-x264_plane_copy_mmxext:
+cglobal x264_plane_copy_mmxext
     push    edi
     push    esi
     push    ebx
diff --git a/common/i386/pixel-a.asm b/common/i386/pixel-a.asm
index dad09d99..ba81d873 100644
--- a/common/i386/pixel-a.asm
+++ b/common/i386/pixel-a.asm
@@ -438,64 +438,6 @@ BITS 32

 SECTION .text
-cglobal x264_pixel_sad_16x16_mmxext
-cglobal x264_pixel_sad_16x8_mmxext
-cglobal x264_pixel_sad_8x16_mmxext
-cglobal x264_pixel_sad_8x8_mmxext
-cglobal x264_pixel_sad_8x4_mmxext
-cglobal x264_pixel_sad_4x8_mmxext
-cglobal x264_pixel_sad_4x4_mmxext
-
-cglobal x264_pixel_sad_x3_16x16_mmxext
-cglobal x264_pixel_sad_x3_16x8_mmxext
-cglobal x264_pixel_sad_x3_8x16_mmxext
-cglobal x264_pixel_sad_x3_8x8_mmxext
-cglobal x264_pixel_sad_x3_8x4_mmxext
-cglobal x264_pixel_sad_x3_4x8_mmxext
-cglobal x264_pixel_sad_x3_4x4_mmxext
-
-cglobal x264_pixel_sad_x4_16x16_mmxext
-cglobal x264_pixel_sad_x4_16x8_mmxext
-cglobal x264_pixel_sad_x4_8x16_mmxext
-cglobal x264_pixel_sad_x4_8x8_mmxext
-cglobal x264_pixel_sad_x4_8x4_mmxext
-cglobal x264_pixel_sad_x4_4x8_mmxext
-cglobal x264_pixel_sad_x4_4x4_mmxext
-
-cglobal x264_pixel_sad_pde_16x16_mmxext
-cglobal x264_pixel_sad_pde_16x8_mmxext
-cglobal x264_pixel_sad_pde_8x16_mmxext
-
-cglobal x264_pixel_ssd_16x16_mmx
-cglobal x264_pixel_ssd_16x8_mmx
-cglobal x264_pixel_ssd_8x16_mmx
-cglobal x264_pixel_ssd_8x8_mmx
-cglobal x264_pixel_ssd_8x4_mmx
-cglobal x264_pixel_ssd_4x8_mmx
-cglobal x264_pixel_ssd_4x4_mmx
-
-cglobal x264_pixel_satd_4x4_mmxext
-cglobal x264_pixel_satd_4x8_mmxext
-cglobal x264_pixel_satd_8x4_mmxext
-cglobal x264_pixel_satd_8x8_mmxext
-cglobal x264_pixel_satd_16x8_mmxext
-cglobal x264_pixel_satd_8x16_mmxext
-cglobal x264_pixel_satd_16x16_mmxext
-
-cglobal x264_pixel_sa8d_16x16_mmxext
-cglobal x264_pixel_sa8d_8x8_mmxext
-
-cglobal x264_intra_satd_x3_4x4_mmxext
-cglobal x264_intra_satd_x3_8x8c_mmxext
-cglobal x264_intra_satd_x3_16x16_mmxext
-cglobal x264_intra_sa8d_x3_8x8_core_mmxext
-
-cglobal x264_pixel_ssim_4x4x2_core_mmxext
-
-cglobal x264_pixel_ads4_mmxext
-cglobal x264_pixel_ads2_mmxext
-cglobal x264_pixel_ads1_mmxext
-
 %macro SAD_START 0
     push    ebx
@@ -517,8 +459,7 @@ cglobal x264_pixel_ads1_mmxext
 ; int __cdecl x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
 %macro SAD 2
-ALIGN 16
-x264_pixel_sad_%1x%2_mmxext:
+cglobal x264_pixel_sad_%1x%2_mmxext
     SAD_START
 %rep %2/2
     SAD_INC_2x%1P
@@ -539,8 +480,7 @@ SAD 4, 4
 ;                                        uint8_t *pix2, int i_stride, int scores[3] )
 ;-----------------------------------------------------------------------------
 %macro SAD_X 3
-ALIGN 16
-x264_pixel_sad_x%1_%2x%3_mmxext:
+cglobal x264_pixel_sad_x%1_%2x%3_mmxext
     SAD_X%1_2x%2P 1
 %rep %3/2-1
     SAD_X%1_2x%2P 0
@@ -568,8 +508,7 @@ SAD_X 4, 4, 4
 ; int __cdecl x264_pixel_sad_pde_16x16_mmxext (uint8_t *, int, uint8_t *, int, int )
 ;-----------------------------------------------------------------------------
 %macro SAD_PDE 2
-ALIGN 16
-x264_pixel_sad_pde_%1x%2_mmxext:
+cglobal x264_pixel_sad_pde_%1x%2_mmxext
     SAD_START
 %rep %2/4
     SAD_INC_2x%1P
@@ -623,8 +562,7 @@ SAD_PDE 8, 16
 ; int __cdecl x264_pixel_ssd_16x16_mmx (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
 %macro SSD 2
-ALIGN 16
-x264_pixel_ssd_%1x%2_mmx:
+cglobal x264_pixel_ssd_%1x%2_mmx
     SSD_START
 %rep %2
     SSD_INC_1x%1P
@@ -662,31 +600,28 @@ SSD 4, 4
     ret
 %endmacro

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int __cdecl x264_pixel_satd_4x4_mmxext (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_4x4_mmxext:
+cglobal x264_pixel_satd_4x4_mmxext
     SATD_START
     LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
     SATD_END

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int __cdecl x264_pixel_satd_4x8_mmxext (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_4x8_mmxext:
+cglobal x264_pixel_satd_4x8_mmxext
     SATD_START
     LOAD_DIFF_HADAMARD_SUM mm0, 0, 1
     LOAD_DIFF_HADAMARD_SUM mm1, 0, 0
     paddw   mm0, mm1
     SATD_END

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int __cdecl x264_pixel_satd_8x4_mmxext (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_8x4_mmxext:
+cglobal x264_pixel_satd_8x4_mmxext
     SATD_START
     LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
     sub     eax, ebx
@@ -695,11 +630,10 @@ x264_pixel_satd_8x4_mmxext:
     paddw   mm0, mm1
     SATD_END

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int __cdecl x264_pixel_satd_8x8_mmxext (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_8x8_mmxext:
+cglobal x264_pixel_satd_8x8_mmxext
     SATD_START
     LOAD_DIFF_HADAMARD_SUM mm0, 0, 1
     LOAD_DIFF_HADAMARD_SUM mm1, 0, 0
@@ -713,11 +647,10 @@ x264_pixel_satd_8x8_mmxext:
     paddw   mm0, mm1
     SATD_END

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int __cdecl x264_pixel_satd_16x8_mmxext (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_16x8_mmxext:
+cglobal x264_pixel_satd_16x8_mmxext
     SATD_START
     LOAD_DIFF_HADAMARD_SUM mm0, 0, 1
     LOAD_DIFF_HADAMARD_SUM mm1, 0, 0
@@ -745,11 +678,10 @@ x264_pixel_satd_16x8_mmxext:
     paddw   mm0, mm1
     SATD_END

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int __cdecl x264_pixel_satd_8x16_mmxext (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_8x16_mmxext:
+cglobal x264_pixel_satd_8x16_mmxext
     SATD_START
     LOAD_DIFF_HADAMARD_SUM mm0, 0, 1
     LOAD_DIFF_HADAMARD_SUM mm1, 0, 1
@@ -771,11 +703,10 @@ x264_pixel_satd_8x16_mmxext:
     paddw   mm0, mm1
     SATD_END

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int __cdecl x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_16x16_mmxext:
+cglobal x264_pixel_satd_16x16_mmxext
     SATD_START
     LOAD_DIFF_HADAMARD_SUM mm0, 0, 1
     LOAD_DIFF_HADAMARD_SUM mm1, 0, 1
@@ -875,11 +806,10 @@ x264_pixel_satd_16x16_mmxext:
     paddw   mm0, mm1
 %endmacro

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int __cdecl x264_pixel_sa8d_8x8_mmxext( uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_sa8d_8x8_mmxext:
+cglobal x264_pixel_sa8d_8x8_mmxext
     SATD_START
     sub     esp, 0x70
 %define args  esp+0x74
@@ -952,12 +882,11 @@ x264_pixel_sa8d_8x8_mmxext:
 %undef spill
 %undef trans

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int __cdecl x264_pixel_sa8d_16x16_mmxext( uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
 ;; violates calling convention
-x264_pixel_sa8d_16x16_mmxext:
+cglobal x264_pixel_sa8d_16x16_mmxext
     push    esi
     push    edi
     push    ebp
@@ -1036,11 +965,10 @@ x264_pixel_sa8d_16x16_mmxext:
     %8      %3, %6
 %endmacro

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
 ;-----------------------------------------------------------------------------
-x264_intra_satd_x3_4x4_mmxext:
+cglobal x264_intra_satd_x3_4x4_mmxext
     push    ebx
     push    edi
     push    esi
@@ -1119,11 +1047,10 @@ x264_intra_satd_x3_4x4_mmxext:
     pop     ebx
     ret

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
 ;-----------------------------------------------------------------------------
-x264_intra_satd_x3_16x16_mmxext:
+cglobal x264_intra_satd_x3_16x16_mmxext
     push    ebx
     push    ebp
     push    edi
@@ -1250,11 +1177,10 @@ x264_intra_satd_x3_16x16_mmxext:
     pop     ebx
     ret

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
 ;-----------------------------------------------------------------------------
-x264_intra_satd_x3_8x8c_mmxext:
+cglobal x264_intra_satd_x3_8x8c_mmxext
     push    ebx
     push    ebp
     push    edi
@@ -1417,11 +1343,10 @@ x264_intra_satd_x3_8x8c_mmxext:
     movq    mm7, [spill]
 %endmacro

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *fenc, int16_t edges[2][8], int *res )
 ;-----------------------------------------------------------------------------
-x264_intra_sa8d_x3_8x8_core_mmxext:
+cglobal x264_intra_sa8d_x3_8x8_core_mmxext
     mov     eax, [esp+4]
     mov     ecx, [esp+8]
     sub     esp, 0x70
@@ -1582,8 +1507,7 @@ x264_intra_sa8d_x3_8x8_core_mmxext:
 ; void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
 ;                                         const uint8_t *pix2, int stride2, int sums[2][4] )
 ;-----------------------------------------------------------------------------
-ALIGN 16
-x264_pixel_ssim_4x4x2_core_mmxext:
+cglobal x264_pixel_ssim_4x4x2_core_mmxext
     push    ebx
     push    edi
     mov     ebx, [esp+16]
@@ -1646,8 +1570,7 @@ x264_pixel_ssim_4x4x2_core_mmxext:
 ; void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
 ;                              uint16_t *res, int width )
 ;-----------------------------------------------------------------------------
-ALIGN 16
-x264_pixel_ads4_mmxext:
+cglobal x264_pixel_ads4_mmxext
     push    ebx
     mov     eax, [esp+8]
     movq    mm6, [eax]
@@ -1685,8 +1608,7 @@ x264_pixel_ads4_mmxext:
     pop     ebx
     ret

-ALIGN 16
-x264_pixel_ads2_mmxext:
+cglobal x264_pixel_ads2_mmxext
     push    ebx
     mov     eax, [esp+8]
     movq    mm6, [eax]
@@ -1713,8 +1635,7 @@ x264_pixel_ads2_mmxext:
     pop     ebx
     ret

-ALIGN 16
-x264_pixel_ads1_mmxext:
+cglobal x264_pixel_ads1_mmxext
     mov     eax, [esp+4]
     pshufw  mm7, [eax], 0
     mov     eax, [esp+8]
diff --git a/common/i386/pixel-sse2.asm b/common/i386/pixel-sse2.asm
index 35ed5b31..e4aa7858 100644
--- a/common/i386/pixel-sse2.asm
+++ b/common/i386/pixel-sse2.asm
@@ -40,24 +40,6 @@ mask_ff: times 16 db 0xff

 SECTION .text
-
-cglobal x264_pixel_sad_16x16_sse2
-cglobal x264_pixel_sad_16x8_sse2
-cglobal x264_pixel_sad_x3_16x16_sse2
-cglobal x264_pixel_sad_x3_16x8_sse2
-cglobal x264_pixel_sad_x4_16x16_sse2
-cglobal x264_pixel_sad_x4_16x8_sse2
-cglobal x264_pixel_ssd_16x16_sse2
-cglobal x264_pixel_ssd_16x8_sse2
-cglobal x264_pixel_satd_8x4_sse2
-cglobal x264_pixel_satd_8x8_sse2
-cglobal x264_pixel_satd_16x8_sse2
-cglobal x264_pixel_satd_8x16_sse2
-cglobal x264_pixel_satd_16x16_sse2
-cglobal x264_pixel_ssim_4x4x2_core_sse2
-cglobal x264_pixel_ssim_end4_sse2
-
-
 %macro HADDW 2    ; sum junk
     ; ebx is no longer used at this point, so no push needed
     picgetgot ebx
@@ -106,11 +88,10 @@ cglobal x264_pixel_ssim_end4_sse2
     ret
 %endmacro

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int __cdecl x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_sad_16x16_sse2:
+cglobal x264_pixel_sad_16x16_sse2
     SAD_START_SSE2
     movdqu  xmm0, [ecx]
    movdqu  xmm1, [ecx+edx]
@@ -175,11 +156,10 @@ x264_pixel_sad_16x16_sse2:
     paddw   xmm0, xmm7
     SAD_END_SSE2

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int __cdecl x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_sad_16x8_sse2:
+cglobal x264_pixel_sad_16x8_sse2
     SAD_START_SSE2
     pxor    xmm0, xmm0
     SAD_INC_4x16P_SSE2
@@ -317,14 +297,12 @@ x264_pixel_sad_16x8_sse2:
     ret
 %endmacro

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
 ;                                    uint8_t *pix2, int i_stride, int scores[3] )
 ;-----------------------------------------------------------------------------
 %macro SAD_X 3
-ALIGN 16
-x264_pixel_sad_x%1_%2x%3_sse2:
+cglobal x264_pixel_sad_x%1_%2x%3_sse2
     SAD_X%1_2x%2P 1
 %rep %3/2-1
     SAD_X%1_2x%2P 0
@@ -400,22 +378,20 @@ SAD_X 4, 16, 8
     ret
 %endmacro

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int __cdecl x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_ssd_16x16_sse2:
+cglobal x264_pixel_ssd_16x16_sse2
     SSD_START_SSE2
 %rep 8
     SSD_INC_2x16P_SSE2
 %endrep
     SSD_END_SSE2

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int __cdecl x264_pixel_ssd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_ssd_16x8_sse2:
+cglobal x264_pixel_ssd_16x8_sse2
     SSD_START_SSE2
 %rep 4
     SSD_INC_2x16P_SSE2
@@ -543,11 +519,10 @@ x264_pixel_ssd_16x8_sse2:
     ret
 %endmacro

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int __cdecl x264_pixel_satd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_16x16_sse2:
+cglobal x264_pixel_satd_16x16_sse2
     SATD_START

     SATD_TWO_SSE2
@@ -567,11 +542,10 @@ x264_pixel_satd_16x16_sse2:

     SATD_END

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int __cdecl x264_pixel_satd_8x16_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_8x16_sse2:
+cglobal x264_pixel_satd_8x16_sse2
     SATD_START

     SATD_TWO_SSE2
@@ -581,11 +555,10 @@ x264_pixel_satd_8x16_sse2:

     SATD_END

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int __cdecl x264_pixel_satd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_16x8_sse2:
+cglobal x264_pixel_satd_16x8_sse2
     SATD_START

     SATD_TWO_SSE2
@@ -601,11 +574,10 @@ x264_pixel_satd_16x8_sse2:

     SATD_END

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int __cdecl x264_pixel_satd_8x8_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_8x8_sse2:
+cglobal x264_pixel_satd_8x8_sse2
     SATD_START

     SATD_TWO_SSE2
@@ -613,11 +585,10 @@ x264_pixel_satd_8x8_sse2:

     SATD_END

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; int __cdecl x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
-x264_pixel_satd_8x4_sse2:
+cglobal x264_pixel_satd_8x4_sse2
     SATD_START

     SATD_TWO_SSE2
@@ -630,8 +601,7 @@ x264_pixel_satd_8x4_sse2:
 ; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
 ;                                       const uint8_t *pix2, int stride2, int sums[2][4] )
 ;-----------------------------------------------------------------------------
-ALIGN 16
-x264_pixel_ssim_4x4x2_core_sse2:
+cglobal x264_pixel_ssim_4x4x2_core_sse2
     push    ebx
     mov     eax, [esp+ 8]
     mov     ebx, [esp+12]
@@ -687,8 +657,7 @@ x264_pixel_ssim_4x4x2_core_sse2:
 ;-----------------------------------------------------------------------------
 ; float x264_pixel_ssim_end_sse2( int sum0[5][4], int sum1[5][4], int width )
 ;-----------------------------------------------------------------------------
-ALIGN 16
-x264_pixel_ssim_end4_sse2:
+cglobal x264_pixel_ssim_end4_sse2
     mov     eax, [esp+ 4]
     mov     ecx, [esp+ 8]
     mov     edx, [esp+12]
diff --git a/common/i386/predict-a.asm b/common/i386/predict-a.asm
index 4be13ab0..01156cc8 100644
--- a/common/i386/predict-a.asm
+++ b/common/i386/predict-a.asm
@@ -69,22 +69,6 @@ pw_3210:

 SECTION .text
-cglobal predict_8x8_v_mmxext
-cglobal predict_8x8_dc_mmxext
-cglobal predict_8x8_dc_top_mmxext
-cglobal predict_8x8_dc_left_mmxext
-cglobal predict_8x8_ddl_mmxext
-cglobal predict_8x8_ddr_mmxext
-cglobal predict_8x8_vr_core_mmxext
-cglobal predict_8x8c_v_mmx
-cglobal predict_8x8c_dc_core_mmxext
-cglobal predict_8x8c_p_core_mmxext
-cglobal predict_16x16_p_core_mmxext
-cglobal predict_16x16_v_mmx
-cglobal predict_16x16_dc_core_mmxext
-cglobal predict_16x16_dc_top_mmxext
-
-
 ; dest, left, right, src, tmp
 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
 %macro PRED8x8_LOWPASS 5
@@ -101,9 +85,7 @@ cglobal predict_16x16_dc_top_mmxext
 ;-----------------------------------------------------------------------------
 ; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge )
 ;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_8x8_v_mmxext:
+cglobal predict_8x8_v_mmxext
     mov     eax, [esp+8]
     mov     edx, [esp+4]
     movq    mm0, [eax+16]
@@ -113,9 +95,7 @@ predict_8x8_v_mmxext:
 ;-----------------------------------------------------------------------------
 ; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge )
 ;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_8x8_dc_mmxext:
+cglobal predict_8x8_dc_mmxext
     picpush ebx
     picgetgot ebx
     mov     eax, [picesp + 8]
@@ -137,8 +117,7 @@ predict_8x8_dc_mmxext:
 ; void predict_8x8_top_mmxext( uint8_t *src, uint8_t *edge )
 ;-----------------------------------------------------------------------------
 %macro PRED8x8_DC 2
-ALIGN 16
-%1:
+cglobal %1
     picpush ebx
     picgetgot ebx
     mov     eax, [picesp + 8]
@@ -160,9 +139,7 @@ PRED8x8_DC predict_8x8_dc_left_mmxext, 7
 ;-----------------------------------------------------------------------------
 ; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge )
 ;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_8x8_ddl_mmxext:
+cglobal predict_8x8_ddl_mmxext
     picpush ebx
     picgetgot ebx
     mov     eax, [picesp + 8]
@@ -197,9 +174,7 @@ predict_8x8_ddl_mmxext:
 ;-----------------------------------------------------------------------------
 ; void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t *edge )
 ;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_8x8_ddr_mmxext:
+cglobal predict_8x8_ddr_mmxext
     picpush ebx
     picgetgot ebx
     mov     eax, [picesp + 8]
@@ -246,8 +221,7 @@ predict_8x8_ddr_mmxext:
 ; 6 .....
 ; 7 ,,,,,

-ALIGN 16
-predict_8x8_vr_core_mmxext:
+cglobal predict_8x8_vr_core_mmxext
     picpush ebx
     picgetgot ebx
     mov     eax, [picesp + 8]
@@ -276,9 +250,7 @@ predict_8x8_vr_core_mmxext:
 ;-----------------------------------------------------------------------------
 ; void predict_8x8c_v_mmx( uint8_t *src )
 ;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_8x8c_v_mmx :
+cglobal predict_8x8c_v_mmx
     mov     edx, [esp + 4]
     movq    mm0, [edx - FDEC_STRIDE]
     STORE8x8 mm0, mm0
@@ -287,9 +259,7 @@ predict_8x8c_v_mmx :
 ;-----------------------------------------------------------------------------
 ; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 )
 ;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_8x8c_dc_core_mmxext:
+cglobal predict_8x8c_dc_core_mmxext
     picpush ebx
     picgetgot ebx
@@ -326,9 +296,7 @@ predict_8x8c_dc_core_mmxext:
 ;-----------------------------------------------------------------------------
 ; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
 ;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_8x8c_p_core_mmxext:
+cglobal predict_8x8c_p_core_mmxext
     picpush ebx
     picgetgot ebx
@@ -366,10 +334,7 @@ ALIGN 4
 ;-----------------------------------------------------------------------------
 ; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c )
 ;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_16x16_p_core_mmxext:
-
+cglobal predict_16x16_p_core_mmxext
     picpush ebx
     picgetgot ebx
@@ -421,10 +386,7 @@ ALIGN 4
 ;-----------------------------------------------------------------------------
 ; void predict_16x16_v_mmx( uint8_t *src )
 ;-----------------------------------------------------------------------------
-
-ALIGN 16
-predict_16x16_v_mmx :
-
+cglobal predict_16x16_v_mmx
     mov     edx, [esp + 4]
     mov     ecx, FDEC_STRIDE
     sub     edx, ecx            ; edx <-- line -1
@@ -490,13 +452,11 @@ ALIGN 4
     pop     edi
 %endmacro

-ALIGN 16
-predict_16x16_dc_core_mmxext:
+cglobal predict_16x16_dc_core_mmxext
     PRED16x16_DC [esp+8], 5, esp
     ret

-ALIGN 16
-predict_16x16_dc_top_mmxext:
+cglobal predict_16x16_dc_top_mmxext
     picpush ebx
     picgetgot ebx
     PRED16x16_DC [pw_8 GOT_ebx], 4, picesp
diff --git a/common/i386/quant-a.asm b/common/i386/quant-a.asm
index 6435c2f5..b8860557 100644
--- a/common/i386/quant-a.asm
+++ b/common/i386/quant-a.asm
@@ -40,24 +40,6 @@ pd_1: times 2 dd 1

 SECTION .text
-cglobal x264_quant_2x2_dc_core15_mmx
-cglobal x264_quant_4x4_dc_core15_mmx
-cglobal x264_quant_4x4_core15_mmx
-cglobal x264_quant_8x8_core15_mmx
-
-cglobal x264_quant_2x2_dc_core16_mmxext
-cglobal x264_quant_4x4_dc_core16_mmxext
-cglobal x264_quant_4x4_core16_mmxext
-cglobal x264_quant_8x8_core16_mmxext
-
-cglobal x264_quant_2x2_dc_core32_mmxext
-cglobal x264_quant_4x4_dc_core32_mmxext
-cglobal x264_quant_4x4_core32_mmxext
-cglobal x264_quant_8x8_core32_mmxext
-
-cglobal x264_dequant_4x4_mmx
-cglobal x264_dequant_8x8_mmx
-
 %macro MMX_QUANT_AC_START 0
     mov     eax, [esp+ 4]       ; &dct[0][0]
     mov     ecx, [esp+ 8]       ; &quant_mf[0][0]
@@ -107,22 +89,20 @@ cglobal x264_dequant_8x8_mmx
     movq    %1, mm0     ; store
 %endmacro

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2],
 ;                                            int const i_qmf, int const i_qbits, int const f );
 ;-----------------------------------------------------------------------------
-x264_quant_2x2_dc_core15_mmx:
+cglobal x264_quant_2x2_dc_core15_mmx
     MMX_QUANT15_DC_START
     MMX_QUANT15_1x4 [eax], mm5, mm6, mm7
     ret

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_quant_4x4_dc_core15_mmx( int16_t dct[4][4],
 ;                                            int const i_qmf, int const i_qbits, int const f );
 ;-----------------------------------------------------------------------------
-x264_quant_4x4_dc_core15_mmx:
+cglobal x264_quant_4x4_dc_core15_mmx
     MMX_QUANT15_DC_START

 %rep 4
@@ -132,12 +112,11 @@ x264_quant_4x4_dc_core15_mmx:

     ret

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_quant_4x4_core15_mmx( int16_t dct[4][4],
 ;                                         int const quant_mf[4][4], int const i_qbits, int const f );
 ;-----------------------------------------------------------------------------
-x264_quant_4x4_core15_mmx:
+cglobal x264_quant_4x4_core15_mmx
     MMX_QUANT_AC_START

 %rep 4
@@ -150,12 +129,11 @@ x264_quant_4x4_core15_mmx:

     ret

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_quant_8x8_core15_mmx( int16_t dct[8][8],
 ;                                         int const quant_mf[8][8], int const i_qbits, int const f );
 ;-----------------------------------------------------------------------------
-x264_quant_8x8_core15_mmx:
+cglobal x264_quant_8x8_core15_mmx
     MMX_QUANT_AC_START

 %rep 16
@@ -210,22 +188,20 @@ x264_quant_8x8_core15_mmx:
     movq    %1, mm0     ; store
 %endmacro

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_quant_2x2_dc_core16_mmxext( int16_t dct[2][2],
 ;                                               int const i_qmf, int const i_qbits, int const f );
 ;-----------------------------------------------------------------------------
-x264_quant_2x2_dc_core16_mmxext:
+cglobal x264_quant_2x2_dc_core16_mmxext
     MMXEXT_QUANT16_DC_START
     MMXEXT_QUANT16_1x4 [eax], mm5, mm6, mm7
     ret

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_quant_4x4_dc_core16_mmxext( int16_t dct[4][4],
 ;                                               int const i_qmf, int const i_qbits, int const f );
 ;-----------------------------------------------------------------------------
-x264_quant_4x4_dc_core16_mmxext:
+cglobal x264_quant_4x4_dc_core16_mmxext
     MMXEXT_QUANT16_DC_START

 %rep 4
@@ -235,12 +211,11 @@ x264_quant_4x4_dc_core16_mmxext:

     ret

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_quant_4x4_core16_mmxext( int16_t dct[4][4],
 ;                                            int const quant_mf[4][4], int const i_qbits, int const f );
 ;-----------------------------------------------------------------------------
-x264_quant_4x4_core16_mmxext:
+cglobal x264_quant_4x4_core16_mmxext
     MMX_QUANT_AC_START

 %rep 4
@@ -254,12 +229,11 @@ x264_quant_4x4_core16_mmxext:

     ret

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_quant_8x8_core16_mmxext( int16_t dct[8][8],
 ;                                            int const quant_mf[8][8], int const i_qbits, int const f );
 ;-----------------------------------------------------------------------------
-x264_quant_8x8_core16_mmxext:
+cglobal x264_quant_8x8_core16_mmxext
     MMX_QUANT_AC_START

 %rep 16
@@ -321,22 +295,20 @@ x264_quant_8x8_core16_mmxext:
     movq    %1, mm0     ; store
 %endmacro

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_quant_2x2_dc_core32_mmxext( int16_t dct[2][2],
 ;                                               int const i_qmf, int const i_qbits, int const f );
 ;-----------------------------------------------------------------------------
-x264_quant_2x2_dc_core32_mmxext:
+cglobal x264_quant_2x2_dc_core32_mmxext
     MMX_QUANT32_DC_START
     MMXEXT_QUANT32_1x4 [eax], mm5, mm5, mm6, mm7
     ret

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_quant_4x4_dc_core32_mmxext( int16_t dct[4][4],
 ;                                               int const i_qmf, int const i_qbits, int const f );
 ;-----------------------------------------------------------------------------
-x264_quant_4x4_dc_core32_mmxext:
+cglobal x264_quant_4x4_dc_core32_mmxext
     MMX_QUANT32_DC_START

 %rep 4
@@ -346,12 +318,11 @@ x264_quant_4x4_dc_core32_mmxext:

     ret

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_quant_4x4_core32_mmxext( int16_t dct[4][4],
 ;                                            int const quant_mf[4][4], int const i_qbits, int const f );
 ;-----------------------------------------------------------------------------
-x264_quant_4x4_core32_mmxext:
+cglobal x264_quant_4x4_core32_mmxext
     MMX_QUANT_AC_START

 %rep 4
@@ -362,12 +333,11 @@ x264_quant_4x4_core32_mmxext:

     ret

-ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void __cdecl x264_quant_8x8_core32_mmxext( int16_t dct[8][8],
 ;                                            int const quant_mf[8][8], int const i_qbits, int const f );
 ;-----------------------------------------------------------------------------
-x264_quant_8x8_core32_mmxext:
+cglobal x264_quant_8x8_core32_mmxext
     MMX_QUANT_AC_START

 %rep 16
@@ -445,10 +415,11 @@ x264_quant_8x8_core32_mmxext:
     movq    %1, mm0
 %endmacro

+;-----------------------------------------------------------------------------
+; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+;-----------------------------------------------------------------------------
 %macro DEQUANT_WxH 3
-ALIGN 16
-;;; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
-%1:
+cglobal %1
     mov     edx, [esp+12]   ; i_qp
     imul    eax, edx, 0x2b
     shr     eax, 8          ; i_qbits = i_qp / 6
-- 
2.49.0
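One detail in the final DEQUANT_WxH hunk deserves a note: `imul eax, edx, 0x2b` followed by `shr eax, 8` computes i_qp/6 without a division, because 43/256 is close enough to 1/6 that (qp*43)>>8 equals qp/6 exactly for every value an encoder can pass here (the identity first fails at qp = 131, far above H.264's 0..51 range). A standalone sketch of the trick; qp_div6 is a made-up label for illustration, not part of x264:

BITS 32
qp_div6:                    ; in: [esp+4] = qp; out: eax = qp/6, edx = qp%6
    mov     edx, [esp+4]
    imul    eax, edx, 0x2b  ; qp * 43
    shr     eax, 8          ; (qp*43)>>8 == qp/6 for 0 <= qp <= 130
    lea     ecx, [eax+eax*2]
    add     ecx, ecx        ; ecx = 6*(qp/6)
    sub     edx, ecx        ; edx = qp - 6*(qp/6) = qp%6
    ret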