From 4cf272851a9d24aacdf664f27a87ebdbfb50e6c2 Mon Sep 17 00:00:00 2001
From: Loren Merritt
Date: Sun, 2 Dec 2012 15:56:30 +0000
Subject: [PATCH] x86inc: activate REP_RET automatically

Now RET checks whether it immediately follows a branch, so the programmer
doesn't have to keep track of that condition. REP_RET is still needed
manually when it's a branch target, but that's much rarer.

The implementation involves lots of spurious labels, but that's ok because
we strip them.
---
 common/x86/cabac-a.asm   |  4 ++--
 common/x86/dct-a.asm     |  4 ++--
 common/x86/deblock-a.asm | 28 ++++++++++++------------
 common/x86/mc-a.asm      | 44 +++++++++++++++++++-------------------
 common/x86/mc-a2.asm     | 46 ++++++++++++++++++++--------------------
 common/x86/predict-a.asm | 32 ++++++++++++++--------------
 common/x86/quant-a.asm   | 16 +++++++-------
 common/x86/x86inc.asm    | 36 ++++++++++++++++++++++++++-----
 tools/checkasm-a.asm     |  2 +-
 9 files changed, 119 insertions(+), 93 deletions(-)

diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm
index 0136511f..009b720b 100644
--- a/common/x86/cabac-a.asm
+++ b/common/x86/cabac-a.asm
@@ -139,13 +139,13 @@ cglobal cabac_encode_terminal_asm, 0,3
 ; can only be 0 or 1 and is zero over 99% of the time.
     test dword [t0+cb.range], 0x100
     je .renorm
-    REP_RET
+    RET
 .renorm:
     shl dword [t0+cb.low], 1
     shl dword [t0+cb.range], 1
     inc dword [t0+cb.queue]
     jge .putbyte
-    REP_RET
+    RET
 .putbyte:
     PROLOGUE 0,7
     mov t3d, [t0+cb.queue]
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index e16f7134..8ee94c20 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -555,7 +555,7 @@ cglobal add16x16_idct_dc, 2,3,8
     add r0, 4*FDEC_STRIDEB
     dec r2
     jg .loop
-    REP_RET
+    RET
 %endmacro ; ADD_IDCT_DC

 INIT_XMM sse2
@@ -664,7 +664,7 @@ cglobal add16x16_idct_dc, 2,3
     add r0, FDEC_STRIDE*4
     dec r2
     jg .loop
-    REP_RET
+    RET

 INIT_XMM sse2
 cglobal add16x16_idct_dc, 2,2,8
diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
index 210761a8..a8be80e7 100644
--- a/common/x86/deblock-a.asm
+++ b/common/x86/deblock-a.asm
@@ -378,7 +378,7 @@ cglobal deblock_v_luma, 5,5,15
     add r4, 2
     dec r3
     jg .loop
-    REP_RET
+    RET

 cglobal deblock_h_luma, 5,7,15
     add r1, r1
@@ -416,7 +416,7 @@ cglobal deblock_h_luma, 5,7,15
     lea r5, [r5+r1*8]
     dec r6
     jg .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
@@ -650,7 +650,7 @@ cglobal deblock_v_luma_intra, 4,7,16
     add r4, mmsize
     dec r6
     jg .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
@@ -1497,7 +1497,7 @@ cglobal deblock_%1_luma_intra, 4,6,16,ARCH_X86_64*0x50-0x50
     LUMA_INTRA_SWAP_PQ
     LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
 .end:
-    RET
+    REP_RET

 INIT_MMX cpuname
 %if ARCH_X86_64
@@ -1687,7 +1687,7 @@ cglobal deblock_v_chroma, 5,7,8
     add r4, mmsize/8
     dec r6
     jg .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void deblock_h_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
@@ -1706,7 +1706,7 @@ cglobal deblock_h_chroma, 5,7,8
     add r4, mmsize/8
     dec r5
     jg .loop
-    REP_RET
+    RET

 cglobal deblock_intra_body

@@ -1734,7 +1734,7 @@ cglobal deblock_v_chroma_intra, 4,6,8
     add r4, mmsize
     dec r5
     jg .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void deblock_h_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
@@ -1752,7 +1752,7 @@ cglobal deblock_h_chroma_intra, 4,6,8
     lea r0, [r0+r1*(mmsize/4)]
     dec r4
     jg .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void deblock_h_chroma_intra_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta )
@@ -1775,7 +1775,7 @@ cglobal deblock_h_chroma_intra_mbaff, 4,6,8
     dec r4
     jg .loop
 %endif
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void deblock_h_chroma_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
@@ -1803,7 +1803,7 @@ cglobal deblock_h_chroma_mbaff, 5,7,8
     dec r5
     jg .loop
 %endif
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void deblock_h_chroma_422_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
@@ -1821,7 +1821,7 @@ cglobal deblock_h_chroma_422_intra, 4,6,8
     lea r0, [r0+r1*(mmsize/4)]
     dec r4
     jg .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void deblock_h_chroma_422( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
@@ -1852,7 +1852,7 @@ cglobal deblock_h_chroma_422, 5,7,8
 %endif
     dec r5
     jg .loop
-    REP_RET
+    RET
 %endmacro ; DEBLOCK_CHROMA

 %if ARCH_X86_64 == 0
@@ -2020,7 +2020,7 @@ cglobal deblock_h_chroma_422, 5,8,8
     add r4, mmsize/8
     dec cntr
     jg .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmx2
@@ -2101,7 +2101,7 @@ cglobal deblock_h_chroma_422_intra, 4,7,8
     lea t5, [t5+r1*(mmsize/2)]
     dec r6d
     jg .loop
-    REP_RET
+    RET
 %endmacro ; DEBLOCK_CHROMA_INTRA

 INIT_XMM sse2
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index 923a2cd3..4b336816 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -87,7 +87,7 @@ cextern pd_32
     lea t0, [t0+t1*2*SIZEOF_PIXEL]
     sub eax, 2
     jg .height_loop
-    REP_RET
+    RET
 %endmacro

 %if HIGH_BIT_DEPTH
@@ -415,7 +415,7 @@ cglobal mc_weight_w%1, 6,6,8
     lea r2, [r2+r3*2]
     sub r5d, 2
     jg .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmx2
@@ -495,7 +495,7 @@ cglobal mc_offset%2_w%1, 6,6
     lea r2, [r2+r3*2]
     sub r5d, 2
     jg .loop
-    REP_RET
+    RET
 %endmacro

 %macro OFFSETPN 1
@@ -672,7 +672,7 @@ cglobal pixel_avg2_w%1, 6,7,4
     lea r0, [r0+r1*4]
     sub r5d, 2
     jg .height_loop
-    REP_RET
+    RET
 %endmacro

 %macro AVG2_W_TWO 3
@@ -707,7 +707,7 @@ cglobal pixel_avg2_w%1, 6,7,8
     lea r0, [r0+r1*4]
     sub r5d, 2
     jg .height_loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmx2
@@ -745,7 +745,7 @@ cglobal pixel_avg2_w10_mmx2, 6,7
     lea r0, [r0+r1*2*2]
     sub r5d, 2
     jg .height_loop
-    REP_RET
+    RET

 cglobal pixel_avg2_w16_mmx2, 6,7
     sub r4, r2
@@ -779,7 +779,7 @@ cglobal pixel_avg2_w16_mmx2, 6,7
     lea r0, [r0+r1*2*2]
     sub r5d, 2
     jg .height_loop
-    REP_RET
+    RET

 cglobal pixel_avg2_w18_mmx2, 6,7
     sub r4, r2
@@ -803,7 +803,7 @@ cglobal pixel_avg2_w18_mmx2, 6,7
     lea r0, [r0+r1*2]
     dec r5d
     jg .height_loop
-    REP_RET
+    RET

 INIT_XMM
 cglobal pixel_avg2_w18_sse2, 6,7,6
@@ -825,7 +825,7 @@ cglobal pixel_avg2_w18_sse2, 6,7,6
     lea r0, [r0+r1*2]
     dec r5d
     jg .height_loop
-    REP_RET
+    RET
 %endif ; HIGH_BIT_DEPTH

 %if HIGH_BIT_DEPTH == 0
@@ -849,7 +849,7 @@ cglobal pixel_avg2_w%1_mmx2, 6,7
     lea r0, [r0+r1*2]
     sub r5d, 2
     jg .height_loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX
@@ -877,7 +877,7 @@ cglobal pixel_avg2_w%1_mmx2, 6,7
     lea r0, [r0+r1*2]
     sub r5d, 2
     jg .height_loop
-    REP_RET
+    RET
 %endmacro

 AVG2_W16 12, movd
@@ -909,7 +909,7 @@ cglobal pixel_avg2_w20_mmx2, 6,7
     lea r0, [r0+r1*2]
     sub r5d, 2
     jg .height_loop
-    REP_RET
+    RET

 cglobal pixel_avg2_w16_sse2, 6,7
     sub r4, r2
@@ -927,7 +927,7 @@ cglobal pixel_avg2_w16_sse2, 6,7
     lea r0, [r0+r1*2]
     sub r5d, 2
     jg .height_loop
-    REP_RET
+    RET

 %macro AVG2_W20 1
 cglobal pixel_avg2_w20_%1, 6,7
@@ -959,7 +959,7 @@ cglobal pixel_avg2_w20_%1, 6,7
     lea r0, [r0+r1*2]
     sub r5d, 2
     jg .height_loop
-    REP_RET
+    RET
 %endmacro

 AVG2_W20 sse2
@@ -1022,7 +1022,7 @@ pixel_avg2_w%1_cache_mmx2:
     add r0, r1
     dec r5d
     jg .height_loop
-    REP_RET
+    RET
 %endmacro

 %macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
@@ -1226,7 +1226,7 @@ cglobal mc_copy_w%1, 5,7,8*(%%w/2)
     lea r0, [r0+r1*4]
     sub r4d, 4
     jg .height_loop
-    REP_RET
+    RET
 %endif
 %endmacro

@@ -1506,7 +1506,7 @@ ALIGN 4
     add r1, r2
     dec r5d
     jg .loop2
-    REP_RET
+    RET

 %if mmsize==8
 .width4:
@@ -1626,11 +1626,11 @@ ALIGN 4
     dec r5d
     jg .loop4
 %if mmsize!=8
-    REP_RET
+    RET
 %else
     sub dword r7m, 4
     jg .width8
-    REP_RET
+    RET
 .width8:
 %if ARCH_X86_64
     lea r3, [t2+8*SIZEOF_PIXEL]
@@ -1766,7 +1766,7 @@ ALIGN 4
     add r1, r2
     dec r5d
     jg .loop1d_w4
-    REP_RET
+    RET
 .mc1d_w8:
     sub r2, 4*SIZEOF_PIXEL
     sub r4, 8*SIZEOF_PIXEL
@@ -1848,7 +1848,7 @@ cglobal mc_chroma
     lea r1, [r1+r2*2]
     sub r5d, 2
     jg .loop4
-    REP_RET
+    RET

 .width8:
     movu m0, [r3]
@@ -1909,7 +1909,7 @@ cglobal mc_chroma
     lea r1, [r1+r2*2]
     sub r5d, 2
     jg .loop8
-    REP_RET
+    RET
 %endmacro

 %if HIGH_BIT_DEPTH
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index e5eab350..b0633794 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -210,7 +210,7 @@ cglobal hpel_filter_v, 5,6,11
     mova [r0+r4+mmsize], m4
     add r4, 2*mmsize
     jl .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width );
@@ -259,7 +259,7 @@ cglobal hpel_filter_c, 3,3,10
     mova [r0+r2], m1
     add r2, mmsize
     jl .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width );
@@ -302,7 +302,7 @@ cglobal hpel_filter_h, 3,4,8
     mova [r0+r2+mmsize], m4
     add r2, mmsize*2
     jl .loop
-    REP_RET
+    RET
 %endmacro ; HPEL_FILTER

 INIT_MMX mmx2
@@ -365,7 +365,7 @@ cglobal hpel_filter_v, 5,6,%1
     add r5, mmsize
     add r4, mmsize
     jl .loop
-    REP_RET
+    RET
 %endmacro

 ;-----------------------------------------------------------------------------
@@ -396,7 +396,7 @@ cglobal hpel_filter_c_mmx2, 3,3
     movntq [r0+r2], m1
     add r2, 8
     jl .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
@@ -440,7 +440,7 @@ cglobal hpel_filter_h_mmx2, 3,3
     movntq [r0+r2], m1
     add r2, 8
     jl .loop
-    REP_RET
+    RET

 INIT_XMM

@@ -510,7 +510,7 @@ cglobal hpel_filter_c, 3,3,9
     movntps [r0+r2], m4
     add r2, 16
     jl .loop
-    REP_RET
+    RET
 %endmacro

 ;-----------------------------------------------------------------------------
@@ -559,7 +559,7 @@ cglobal hpel_filter_h_sse2, 3,3,8
     movntps [r0+r2], m1
     add r2, 16
     jl .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
@@ -600,7 +600,7 @@ cglobal hpel_filter_h, 3,3
     movntps [r0+r2], m3
     add r2, 16
     jl .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmx2
@@ -1026,7 +1026,7 @@ cglobal store_interleave_chroma, 5,5
     lea r0, [r0+r1*2]
     sub r4d, 2
     jg .loop
-    REP_RET
+    RET
 %endmacro ; PLANE_INTERLEAVE

 %macro DEINTERLEAVE_START 0
@@ -1068,7 +1068,7 @@ cglobal plane_copy_deinterleave, 6,7
     add r4, r5
     dec dword r7m
     jg .loopy
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
@@ -1083,7 +1083,7 @@ cglobal load_deinterleave_chroma_fenc, 4,4
     lea r1, [r1+r2*2]
     sub r3d, 2
     jg .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
@@ -1098,7 +1098,7 @@ cglobal load_deinterleave_chroma_fdec, 4,4
     lea r1, [r1+r2*2]
     sub r3d, 2
     jg .loop
-    REP_RET
+    RET
 %endmacro ; PLANE_DEINTERLEAVE

 %if HIGH_BIT_DEPTH
@@ -1155,7 +1155,7 @@ cglobal memcpy_aligned_mmx, 3,3
     sub r2d, 32
     jg .copy32
 .ret
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void *memcpy_aligned( void *dst, const void *src, size_t n );
@@ -1207,7 +1207,7 @@ cglobal memzero_aligned, 2,2
 %endrep
     add r1, mmsize*8
     jl .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmx
@@ -1239,7 +1239,7 @@ cglobal integral_init4h_sse4, 3,4
     movdqa [r3+r2*2+16], m1
     add r2, 16
     jl .loop
-    REP_RET
+    RET

 %macro INTEGRAL_INIT8H 0
 cglobal integral_init8h, 3,4
@@ -1263,7 +1263,7 @@ cglobal integral_init8h, 3,4
     movdqa [r3+r2*2+16], m1
     add r2, 16
     jl .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse4
@@ -1290,7 +1290,7 @@ cglobal integral_init8v, 3,3
     mova [r0+r1+mmsize], m1
     add r1, 2*mmsize
     jl .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmx
@@ -1321,7 +1321,7 @@ cglobal integral_init4v_mmx, 3,5
     mova [r1+r2-8], m3
     sub r2, 8
     jge .loop
-    REP_RET
+    RET

 INIT_XMM
 cglobal integral_init4v_sse2, 3,5
@@ -1347,7 +1347,7 @@ cglobal integral_init4v_sse2, 3,5
     mova [r1+r2], m3
     add r2, 16
     jl .loop
-    REP_RET
+    RET

 cglobal integral_init4v_ssse3, 3,5
     shl r2, 1
@@ -1372,7 +1372,7 @@ cglobal integral_init4v_ssse3, 3,5
     mova [r1+r2], m3
     add r2, 16
     jl .loop
-    REP_RET
+    RET

 %macro FILT8x4 7
     mova %3, [r0+%7]
@@ -1732,7 +1732,7 @@ cglobal mbtree_propagate_cost, 7,7,7
     movdqa [r0+r6*2], xmm0
     add r6, 8
     jl .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
@@ -1786,4 +1786,4 @@ cglobal mbtree_propagate_cost, 7,7,8
     vmovdqu [r0+r6*2], ymm1
     add r6, 16
     jl .loop
-    REP_RET
+    RET
diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
index 460ecb75..0805aba2 100644
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -807,7 +807,7 @@ cglobal predict_8x8_dc, 2,2
     psrlw m0, 4
     SPLATW m0, m0
     STORE8x8 m0, m0
-    REP_RET
+    RET

 %else ; !HIGH_BIT_DEPTH
 INIT_MMX mmx2
@@ -1103,7 +1103,7 @@ ALIGN 4
     add r0, FDEC_STRIDE
     dec r1d
     jg .loop
-    REP_RET
+    RET
 %endmacro ; PREDICT_CHROMA_P_MMX

 INIT_MMX mmx2
@@ -1140,7 +1140,7 @@ cglobal predict_8x%1c_p_core, 1,2,7
     add r0, FDEC_STRIDEB
     dec r1d
     jg .loop
-    REP_RET
+    RET
 %else ; !HIGH_BIT_DEPTH
 cglobal predict_8x%1c_p_core, 1,2
     movd m0, r1m
@@ -1225,7 +1225,7 @@ ALIGN 4
     add r0, FDEC_STRIDE
     dec r1d
     jg .loop
-    REP_RET
+    RET
 %endif ; !ARCH_X86_64

 %macro PREDICT_16x16_P 0
@@ -1282,7 +1282,7 @@ ALIGN 4
     dec r1d
     jg .loop
 %endif ; !HIGH_BIT_DEPTH
-    REP_RET
+    RET
 %endmacro ; PREDICT_16x16_P

 INIT_XMM sse2
@@ -1996,20 +1996,20 @@ cglobal predict_16x16_v_mmx2, 1,2
     mova m2, [r0 - FDEC_STRIDEB+16]
     mova m3, [r0 - FDEC_STRIDEB+24]
     STORE16x16 m0, m1, m2, m3
-    REP_RET
+    RET
 INIT_XMM
 cglobal predict_16x16_v_sse2, 2,2
     mova m0, [r0 - FDEC_STRIDEB+ 0]
     mova m1, [r0 - FDEC_STRIDEB+16]
     STORE16x16_SSE2 m0, m1
-    REP_RET
+    RET
 %else ; !HIGH_BIT_DEPTH
 INIT_MMX
 cglobal predict_16x16_v_mmx2, 1,2
     movq m0, [r0 - FDEC_STRIDE + 0]
     movq m1, [r0 - FDEC_STRIDE + 8]
     STORE16x16 m0, m1
-    REP_RET
+    RET
 INIT_XMM
 cglobal predict_16x16_v_sse2, 1,1
     movdqa xmm0, [r0 - FDEC_STRIDE]
@@ -2055,7 +2055,7 @@ cglobal predict_16x16_h, 1,2
 %endif ; HIGH_BIT_DEPTH
     sub r1, 4*FDEC_STRIDEB
     jge .vloop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmx2
@@ -2106,12 +2106,12 @@ cglobal predict_16x16_dc_core, 1,2
 %else
     PRED16x16_DC r1m, 5
 %endif
-    REP_RET
+    RET

 INIT_MMX mmx2
 cglobal predict_16x16_dc_top, 1,2
     PRED16x16_DC [pw_8], 4
-    REP_RET
+    RET

 INIT_MMX mmx2
 %if HIGH_BIT_DEPTH
@@ -2119,14 +2119,14 @@ cglobal predict_16x16_dc_left_core, 1,2
     movd m0, r1m
     SPLATW m0, m0
     STORE16x16 m0, m0, m0, m0
-    REP_RET
+    RET
 %else ; !HIGH_BIT_DEPTH
 cglobal predict_16x16_dc_left_core, 1,1
     movd m0, r1m
     pshufw m0, m0, 0
     packuswb m0, m0
     STORE16x16 m0, m0
-    REP_RET
+    RET
 %endif

 ;-----------------------------------------------------------------------------
@@ -2159,11 +2159,11 @@ INIT_XMM sse2
 cglobal predict_16x16_dc_core, 2,2,4
     movd m3, r1m
     PRED16x16_DC_SSE2 m3, 5
-    REP_RET
+    RET

 cglobal predict_16x16_dc_top, 1,2
     PRED16x16_DC_SSE2 [pw_8], 4
-    REP_RET
+    RET

 INIT_XMM sse2
 %if HIGH_BIT_DEPTH
@@ -2171,7 +2171,7 @@ cglobal predict_16x16_dc_left_core, 1,2
     movd m0, r1m
     SPLATW m0, m0
     STORE16x16_SSE2 m0, m0
-    REP_RET
+    RET
 %else ; !HIGH_BIT_DEPTH
 cglobal predict_16x16_dc_left_core, 1,1
     movd m0, r1m
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 3c43220b..00889bb2 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -416,7 +416,7 @@ QUANT_AC quant_8x8, 8
     %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
     sub t0d, 16*%3
     jge %%loop
-    REP_RET
+    RET
 %else
     %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3]
     %1 [r0+(0 )*SIZEOF_PIXEL], [r1+0 ], [r1+ 8*%3]
@@ -738,7 +738,7 @@ cglobal optimize_chroma_2x2_dc, 0,%%regs,7
     PSIGND m5, m2, m1
     test t3d, t3d
     jnz .outer_loop_0
-    REP_RET
+    RET
 %endmacro

 %if HIGH_BIT_DEPTH == 0
@@ -783,7 +783,7 @@ cglobal denoise_dct, 4,4,8
     mova [r1+r3*4-1*mmsize], m5
     sub r3, mmsize/2
     jg .loop
-    REP_RET
+    RET
 %endmacro

 %if ARCH_X86_64 == 0
@@ -831,7 +831,7 @@ cglobal denoise_dct, 4,4,7
     mova [r1+r3*4-1*mmsize], m1
     sub r3, mmsize
     jg .loop
-    REP_RET
+    RET
 %endmacro

 %if ARCH_X86_64 == 0
@@ -954,7 +954,7 @@ cglobal decimate_score%1, 1,3
     jne .loop
 %endif
 .ret:
-    RET
+    REP_RET
 .ret9:
     mov eax, 9
     RET
@@ -1066,7 +1066,7 @@ cglobal decimate_score64, 1,5
 .tryret:
     xor r4, -1
     jne .cont
-    REP_RET
+    RET
 .ret9:
     mov eax, 9
     RET
@@ -1077,7 +1077,7 @@ cglobal decimate_score64, 1,5
     shr r3, cl
     shr r3, 1
     jne .loop
-    REP_RET
+    RET
 %endif ; ARCH
 %endmacro

@@ -1381,7 +1381,7 @@ cglobal coeff_level_run%1,0,7
     inc t6d
     sub t4d, t3d
     jge .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmx2
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index 7888903a..1b81ff54 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -125,8 +125,7 @@ CPU amdnop
 ; Pops anything that was pushed by PROLOGUE, and returns.

 ; REP_RET:
-; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
-; which are slow when a normal ret follows a branch.
+; Use this instead of RET if it's a branch target.

 ; registers:
 ; rN and rNq are the native-size register holding function argument N
@@ -473,7 +472,7 @@ DECLARE_REG 14, R15, 120
 %if mmsize == 32
     vzeroupper
 %endif
-    ret
+    AUTO_REP_RET
 %endmacro

 %elif ARCH_X86_64 ; *nix x64 ;=============================================
@@ -520,7 +519,7 @@ DECLARE_REG 14, R15, 72
 %if mmsize == 32
     vzeroupper
 %endif
-    ret
+    AUTO_REP_RET
 %endmacro

 %else ; X86_32 ;==============================================================
@@ -576,7 +575,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 %if mmsize == 32
     vzeroupper
 %endif
-    ret
+    AUTO_REP_RET
 %endmacro

 %endif ;======================================================================
@@ -590,14 +589,41 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
     %endmacro
 %endif

+; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
+; a branch or a branch target. So switch to a 2-byte form of ret in that case.
+; We can automatically detect "follows a branch", but not a branch target.
+; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
 %macro REP_RET 0
     %if has_epilogue
         RET
     %else
         rep ret
     %endif
 %endmacro

+%define last_branch_adr $$
+%macro AUTO_REP_RET 0
+    %ifndef cpuflags
+        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
+    %elif notcpuflag(ssse3)
+        times ((last_branch_adr-$)>>31)+1 rep
+    %endif
+    ret
+%endmacro
+
+%macro BRANCH_INSTR 0-*
+    %rep %0
+        %macro %1 1-2 %1
+            %2 %1
+            %%branch_instr:
+            %xdefine last_branch_adr %%branch_instr
+        %endmacro
+        %rotate 1
+    %endrep
+%endmacro
+
+BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
+
 %macro TAIL_CALL 2 ; callee, is_nonadjacent
     %if has_epilogue
         call %1
diff --git a/tools/checkasm-a.asm b/tools/checkasm-a.asm
index 47a4f65e..001eec82 100644
--- a/tools/checkasm-a.asm
+++ b/tools/checkasm-a.asm
@@ -199,7 +199,7 @@ cglobal checkasm_call, 1,7
     mov dword [r1], 0
     mov eax, r3
 .ok:
-    RET
+    REP_RET
 %endif ; ARCH_X86_64

-- 
2.40.0
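
A minimal sketch of how the new mechanism behaves, assuming x86inc.asm is
included and BRANCH_INSTR has wrapped the conditional jumps as in this patch.
The labels and loop body below are hypothetical, for illustration only:

    ; Each wrapped branch (e.g. jg) expands to the branch itself plus a
    ; macro-local label, and points last_branch_adr at that label.
    .loop:
        dec  r2
        jg   .loop   ; last_branch_adr now equals the address after the jg
        RET          ; AUTO_REP_RET sees $ == last_branch_adr, emits a rep
                     ; prefix, and this assembles as the 2-byte "rep ret"

    .target:         ; reached only by a jump from elsewhere; the assembler
        REP_RET      ; cannot detect a branch *target*, so this stays manual

The times expression in AUTO_REP_RET evaluates its count to 1 when RET
immediately follows a wrapped branch ($ equals last_branch_adr) and to 0 when
the most recent branch lies earlier in the code, so returns that don't follow
a branch keep the plain 1-byte ret.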