From 4cf272851a9d24aacdf664f27a87ebdbfb50e6c2 Mon Sep 17 00:00:00 2001
From: Loren Merritt
Date: Sun, 2 Dec 2012 15:56:30 +0000
Subject: [PATCH] x86inc: activate REP_RET automatically

Now RET checks whether it immediately follows a branch, so the programmer
doesn't have to keep track of that condition. REP_RET is still needed
manually when it's a branch target, but that's much rarer.

The implementation involves lots of spurious labels, but that's ok because
we strip them.
---
 common/x86/cabac-a.asm   |  4 ++--
 common/x86/dct-a.asm     |  4 ++--
 common/x86/deblock-a.asm | 28 ++++++++++++------------
 common/x86/mc-a.asm      | 44 +++++++++++++++++++-------------------
 common/x86/mc-a2.asm     | 46 ++++++++++++++++++++--------------------
 common/x86/predict-a.asm | 32 ++++++++++++++--------------
 common/x86/quant-a.asm   | 16 +++++++-------
 common/x86/x86inc.asm    | 36 ++++++++++++++++++++++++++-----
 tools/checkasm-a.asm     |  2 +-
 9 files changed, 119 insertions(+), 93 deletions(-)

diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm
index 0136511f..009b720b 100644
--- a/common/x86/cabac-a.asm
+++ b/common/x86/cabac-a.asm
@@ -139,13 +139,13 @@ cglobal cabac_encode_terminal_asm, 0,3
 ; can only be 0 or 1 and is zero over 99% of the time.
     test dword [t0+cb.range], 0x100
     je .renorm
-    REP_RET
+    RET
 .renorm:
     shl dword [t0+cb.low], 1
     shl dword [t0+cb.range], 1
     inc dword [t0+cb.queue]
     jge .putbyte
-    REP_RET
+    RET
 .putbyte:
     PROLOGUE 0,7
     mov t3d, [t0+cb.queue]
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index e16f7134..8ee94c20 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -555,7 +555,7 @@ cglobal add16x16_idct_dc, 2,3,8
     add r0, 4*FDEC_STRIDEB
     dec r2
     jg .loop
-    REP_RET
+    RET
 %endmacro ; ADD_IDCT_DC

 INIT_XMM sse2
@@ -664,7 +664,7 @@ cglobal add16x16_idct_dc, 2,3
     add r0, FDEC_STRIDE*4
     dec r2
     jg .loop
-    REP_RET
+    RET

 INIT_XMM sse2
 cglobal add16x16_idct_dc, 2,2,8
diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
index 210761a8..a8be80e7 100644
--- a/common/x86/deblock-a.asm
+++ b/common/x86/deblock-a.asm
@@ -378,7 +378,7 @@ cglobal deblock_v_luma, 5,5,15
     add r4, 2
     dec r3
     jg .loop
-    REP_RET
+    RET

 cglobal deblock_h_luma, 5,7,15
     add r1, r1
@@ -416,7 +416,7 @@ cglobal deblock_h_luma, 5,7,15
     lea r5, [r5+r1*8]
     dec r6
     jg .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
@@ -650,7 +650,7 @@ cglobal deblock_v_luma_intra, 4,7,16
     add r4, mmsize
     dec r6
     jg .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
@@ -1497,7 +1497,7 @@ cglobal deblock_%1_luma_intra, 4,6,16,ARCH_X86_64*0x50-0x50
     LUMA_INTRA_SWAP_PQ
     LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
 .end:
-    RET
+    REP_RET

 INIT_MMX cpuname
 %if ARCH_X86_64
@@ -1687,7 +1687,7 @@ cglobal deblock_v_chroma, 5,7,8
     add r4, mmsize/8
     dec r6
     jg .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void deblock_h_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
@@ -1706,7 +1706,7 @@ cglobal deblock_h_chroma, 5,7,8
     add r4, mmsize/8
     dec r5
     jg .loop
-    REP_RET
+    RET

 cglobal deblock_intra_body

@@ -1734,7 +1734,7 @@ cglobal deblock_v_chroma_intra, 4,6,8
     add r4, mmsize
     dec r5
     jg .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void deblock_h_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
@@ -1752,7 +1752,7 @@ cglobal deblock_h_chroma_intra, 4,6,8
     lea r0, [r0+r1*(mmsize/4)]
     dec r4
     jg .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void deblock_h_chroma_intra_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta )
@@ -1775,7 +1775,7 @@ cglobal deblock_h_chroma_intra_mbaff, 4,6,8
     dec r4
     jg .loop
 %endif
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void deblock_h_chroma_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
@@ -1803,7 +1803,7 @@ cglobal deblock_h_chroma_mbaff, 5,7,8
     dec r5
     jg .loop
 %endif
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void deblock_h_chroma_422_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
@@ -1821,7 +1821,7 @@ cglobal deblock_h_chroma_422_intra, 4,6,8
     lea r0, [r0+r1*(mmsize/4)]
     dec r4
     jg .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void deblock_h_chroma_422( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
@@ -1852,7 +1852,7 @@ cglobal deblock_h_chroma_422, 5,7,8
 %endif
     dec r5
     jg .loop
-    REP_RET
+    RET
 %endmacro ; DEBLOCK_CHROMA

 %if ARCH_X86_64 == 0
@@ -2020,7 +2020,7 @@ cglobal deblock_h_chroma_422, 5,8,8
     add r4, mmsize/8
     dec cntr
     jg .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmx2
@@ -2101,7 +2101,7 @@ cglobal deblock_h_chroma_422_intra, 4,7,8
     lea t5, [t5+r1*(mmsize/2)]
     dec r6d
     jg .loop
-    REP_RET
+    RET
 %endmacro ; DEBLOCK_CHROMA_INTRA

 INIT_XMM sse2
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index 923a2cd3..4b336816 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -87,7 +87,7 @@ cextern pd_32
     lea t0, [t0+t1*2*SIZEOF_PIXEL]
     sub eax, 2
     jg .height_loop
-    REP_RET
+    RET
 %endmacro

 %if HIGH_BIT_DEPTH
@@ -415,7 +415,7 @@ cglobal mc_weight_w%1, 6,6,8
     lea r2, [r2+r3*2]
     sub r5d, 2
     jg .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmx2
@@ -495,7 +495,7 @@ cglobal mc_offset%2_w%1, 6,6
     lea r2, [r2+r3*2]
     sub r5d, 2
     jg .loop
-    REP_RET
+    RET
 %endmacro

 %macro OFFSETPN 1
@@ -672,7 +672,7 @@ cglobal pixel_avg2_w%1, 6,7,4
     lea r0, [r0+r1*4]
     sub r5d, 2
     jg .height_loop
-    REP_RET
+    RET
 %endmacro

 %macro AVG2_W_TWO 3
@@ -707,7 +707,7 @@ cglobal pixel_avg2_w%1, 6,7,8
     lea r0, [r0+r1*4]
     sub r5d, 2
     jg .height_loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmx2
@@ -745,7 +745,7 @@ cglobal pixel_avg2_w10_mmx2, 6,7
     lea r0, [r0+r1*2*2]
     sub r5d, 2
     jg .height_loop
-    REP_RET
+    RET

 cglobal pixel_avg2_w16_mmx2, 6,7
     sub r4, r2
@@ -779,7 +779,7 @@ cglobal pixel_avg2_w16_mmx2, 6,7
     lea r0, [r0+r1*2*2]
     sub r5d, 2
     jg .height_loop
-    REP_RET
+    RET

 cglobal pixel_avg2_w18_mmx2, 6,7
     sub r4, r2
@@ -803,7 +803,7 @@ cglobal pixel_avg2_w18_mmx2, 6,7
     lea r0, [r0+r1*2]
     dec r5d
     jg .height_loop
-    REP_RET
+    RET

 INIT_XMM
 cglobal pixel_avg2_w18_sse2, 6,7,6
@@ -825,7 +825,7 @@ cglobal pixel_avg2_w18_sse2, 6,7,6
     lea r0, [r0+r1*2]
     dec r5d
     jg .height_loop
-    REP_RET
+    RET
 %endif ; HIGH_BIT_DEPTH

 %if HIGH_BIT_DEPTH == 0
@@ -849,7 +849,7 @@ cglobal pixel_avg2_w%1_mmx2, 6,7
     lea r0, [r0+r1*2]
     sub r5d, 2
     jg .height_loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX
@@ -877,7 +877,7 @@ cglobal pixel_avg2_w%1_mmx2, 6,7
     lea r0, [r0+r1*2]
     sub r5d, 2
     jg .height_loop
-    REP_RET
+    RET
 %endmacro

 AVG2_W16 12, movd
@@ -909,7 +909,7 @@ cglobal pixel_avg2_w20_mmx2, 6,7
     lea r0, [r0+r1*2]
     sub r5d, 2
     jg .height_loop
-    REP_RET
+    RET

 cglobal pixel_avg2_w16_sse2, 6,7
     sub r4, r2
@@ -927,7 +927,7 @@ cglobal pixel_avg2_w16_sse2, 6,7
     lea r0, [r0+r1*2]
     sub r5d, 2
     jg .height_loop
-    REP_RET
+    RET

 %macro AVG2_W20 1
 cglobal pixel_avg2_w20_%1, 6,7
@@ -959,7 +959,7 @@ cglobal pixel_avg2_w20_%1, 6,7
     lea r0, [r0+r1*2]
     sub r5d, 2
     jg .height_loop
-    REP_RET
+    RET
 %endmacro

 AVG2_W20 sse2
@@ -1022,7 +1022,7 @@ pixel_avg2_w%1_cache_mmx2:
     add r0, r1
     dec r5d
     jg .height_loop
-    REP_RET
+    RET
 %endmacro

 %macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
@@ -1226,7 +1226,7 @@ cglobal mc_copy_w%1, 5,7,8*(%%w/2)
     lea r0, [r0+r1*4]
     sub r4d, 4
     jg .height_loop
-    REP_RET
+    RET
 %endif
 %endmacro

@@ -1506,7 +1506,7 @@ ALIGN 4
     add r1, r2
     dec r5d
     jg .loop2
-    REP_RET
+    RET

 %if mmsize==8
 .width4:
@@ -1626,11 +1626,11 @@ ALIGN 4
     dec r5d
     jg .loop4
 %if mmsize!=8
-    REP_RET
+    RET
 %else
     sub dword r7m, 4
     jg .width8
-    REP_RET
+    RET
 .width8:
 %if ARCH_X86_64
     lea r3, [t2+8*SIZEOF_PIXEL]
@@ -1766,7 +1766,7 @@ ALIGN 4
     add r1, r2
     dec r5d
     jg .loop1d_w4
-    REP_RET
+    RET
 .mc1d_w8:
     sub r2, 4*SIZEOF_PIXEL
     sub r4, 8*SIZEOF_PIXEL
@@ -1848,7 +1848,7 @@ cglobal mc_chroma
     lea r1, [r1+r2*2]
     sub r5d, 2
     jg .loop4
-    REP_RET
+    RET

 .width8:
     movu m0, [r3]
@@ -1909,7 +1909,7 @@ cglobal mc_chroma
     lea r1, [r1+r2*2]
     sub r5d, 2
     jg .loop8
-    REP_RET
+    RET
 %endmacro

 %if HIGH_BIT_DEPTH
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index e5eab350..b0633794 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -210,7 +210,7 @@ cglobal hpel_filter_v, 5,6,11
     mova [r0+r4+mmsize], m4
     add r4, 2*mmsize
     jl .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width );
@@ -259,7 +259,7 @@ cglobal hpel_filter_c, 3,3,10
     mova [r0+r2], m1
     add r2, mmsize
     jl .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width );
@@ -302,7 +302,7 @@ cglobal hpel_filter_h, 3,4,8
     mova [r0+r2+mmsize], m4
     add r2, mmsize*2
     jl .loop
-    REP_RET
+    RET
 %endmacro ; HPEL_FILTER

 INIT_MMX mmx2
@@ -365,7 +365,7 @@ cglobal hpel_filter_v, 5,6,%1
     add r5, mmsize
     add r4, mmsize
     jl .loop
-    REP_RET
+    RET
 %endmacro

 ;-----------------------------------------------------------------------------
@@ -396,7 +396,7 @@ cglobal hpel_filter_c_mmx2, 3,3
     movntq [r0+r2], m1
     add r2, 8
     jl .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
@@ -440,7 +440,7 @@ cglobal hpel_filter_h_mmx2, 3,3
     movntq [r0+r2], m1
     add r2, 8
     jl .loop
-    REP_RET
+    RET

 INIT_XMM

@@ -510,7 +510,7 @@ cglobal hpel_filter_c, 3,3,9
     movntps [r0+r2], m4
     add r2, 16
     jl .loop
-    REP_RET
+    RET
 %endmacro

 ;-----------------------------------------------------------------------------
@@ -559,7 +559,7 @@ cglobal hpel_filter_h_sse2, 3,3,8
     movntps [r0+r2], m1
     add r2, 16
     jl .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
@@ -600,7 +600,7 @@ cglobal hpel_filter_h, 3,3
     movntps [r0+r2], m3
     add r2, 16
     jl .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmx2
@@ -1026,7 +1026,7 @@ cglobal store_interleave_chroma, 5,5
     lea r0, [r0+r1*2]
     sub r4d, 2
     jg .loop
-    REP_RET
+    RET
 %endmacro ; PLANE_INTERLEAVE

 %macro DEINTERLEAVE_START 0
@@ -1068,7 +1068,7 @@ cglobal plane_copy_deinterleave, 6,7
     add r4, r5
     dec dword r7m
     jg .loopy
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
@@ -1083,7 +1083,7 @@ cglobal load_deinterleave_chroma_fenc, 4,4
     lea r1, [r1+r2*2]
     sub r3d, 2
     jg .loop
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
@@ -1098,7 +1098,7 @@ cglobal load_deinterleave_chroma_fdec, 4,4
     lea r1, [r1+r2*2]
     sub r3d, 2
     jg .loop
-    REP_RET
+    RET
 %endmacro ; PLANE_DEINTERLEAVE

 %if HIGH_BIT_DEPTH
@@ -1155,7 +1155,7 @@ cglobal memcpy_aligned_mmx, 3,3
     sub r2d, 32
     jg .copy32
 .ret
-    REP_RET
+    RET

 ;-----------------------------------------------------------------------------
 ; void *memcpy_aligned( void *dst, const void *src, size_t n );
@@ -1207,7 +1207,7 @@ cglobal memzero_aligned, 2,2
 %endrep
     add r1, mmsize*8
     jl .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmx
@@ -1239,7 +1239,7 @@ cglobal integral_init4h_sse4, 3,4
     movdqa [r3+r2*2+16], m1
     add r2, 16
     jl .loop
-    REP_RET
+    RET

 %macro INTEGRAL_INIT8H 0
 cglobal integral_init8h, 3,4
@@ -1263,7 +1263,7 @@ cglobal integral_init8h, 3,4
     movdqa [r3+r2*2+16], m1
     add r2, 16
     jl .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse4
@@ -1290,7 +1290,7 @@ cglobal integral_init8v, 3,3
     mova [r0+r1+mmsize], m1
     add r1, 2*mmsize
     jl .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmx
@@ -1321,7 +1321,7 @@ cglobal integral_init4v_mmx, 3,5
     mova [r1+r2-8], m3
     sub r2, 8
     jge .loop
-    REP_RET
+    RET

 INIT_XMM
 cglobal integral_init4v_sse2, 3,5
@@ -1347,7 +1347,7 @@ cglobal integral_init4v_sse2, 3,5
     mova [r1+r2], m3
     add r2, 16
     jl .loop
-    REP_RET
+    RET

 cglobal integral_init4v_ssse3, 3,5
     shl r2, 1
@@ -1372,7 +1372,7 @@ cglobal integral_init4v_ssse3, 3,5
     mova [r1+r2], m3
     add r2, 16
     jl .loop
-    REP_RET
+    RET

 %macro FILT8x4 7
     mova %3, [r0+%7]
@@ -1732,7 +1732,7 @@ cglobal mbtree_propagate_cost, 7,7,7
     movdqa [r0+r6*2], xmm0
     add r6, 8
     jl .loop
-    REP_RET
+    RET
 %endmacro

 INIT_XMM sse2
@@ -1786,4 +1786,4 @@ cglobal mbtree_propagate_cost, 7,7,8
     vmovdqu [r0+r6*2], ymm1
     add r6, 16
     jl .loop
-    REP_RET
+    RET
diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
index 460ecb75..0805aba2 100644
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -807,7 +807,7 @@ cglobal predict_8x8_dc, 2,2
     psrlw m0, 4
     SPLATW m0, m0
     STORE8x8 m0, m0
-    REP_RET
+    RET

 %else ; !HIGH_BIT_DEPTH
 INIT_MMX mmx2
@@ -1103,7 +1103,7 @@ ALIGN 4
     add r0, FDEC_STRIDE
     dec r1d
     jg .loop
-    REP_RET
+    RET
 %endmacro ; PREDICT_CHROMA_P_MMX

 INIT_MMX mmx2
@@ -1140,7 +1140,7 @@ cglobal predict_8x%1c_p_core, 1,2,7
     add r0, FDEC_STRIDEB
     dec r1d
     jg .loop
-    REP_RET
+    RET
 %else ; !HIGH_BIT_DEPTH
 cglobal predict_8x%1c_p_core, 1,2
     movd m0, r1m
@@ -1225,7 +1225,7 @@ ALIGN 4
     add r0, FDEC_STRIDE
     dec r1d
     jg .loop
-    REP_RET
+    RET
 %endif ; !ARCH_X86_64

 %macro PREDICT_16x16_P 0
@@ -1282,7 +1282,7 @@ ALIGN 4
     dec r1d
     jg .loop
 %endif ; !HIGH_BIT_DEPTH
-    REP_RET
+    RET
 %endmacro ; PREDICT_16x16_P

 INIT_XMM sse2
@@ -1996,20 +1996,20 @@ cglobal predict_16x16_v_mmx2, 1,2
     mova m2, [r0 - FDEC_STRIDEB+16]
     mova m3, [r0 - FDEC_STRIDEB+24]
     STORE16x16 m0, m1, m2, m3
-    REP_RET
+    RET
 INIT_XMM
 cglobal predict_16x16_v_sse2, 2,2
     mova m0, [r0 - FDEC_STRIDEB+ 0]
     mova m1, [r0 - FDEC_STRIDEB+16]
     STORE16x16_SSE2 m0, m1
-    REP_RET
+    RET
 %else ; !HIGH_BIT_DEPTH
 INIT_MMX
 cglobal predict_16x16_v_mmx2, 1,2
     movq m0, [r0 - FDEC_STRIDE + 0]
     movq m1, [r0 - FDEC_STRIDE + 8]
     STORE16x16 m0, m1
-    REP_RET
+    RET
 INIT_XMM
 cglobal predict_16x16_v_sse2, 1,1
     movdqa xmm0, [r0 - FDEC_STRIDE]
@@ -2055,7 +2055,7 @@ cglobal predict_16x16_h, 1,2
 %endif ; HIGH_BIT_DEPTH
     sub r1, 4*FDEC_STRIDEB
     jge .vloop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmx2
@@ -2106,12 +2106,12 @@ cglobal predict_16x16_dc_core, 1,2
 %else
     PRED16x16_DC r1m, 5
 %endif
-    REP_RET
+    RET

 INIT_MMX mmx2
 cglobal predict_16x16_dc_top, 1,2
     PRED16x16_DC [pw_8], 4
-    REP_RET
+    RET

 INIT_MMX mmx2
 %if HIGH_BIT_DEPTH
@@ -2119,14 +2119,14 @@ cglobal predict_16x16_dc_left_core, 1,2
     movd m0, r1m
     SPLATW m0, m0
     STORE16x16 m0, m0, m0, m0
-    REP_RET
+    RET
 %else ; !HIGH_BIT_DEPTH
 cglobal predict_16x16_dc_left_core, 1,1
     movd m0, r1m
     pshufw m0, m0, 0
     packuswb m0, m0
     STORE16x16 m0, m0
-    REP_RET
+    RET
 %endif

 ;-----------------------------------------------------------------------------
@@ -2159,11 +2159,11 @@ INIT_XMM sse2
 cglobal predict_16x16_dc_core, 2,2,4
     movd m3, r1m
     PRED16x16_DC_SSE2 m3, 5
-    REP_RET
+    RET

 cglobal predict_16x16_dc_top, 1,2
     PRED16x16_DC_SSE2 [pw_8], 4
-    REP_RET
+    RET

 INIT_XMM sse2
 %if HIGH_BIT_DEPTH
@@ -2171,7 +2171,7 @@ cglobal predict_16x16_dc_left_core, 1,2
     movd m0, r1m
     SPLATW m0, m0
     STORE16x16_SSE2 m0, m0
-    REP_RET
+    RET
 %else ; !HIGH_BIT_DEPTH
 cglobal predict_16x16_dc_left_core, 1,1
     movd m0, r1m
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 3c43220b..00889bb2 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -416,7 +416,7 @@ QUANT_AC quant_8x8, 8
     %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
     sub t0d, 16*%3
     jge %%loop
-    REP_RET
+    RET
 %else
     %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3]
     %1 [r0+(0 )*SIZEOF_PIXEL], [r1+0 ], [r1+ 8*%3]
@@ -738,7 +738,7 @@ cglobal optimize_chroma_2x2_dc, 0,%%regs,7
     PSIGND m5, m2, m1
     test t3d, t3d
     jnz .outer_loop_0
-    REP_RET
+    RET
 %endmacro

 %if HIGH_BIT_DEPTH == 0
@@ -783,7 +783,7 @@ cglobal denoise_dct, 4,4,8
     mova [r1+r3*4-1*mmsize], m5
     sub r3, mmsize/2
     jg .loop
-    REP_RET
+    RET
 %endmacro

 %if ARCH_X86_64 == 0
@@ -831,7 +831,7 @@ cglobal denoise_dct, 4,4,7
     mova [r1+r3*4-1*mmsize], m1
     sub r3, mmsize
     jg .loop
-    REP_RET
+    RET
 %endmacro

 %if ARCH_X86_64 == 0
@@ -954,7 +954,7 @@ cglobal decimate_score%1, 1,3
     jne .loop
 %endif
 .ret:
-    RET
+    REP_RET
 .ret9:
     mov eax, 9
     RET
@@ -1066,7 +1066,7 @@ cglobal decimate_score64, 1,5
 .tryret:
     xor r4, -1
     jne .cont
-    REP_RET
+    RET
 .ret9:
     mov eax, 9
     RET
@@ -1077,7 +1077,7 @@ cglobal decimate_score64, 1,5
     shr r3, cl
     shr r3, 1
     jne .loop
-    REP_RET
+    RET
 %endif ; ARCH
 %endmacro

@@ -1381,7 +1381,7 @@ cglobal coeff_level_run%1,0,7
     inc t6d
     sub t4d, t3d
     jge .loop
-    REP_RET
+    RET
 %endmacro

 INIT_MMX mmx2
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index 7888903a..1b81ff54 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -125,8 +125,7 @@ CPU amdnop
 ; Pops anything that was pushed by PROLOGUE, and returns.

 ; REP_RET:
-; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
-; which are slow when a normal ret follows a branch.
+; Use this instead of RET if it's a branch target.

 ; registers:
 ; rN and rNq are the native-size register holding function argument N
@@ -473,7 +472,7 @@ DECLARE_REG 14, R15, 120
 %if mmsize == 32
     vzeroupper
 %endif
-    ret
+    AUTO_REP_RET
 %endmacro

 %elif ARCH_X86_64 ; *nix x64 ;=============================================
@@ -520,7 +519,7 @@ DECLARE_REG 14, R15, 72
 %if mmsize == 32
     vzeroupper
 %endif
-    ret
+    AUTO_REP_RET
 %endmacro

 %else ; X86_32 ;==============================================================
@@ -576,7 +575,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 %if mmsize == 32
     vzeroupper
 %endif
-    ret
+    AUTO_REP_RET
 %endmacro

 %endif ;======================================================================
@@ -590,14 +589,41 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
     %endmacro
 %endif

+; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
+; a branch or a branch target. So switch to a 2-byte form of ret in that case.
+; We can automatically detect "follows a branch", but not a branch target.
+; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
 %macro REP_RET 0
     %if has_epilogue
         RET
     %else
         rep ret
     %endif
 %endmacro

+%define last_branch_adr $$
+%macro AUTO_REP_RET 0
+    %ifndef cpuflags
+        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
+    %elif notcpuflag(ssse3)
+        times ((last_branch_adr-$)>>31)+1 rep
+    %endif
+    ret
+%endmacro
+
+%macro BRANCH_INSTR 0-*
+    %rep %0
+        %macro %1 1-2 %1
+            %2 %1
+            %%branch_instr:
+            %xdefine last_branch_adr %%branch_instr
+        %endmacro
+        %rotate 1
+    %endrep
+%endmacro
+
+BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
+
 %macro TAIL_CALL 2 ; callee, is_nonadjacent
     %if has_epilogue
         call %1
diff --git a/tools/checkasm-a.asm b/tools/checkasm-a.asm
index 47a4f65e..001eec82 100644
--- a/tools/checkasm-a.asm
+++ b/tools/checkasm-a.asm
@@ -199,7 +199,7 @@ cglobal checkasm_call, 1,7
     mov dword [r1], 0
     mov eax, r3
 .ok:
-    RET
+    REP_RET
 %endif ; ARCH_X86_64

-- 
2.40.0
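
A minimal sketch of how the new mechanism behaves, assuming x86inc.asm is
included and BRANCH_INSTR has wrapped the conditional jumps as in this patch.
The labels and loop body below are hypothetical, for illustration only:

    ; Each wrapped branch (e.g. jg) expands to the branch itself plus a
    ; macro-local label, and points last_branch_adr at that label.
    .loop:
        dec  r2
        jg   .loop   ; last_branch_adr now equals the address after the jg
        RET          ; AUTO_REP_RET sees $ == last_branch_adr, emits a rep
                     ; prefix, and this assembles as the 2-byte "rep ret"

    .target:         ; reached only by a jump from elsewhere; the assembler
        REP_RET      ; cannot detect a branch *target*, so this stays manual

The times expression in AUTO_REP_RET evaluates its count to 1 when RET
immediately follows a wrapped branch ($ equals last_branch_adr) and to 0 when
the most recent branch lies earlier in the code, so returns that don't follow
a branch keep the plain 1-byte ret.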