From 58e12b0e792596ae4eac95df4ae358ca664a6c20 Mon Sep 17 00:00:00 2001
From: Loren Merritt
Date: Sun, 1 Oct 2006 13:08:42 +0000
Subject: [PATCH] tweak motion compensation amd64 asm. 0.3% overall speedup.

git-svn-id: svn://svn.videolan.org/x264/trunk@575 df754926-b1dd-0310-bc7b-ec298dee348c
---
 common/amd64/mc-a.asm | 71 ++++++++++++++++++-------------------------
 1 file changed, 30 insertions(+), 41 deletions(-)

diff --git a/common/amd64/mc-a.asm b/common/amd64/mc-a.asm
index f0b2cacd..85e9df44 100644
--- a/common/amd64/mc-a.asm
+++ b/common/amd64/mc-a.asm
@@ -86,24 +86,22 @@ ALIGN 16
 x264_pixel_avg_w4_mmxext:
     mov     r10, parm5q         ; src2
     movsxd  r11, parm6d         ; i_src2_stride
-    movsxd  rax, parm7d         ; i_height
+    mov     eax, parm7d         ; i_height
 
 ALIGN 4
 .height_loop
     movd    mm0, [parm3q]
-    pavgb   mm0, [r10]
     movd    mm1, [parm3q+parm4q]
+    pavgb   mm0, [r10]
     pavgb   mm1, [r10+r11]
     movd    [parm1q], mm0
     movd    [parm1q+parm2q], mm1
-    dec     rax
-    dec     rax
+    sub     eax, 2
     lea     parm3q, [parm3q+parm4q*2]
     lea     r10, [r10+r11*2]
     lea     parm1q, [parm1q+parm2q*2]
     jne     .height_loop
-
-    ret
+    rep ret
 
 
 
@@ -115,23 +113,24 @@ ALIGN 16
 ;                                int i_height );
 ;-----------------------------------------------------------------------------
 x264_pixel_avg_w8_mmxext:
-    mov     r10, parm5q         ; src2
     movsxd  r11, parm6d         ; i_src2_stride
-    movsxd  rax, parm7d         ; i_height
+    mov     eax, parm7d         ; i_height
 
 ALIGN 4
 .height_loop
     movq    mm0, [parm3q]
+    movq    mm1, [parm3q+parm4q]
     pavgb   mm0, [r10]
+    pavgb   mm1, [r10+r11]
     movq    [parm1q], mm0
-    dec     rax
-    lea     parm3q, [parm3q+parm4q]
-    lea     r10, [r10+r11]
-    lea     parm1q, [parm1q+parm2q]
+    movq    [parm1q+parm2q], mm1
+    sub     eax, 2
+    lea     parm3q, [parm3q+parm4q*2]
+    lea     r10, [r10+r11*2]
+    lea     parm1q, [parm1q+parm2q*2]
     jne     .height_loop
-
-    ret
+    rep ret
 
 ALIGN 16
 ;-----------------------------------------------------------------------------
@@ -143,7 +142,7 @@ ALIGN 16
 x264_pixel_avg_w16_mmxext:
     mov     r10, parm5q         ; src2
     movsxd  r11, parm6d         ; i_src2_stride
-    movsxd  rax, parm7d         ; i_height
+    mov     eax, parm7d         ; i_height
 
 ALIGN 4
 .height_loop
@@ -153,13 +152,12 @@ ALIGN 4
     pavgb   mm1, [r10+8]
     movq    [parm1q  ], mm0
     movq    [parm1q+8], mm1
-    dec     rax
+    dec     eax
     lea     parm3q, [parm3q+parm4q]
     lea     r10, [r10+r11]
     lea     parm1q, [parm1q+parm2q]
     jne     .height_loop
-
-    ret
+    rep ret
 
 ALIGN 16
 ;-----------------------------------------------------------------------------
@@ -171,21 +169,19 @@ ALIGN 16
 x264_pixel_avg_w16_sse2:
     mov     r10, parm5q         ; src2
     movsxd  r11, parm6d         ; i_src2_stride
-    movsxd  rax, parm7d         ; i_height
+    mov     eax, parm7d         ; i_height
 
 ALIGN 4
 .height_loop
     movdqu  xmm0, [parm3q]
     pavgb   xmm0, [r10]
     movdqu  [parm1q], xmm0
-    dec     rax
     lea     parm3q, [parm3q+parm4q]
     lea     r10, [r10+r11]
     lea     parm1q, [parm1q+parm2q]
     jne     .height_loop
-
-    ret
+    rep ret
 
 
 
@@ -246,7 +242,7 @@ x264_pixel_avg_weight_w16_mmxext:
     add     parm3q, parm4q
     dec     r11d
     jnz     .height_loop
-    ret
+    rep ret
 
 ALIGN 16
 ;-----------------------------------------------------------------------------
@@ -255,16 +251,14 @@ ALIGN 16
 x264_pixel_avg_weight_w8_mmxext:
     BIWEIGHT_START_MMX
 .height_loop
-    BIWEIGHT_4P_MMX  [parm1q         ], [parm3q         ]
-    BIWEIGHT_4P_MMX  [parm1q+4       ], [parm3q+4       ]
-    BIWEIGHT_4P_MMX  [parm1q+parm2q  ], [parm3q+parm4q  ]
-    BIWEIGHT_4P_MMX  [parm1q+parm2q+4], [parm3q+parm4q+4]
+    BIWEIGHT_4P_MMX  [parm1q  ], [parm3q  ]
+    BIWEIGHT_4P_MMX  [parm1q+4], [parm3q+4]
 
-    lea     parm1q, [parm1q+parm2q*2]
-    lea     parm3q, [parm3q+parm4q*2]
-    sub     r11d, byte 2
+    add     parm1q, parm2q
+    add     parm3q, parm4q
+    dec     r11d
     jnz     .height_loop
-    ret
+    rep ret
 
 ALIGN 16
 ;-----------------------------------------------------------------------------
@@ -305,8 +299,7 @@ ALIGN 4
     dec     eax
     dec     eax
     jne     .height_loop
-
-    ret
+    rep ret
 
 ALIGN 16
 ;-----------------------------------------------------------------------------
@@ -334,8 +327,7 @@ ALIGN 4
     sub     eax, byte 4
     jnz     .height_loop
-
-    ret
+    rep ret
 
 ALIGN 16
 ;-----------------------------------------------------------------------------
@@ -370,8 +362,7 @@ ALIGN 4
     lea     parm1q, [parm1q+parm2q*4]
     sub     eax, byte 4
     jnz     .height_loop
-
-    ret
+    rep ret
 
 ALIGN 16
@@ -387,13 +378,11 @@ ALIGN 4
     movdqu  xmm1, [parm3q+parm4q]
     movdqu  [parm1q], xmm0
     movdqu  [parm1q+parm2q], xmm1
-    dec     eax
-    dec     eax
+    sub     eax, byte 2
     lea     parm3q, [parm3q+parm4q*2]
     lea     parm1q, [parm1q+parm2q*2]
     jnz     .height_loop
-
-    ret
+    rep ret
-- 
2.40.0
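
[Editorial sketch, not part of the commit: a standalone NASM function showing the loop shape this patch converges on. The name avg_w8_rows, the register assignments, and the single shared stride are illustrative inventions; x264's real functions take separate dst/src strides through its parmNq macros. The recurring `rep ret` is the two-byte return (F3 C3) commonly recommended for AMD K8, where a one-byte RET that immediately follows a conditional branch, or that is itself a branch target, predicts poorly. Likewise, `mov eax, parm7d` can stand in for `movsxd rax, parm7d` because a 32-bit write zero-extends into the full 64-bit register and i_height is never negative.]

    ; avg_w8_rows: average two 8-pixel rows of src1 and src2 into dst.
    ; Illustrative SysV amd64 args: rdi = dst, rsi = src1, rdx = src2,
    ; rcx = stride shared by all three, r8d = height (assumed even, nonzero).
    ; Assemble with: nasm -f elf64 avg.asm
    section .text
    global avg_w8_rows
    avg_w8_rows:
    .height_loop:
        movq    mm0, [rsi]          ; load row 0 of src1
        movq    mm1, [rsi+rcx]      ; load row 1 of src1
        pavgb   mm0, [rdx]          ; average with the matching rows of src2
        pavgb   mm1, [rdx+rcx]
        movq    [rdi], mm0          ; store both averaged rows
        movq    [rdi+rcx], mm1
        sub     r8d, 2              ; one flag-setting SUB replaces two DECs
        lea     rsi, [rsi+rcx*2]    ; LEA leaves EFLAGS untouched, so the
        lea     rdx, [rdx+rcx*2]    ; JNE below still tests the SUB result
        lea     rdi, [rdi+rcx*2]
        jne     .height_loop
        emms                        ; clear MMX state before returning to C
        rep ret                     ; 2-byte RET: sidesteps the K8 RET stall

Unrolling to two rows per iteration halves the loop overhead (one counter update and one branch per two rows), which is where the commit's small overall speedup comes from.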