From: Denys Vlasenko
Date: Tue, 10 Feb 2015 17:30:56 +0000 (+0100)
Subject: x86_64/memset: simple optimizations
X-Git-Tag: v1.1.7~53
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=bf2071eda32528ee8b0bb89544152646684a2cf3;p=musl

x86_64/memset: simple optimizations

"and $0xff,%esi" is a six-byte insn (81 e6 ff 00 00 00); the
4-byte "movzbl %sil,%esi" (40 0f b6 f6) can be used instead.

64-bit imul is slow, move it as far up as possible so that the result
(rax) has more time to be ready by the time we start using it
in mem stores.

There is no need to shuffle registers in preparation for "rep stos"
if we are not going to take that code path. Thus, this patch moves
"jump if len < 16" instructions up, and changes alternate code path
to use rdx and rdi instead of rcx and r8.

Signed-off-by: Denys Vlasenko
---

diff --git a/src/string/x86_64/memset.s b/src/string/x86_64/memset.s
index fc06eef8..263336b5 100644
--- a/src/string/x86_64/memset.s
+++ b/src/string/x86_64/memset.s
@@ -1,41 +1,43 @@
 .global memset
 .type memset,@function
 memset:
-	and $0xff,%esi
+	movzbl %sil,%esi
 	mov $0x101010101010101,%rax
-	mov %rdx,%rcx
-	mov %rdi,%r8
+	# 64-bit imul has 3-7 cycles latency, launch early
 	imul %rsi,%rax
-	cmp $16,%rcx
+
+	cmp $16,%rdx
 	jb 1f
 
-	mov %rax,-8(%rdi,%rcx)
+	mov %rdx,%rcx
+	mov %rdi,%r8
 	shr $3,%rcx
+	mov %rax,-8(%rdi,%rdx)
 	rep
 	stosq
 	mov %r8,%rax
 	ret
 
-1:	test %ecx,%ecx
+1:	test %edx,%edx
 	jz 1f
 
 	mov %al,(%rdi)
-	mov %al,-1(%rdi,%rcx)
-	cmp $2,%ecx
+	mov %al,-1(%rdi,%rdx)
+	cmp $2,%edx
 	jbe 1f
 
 	mov %al,1(%rdi)
-	mov %al,-2(%rdi,%rcx)
-	cmp $4,%ecx
+	mov %al,-2(%rdi,%rdx)
+	cmp $4,%edx
 	jbe 1f
 
 	mov %eax,(%rdi)
-	mov %eax,-4(%rdi,%rcx)
-	cmp $8,%ecx
+	mov %eax,-4(%rdi,%rdx)
+	cmp $8,%edx
 	jbe 1f
 
 	mov %eax,4(%rdi)
-	mov %eax,-8(%rdi,%rcx)
+	mov %eax,-8(%rdi,%rdx)
 
-1:	mov %r8,%rax
+1:	mov %rdi,%rax
 	ret