From: Andy Polyakov Date: Sun, 15 Jul 2012 13:37:35 +0000 (+0000) Subject: wp-x86_64.pl: ~10% performance improvement. X-Git-Tag: master-pre-reformat~1703 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=701d593f7095db84459c76265349a83d30a4cae5;p=openssl wp-x86_64.pl: ~10% performance improvement. --- diff --git a/crypto/whrlpool/asm/wp-x86_64.pl b/crypto/whrlpool/asm/wp-x86_64.pl index 9be1cc2238..855aabc892 100644 --- a/crypto/whrlpool/asm/wp-x86_64.pl +++ b/crypto/whrlpool/asm/wp-x86_64.pl @@ -90,41 +90,44 @@ for($i=0;$i<8;$i++) { $code.="mov @mm[$i],64+$i*8(%rsp)\n"; } # S=L $code.=<<___; xor %rsi,%rsi mov %rsi,24(%rbx) # zero round counter + jmp .Lround .align 16 .Lround: mov 4096(%rbp,%rsi,8),@mm[0] # rc[r] mov 0(%rsp),%eax mov 4(%rsp),%ebx + movz %al,%ecx + movz %ah,%edx ___ for($i=0;$i<8;$i++) { my $func = ($i==0)? "mov" : "xor"; $code.=<<___; - mov %al,%cl - mov %ah,%dl + shr \$16,%eax lea (%rcx,%rcx),%rsi + movz %al,%ecx lea (%rdx,%rdx),%rdi - shr \$16,%eax + movz %ah,%edx xor 0(%rbp,%rsi,8),@mm[0] $func 7(%rbp,%rdi,8),@mm[1] - mov %al,%cl - mov %ah,%dl mov $i*8+8(%rsp),%eax # ($i+1)*8 lea (%rcx,%rcx),%rsi + movz %bl,%ecx lea (%rdx,%rdx),%rdi + movz %bh,%edx $func 6(%rbp,%rsi,8),@mm[2] $func 5(%rbp,%rdi,8),@mm[3] - mov %bl,%cl - mov %bh,%dl + shr \$16,%ebx lea (%rcx,%rcx),%rsi + movz %bl,%ecx lea (%rdx,%rdx),%rdi - shr \$16,%ebx + movz %bh,%edx $func 4(%rbp,%rsi,8),@mm[4] $func 3(%rbp,%rdi,8),@mm[5] - mov %bl,%cl - mov %bh,%dl mov $i*8+8+4(%rsp),%ebx # ($i+1)*8+4 lea (%rcx,%rcx),%rsi + movz %al,%ecx lea (%rdx,%rdx),%rdi + movz %ah,%edx $func 2(%rbp,%rsi,8),@mm[6] $func 1(%rbp,%rdi,8),@mm[7] ___ @@ -133,32 +136,32 @@ ___ for($i=0;$i<8;$i++) { $code.="mov @mm[$i],$i*8(%rsp)\n"; } # K=L for($i=0;$i<8;$i++) { $code.=<<___; - mov %al,%cl - mov %ah,%dl + shr \$16,%eax lea (%rcx,%rcx),%rsi + movz %al,%ecx lea (%rdx,%rdx),%rdi - shr \$16,%eax + movz %ah,%edx xor 0(%rbp,%rsi,8),@mm[0] xor 7(%rbp,%rdi,8),@mm[1] - mov %al,%cl - mov %ah,%dl `"mov 64+$i*8+8(%rsp),%eax" if($i<7);` # 64+($i+1)*8 lea (%rcx,%rcx),%rsi + movz %bl,%ecx lea (%rdx,%rdx),%rdi + movz %bh,%edx xor 6(%rbp,%rsi,8),@mm[2] xor 5(%rbp,%rdi,8),@mm[3] - mov %bl,%cl - mov %bh,%dl + shr \$16,%ebx lea (%rcx,%rcx),%rsi + movz %bl,%ecx lea (%rdx,%rdx),%rdi - shr \$16,%ebx + movz %bh,%edx xor 4(%rbp,%rsi,8),@mm[4] xor 3(%rbp,%rdi,8),@mm[5] - mov %bl,%cl - mov %bh,%dl `"mov 64+$i*8+8+4(%rsp),%ebx" if($i<7);` # 64+($i+1)*8+4 lea (%rcx,%rcx),%rsi + movz %al,%ecx lea (%rdx,%rdx),%rdi + movz %ah,%edx xor 2(%rbp,%rsi,8),@mm[6] xor 1(%rbp,%rdi,8),@mm[7] ___