From 69f45c520ce3cba61480babf55f38a66b894c9ea Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Sun, 15 Jul 2012 20:33:30 +0000 Subject: [PATCH] sha1-[586|x86_64].pl: shave off one instruction from body_40_59, it's 2% fewer instructions in SIMD code paths, so 2% improvement on average:-) --- crypto/sha/asm/sha1-586.pl | 29 +++++++++++++++-------------- crypto/sha/asm/sha1-x86_64.pl | 29 +++++++++++++++-------------- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/crypto/sha/asm/sha1-586.pl b/crypto/sha/asm/sha1-586.pl index a9dbeae694..8d6774648f 100644 --- a/crypto/sha/asm/sha1-586.pl +++ b/crypto/sha/asm/sha1-586.pl @@ -89,12 +89,12 @@ # P4 10.6 - # AMD K8 7.1 - # Core2 7.3 6.1/+20% - -# Atom 12.5 9.5(*)/+32% - -# Westmere 7.3 5.6/+30% - -# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+70% -# Ivy Bridge 7.2 4.9/+47% 4.8(**)/+50% -# Bulldozer 11.6 6.2/+88% -# VIA Nano 10.6 7.5/+41% +# Atom 12.5 9.3(*)/+35% - +# Westmere 7.3 5.5/+33% - +# Sandy Bridge 8.8 6.2/+40% 5.2(**)/+70% +# Ivy Bridge 7.2 4.8/+51% 4.7(**)/+53% +# Bulldozer 11.6 6.0/+92% +# VIA Nano 10.6 7.6/+40% # # (*) Loop is 1056 instructions long and expected result is ~8.25. # It remains mystery [to me] why ILP is limited to 1.7. @@ -616,7 +616,7 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 sub Xupdate_ssse3_32_79() { use integer; my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions + my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions my ($a,$b,$c,$d,$e); &movdqa (@X[2],@X[-1&7]) if ($Xi==8); @@ -783,17 +783,16 @@ sub body_20_39 () { sub body_40_59 () { ( '($a,$b,$c,$d,$e)=@V;'. 
- '&mov (@T[1],$c);', - '&xor ($c,$d);', + '&xor (@T[0],$c);', + '&xor (@T[1],$d);', '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer - '&and (@T[1],$d);', - '&and (@T[0],$c);', # ($b&($c^$d)) + '&and (@T[0],@T[1]);', '&$_ror ($b,7);', # $b>>>2 - '&add ($e,@T[1]);', + '&xor (@T[0],$c);', '&mov (@T[1],$a);', # $b in next round '&$_rol ($a,5);', '&add ($e,@T[0]);', - '&xor ($c,$d);', # restore $c + '&mov (@T[0],$b);', # copy of $c in next round '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } @@ -809,6 +808,7 @@ sub body_40_59 () { &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_20_39); + &mov (@T[1],@V[2]); # copy of $c in next round &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_40_59); @@ -1032,7 +1032,7 @@ sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 sub Xupdate_avx_32_79() { use integer; my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions + my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions my ($a,$b,$c,$d,$e); &vpalignr(@X[2],@X[-1&7],@X[-2&7],8); # compose "X[-6]" @@ -1173,6 +1173,7 @@ sub Xtail_avx() &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_20_39); + &mov (@T[1],@V[2]); # copy of $c in next round &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_40_59); diff --git a/crypto/sha/asm/sha1-x86_64.pl b/crypto/sha/asm/sha1-x86_64.pl index 05b2234a62..96350413be 100755 --- a/crypto/sha/asm/sha1-x86_64.pl +++ b/crypto/sha/asm/sha1-x86_64.pl @@ -56,12 +56,12 @@ # x86_64 SSSE3 AVX # P4 9.8 - # Opteron 6.6 - -# Core2 6.7 6.1/+10% - -# Atom 11.0 9.7/+13% - -# Westmere 7.1 5.6/+27% - -# Sandy Bridge 7.9 6.3/+25% 5.2/+51% -# Ivy Bridge 6.4 4.8/+33% 4.7/+36% -# Bulldozer 10.9 6.1/+79% +# Core2 6.7 6.2/+8% - +# Atom 11.0 9.5/+15% - +# Westmere 7.1 5.5/+29% - +# Sandy Bridge 7.9 6.2/+28% 5.1/+54% 
+# Ivy Bridge 6.4 4.7/+35% 4.6/+37% +# Bulldozer 10.9 6.0/+82% # VIA Nano 10.2 7.4/+38% $flavour = shift; @@ -453,7 +453,7 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 sub Xupdate_ssse3_32_79() { use integer; my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions + my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions my ($a,$b,$c,$d,$e); &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8); @@ -618,17 +618,16 @@ sub body_20_39 () { sub body_40_59 () { ( '($a,$b,$c,$d,$e)=@V;'. - '&mov (@T[1],$c);', - '&xor ($c,$d);', + '&xor (@T[0],$c);', + '&xor (@T[1],$d);', '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer - '&and (@T[1],$d);', - '&and (@T[0],$c);', # ($b&($c^$d)) + '&and (@T[0],$T[1]);', '&$_ror ($b,7);', # $b>>>2 - '&add ($e,@T[1]);', + '&xor (@T[0],$c);', '&mov (@T[1],$a);', # $b in next round '&$_rol ($a,5);', '&add ($e,@T[0]);', - '&xor ($c,$d);', # restore $c + '&mov (@T[0],$b);', # copy of $c in next round '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } @@ -646,6 +645,7 @@ ___ &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_20_39); + &mov (@T[1],@V[2]); # copy of $c in next round &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_40_59); @@ -859,7 +859,7 @@ sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 sub Xupdate_avx_32_79() { use integer; my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions + my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions my ($a,$b,$c,$d,$e); &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]" @@ -1002,6 +1002,7 @@ ___ &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_20_39); + &mov (@T[1],@V[2]); # copy of $c in next round &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_40_59); -- 2.40.0