]> granicus.if.org Git - openssl/commitdiff
aesni-sha1-x86_64.pl: harmonize [Atom-specific optimizations] with master branch.
authorAndy Polyakov <appro@openssl.org>
Sat, 4 Jan 2014 16:40:13 +0000 (17:40 +0100)
committerAndy Polyakov <appro@openssl.org>
Sat, 4 Jan 2014 16:42:13 +0000 (17:42 +0100)
crypto/aes/asm/aesni-sha1-x86_64.pl
crypto/sha/asm/sha1-x86_64.pl

index 2d1a222ac246221fcfc6b65b08c242cec34b5e0b..9f71d8b2122f9ceab7fddf35bb3080592a1f291c 100644 (file)
 # subroutine:
 #
 #              AES-128-CBC     +SHA1           stitch      gain
-# Westmere     3.77[+5.5]      9.26            6.58        +41%
-# Sandy Bridge 5.05[+5.0(6.2)] 10.06(11.21)    5.98(7.05)  +68%(+59%)
+# Westmere     3.77[+5.5]      9.26            6.66        +39%
+# Sandy Bridge 5.05[+5.0(6.2)] 10.06(11.21)    5.98(7.01)  +68%(+60%)
 # Ivy Bridge   5.05[+4.6]      9.65            5.54        +74%
-# Haswell      4.43[+3.6(4.4)] 8.00(8.80)      4.55(5.21)  +75%(+69%)
+# Haswell      4.43[+3.6(4.1)] 8.00(8.55)      4.55(5.21)  +75%(+64%)
 # Bulldozer    5.77[+6.0]      11.72           6.37        +84%
 #
 #              AES-192-CBC
-# Westmere     4.51            10.00           6.87        +46%
-# Sandy Bridge 6.05            11.06(12.21)    6.11(7.20)  +81%(+70%)
+# Westmere     4.51            10.00           6.91        +45%
+# Sandy Bridge 6.05            11.06(12.21)    6.11(7.18)  +81%(+70%)
 # Ivy Bridge   6.05            10.65           6.07        +75%
-# Haswell      5.29            8.86(9.65)      5.32(5.32)  +67%(+81%)
+# Haswell      5.29            8.86(9.42)      5.32(5.32)  +67%(+77%)
 # Bulldozer    6.89            12.84           6.96        +84%
 #
 #              AES-256-CBC
-# Westmere     5.25            10.74           7.19        +49%
-# Sandy Bridge 7.05            12.06(13.21)    7.12(7.68)  +69%(+72%)
+# Westmere     5.25            10.74           7.24        +48%
+# Sandy Bridge 7.05            12.06(13.21)    7.12(7.63)  +69%(+73%)
 # Ivy Bridge   7.05            11.65           7.12        +64%
-# Haswell      6.19            9.76(10.6)      6.21(6.41)  +57%(+65%)
+# Haswell      6.19            9.76(10.3)      6.21(6.25)  +57%(+65%)
 # Bulldozer    8.00            13.95           8.25        +69%
 #
 # (*)  There are two code paths: SSSE3 and AVX. See sha1-568.pl for
@@ -129,10 +129,13 @@ my $K_XX_XX="%r11";
 my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13));
 my @rndkey=("%xmm14","%xmm15");
 
-if (1) {
-    @X=map("%xmm$_",(4..11));
-    @Tx=map("%xmm$_",(12..14));
-    ($iv,$in,$rndkey0)=map("%xmm$_",(2,3,15));
+if (1) {       # reassign for Atom Silvermont
+    # The goal is to minimize amount of instructions with more than
+    # 3 prefix bytes. Or in more practical terms to keep AES-NI *and*
+    # SSSE3 instructions to upper half of the register bank.
+    @X=map("%xmm$_",(8..11,4..7));
+    @Tx=map("%xmm$_",(12,13,3));
+    ($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
     @rndkey=("%xmm0","%xmm1");
 }
 
@@ -203,17 +206,17 @@ $code.=<<___;
        xor     $D,@T[1]
        and     @T[1],@T[0]
 
-       movdqa  64($K_XX_XX),@X[2]      # pbswap mask
+       movdqa  64($K_XX_XX),@Tx[2]     # pbswap mask
        movdqa  0($K_XX_XX),@Tx[1]      # K_00_19
        movdqu  0($inp),@X[-4&7]        # load input to %xmm[0-3]
        movdqu  16($inp),@X[-3&7]
        movdqu  32($inp),@X[-2&7]
        movdqu  48($inp),@X[-1&7]
-       pshufb  @X[2],@X[-4&7]          # byte swap
+       pshufb  @Tx[2],@X[-4&7]         # byte swap
        add     \$64,$inp
-       pshufb  @X[2],@X[-3&7]
-       pshufb  @X[2],@X[-2&7]
-       pshufb  @X[2],@X[-1&7]
+       pshufb  @Tx[2],@X[-3&7]
+       pshufb  @Tx[2],@X[-2&7]
+       pshufb  @Tx[2],@X[-1&7]
        paddd   @Tx[1],@X[-4&7]         # add K_00_19
        paddd   @Tx[1],@X[-3&7]
        paddd   @Tx[1],@X[-2&7]
@@ -277,11 +280,11 @@ sub Xupdate_ssse3_16_31()         # recall that $Xi starts wtih 4
   my @insns = (&$body,&$body,&$body,&$body);   # 40 instructions
   my ($a,$b,$c,$d,$e);
 
-       &movdqa (@X[0],@X[-3&7]);
+       &pshufd (@X[0],@X[-4&7],0xee);  # was &movdqa(@X[0],@X[-3&7]);
         eval(shift(@insns));
         eval(shift(@insns));
        &movdqa (@Tx[0],@X[-1&7]);
-       &palignr(@X[0],@X[-4&7],8);     # compose "X[-14]" in "X[0]"
+       &punpcklqdq(@X[0],@X[-3&7]);    # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
         eval(shift(@insns));
         eval(shift(@insns));
 
@@ -358,10 +361,10 @@ sub Xupdate_ssse3_32_79()
   my @insns = (&$body,&$body,&$body,&$body);   # 32 to 48 instructions
   my ($a,$b,$c,$d,$e);
 
-       &movdqa (@Tx[0],@X[-1&7])       if ($Xi==8);
+       &pshufd (@Tx[0],@X[-2&7],0xee)  if ($Xi==8);    # was &movdqa   (@Tx[0],@X[-1&7])
         eval(shift(@insns));           # body_20_39
        &pxor   (@X[0],@X[-4&7]);       # "X[0]"="X[-32]"^"X[-16]"
-       &palignr(@Tx[0],@X[-2&7],8);    # compose "X[-6]"
+       &punpcklqdq(@Tx[0],@X[-1&7]);   # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));           # rol
@@ -405,7 +408,7 @@ sub Xupdate_ssse3_32_79()
        &por    (@X[0],@Tx[0]);         # "X[0]"<<<=2
         eval(shift(@insns));           # body_20_39
         eval(shift(@insns));
-         &movdqa       (@Tx[1],@X[0])  if ($Xi<19);
+         &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19);    # was &movdqa   (@Tx[1],@X[0])
         eval(shift(@insns));
         eval(shift(@insns));           # rol
         eval(shift(@insns));
@@ -441,13 +444,13 @@ sub Xuplast_ssse3_80()
 
        unshift(@Tx,pop(@Tx));
 
-       &movdqa (@X[2],"64($K_XX_XX)");         # pbswap mask
+       &movdqa (@Tx[2],"64($K_XX_XX)");        # pbswap mask
        &movdqa (@Tx[1],"0($K_XX_XX)");         # K_00_19
        &movdqu (@X[-4&7],"0($inp)");           # load input
        &movdqu (@X[-3&7],"16($inp)");
        &movdqu (@X[-2&7],"32($inp)");
        &movdqu (@X[-1&7],"48($inp)");
-       &pshufb (@X[-4&7],@X[2]);               # byte swap
+       &pshufb (@X[-4&7],@Tx[2]);              # byte swap
        &add    ($inp,64);
 
   $Xi=0;
@@ -461,7 +464,7 @@ sub Xloop_ssse3()
 
         eval(shift(@insns));
         eval(shift(@insns));
-       &pshufb (@X[($Xi-3)&7],@X[2]);
+       &pshufb (@X[($Xi-3)&7],@Tx[2]);
         eval(shift(@insns));
         eval(shift(@insns));
        &paddd  (@X[($Xi-4)&7],@Tx[1]);
index b128913dbf4390ad696cae6151ffa02ad1d873b0..71d5c12540cbe4e267382a9fddabd77922653108 100755 (executable)
@@ -68,7 +68,7 @@
 # Westmere     7.08            5.44/+30%       -
 # Sandy Bridge 7.93            6.16/+28%       4.99/+59%
 # Ivy Bridge   6.30            4.63/+36%       4.60/+37%
-# Haswell      5.98            4.36/+37%       3.57/+67%
+# Haswell      5.98            4.12/+45%       3.57/+67%
 # Bulldozer    10.9            5.95/+82%
 # VIA Nano     10.2            7.46/+37%
 # Atom         11.0            9.61/+14%