aes/asm/aesni-x86[_64].pl: minor Atom-specific performance tweak.

author Andy Polyakov <appro@openssl.org>

Fri, 21 Feb 2014 11:14:04 +0000 (12:14 +0100)

committer Andy Polyakov <appro@openssl.org>

Fri, 21 Feb 2014 11:15:07 +0000 (12:15 +0100)
author Andy Polyakov <appro@openssl.org>
Fri, 21 Feb 2014 11:14:04 +0000 (12:14 +0100)
committer Andy Polyakov <appro@openssl.org>
Fri, 21 Feb 2014 11:15:07 +0000 (12:15 +0100)
diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl

index c3df97db7b1421a42fc721441525af09f9848ba1..3deb86aed636e11e8ad9136cd8db4c71c27b0ebd 100644 (file)
--- a/crypto/aes/asm/aesni-x86.pl
+++ b/crypto/aes/asm/aesni-x86.pl
@@ -207,12 +207,45 @@ sub aesni_generate1       # fully unrolled loop
  # every *2nd* cycle. Thus 3x interleave was the one providing optimal
  # utilization, i.e. when subroutine's throughput is virtually same as
  # of non-interleaved subroutine [for number of input blocks up to 3].
-# This is why it makes no sense to implement 2x subroutine.
-# aes[enc|dec] latency in next processor generation is 8, but the
-# instructions can be scheduled every cycle. Optimal interleave for
-# new processor is therefore 8x, but it's unfeasible to accommodate it
-# in XMM registers addreassable in 32-bit mode and therefore 6x is
-# used instead...
+# This is why it originally made no sense to implement a 2x subroutine.
+# But times change, and it became appropriate to spend an extra 192
+# bytes on a 2x subroutine for the sake of Atom Silvermont. For
+# processors that can schedule aes[enc|dec] every cycle, the optimal
+# interleave factor equals the latency of those instructions. 8x is
+# optimal for * Bridge, but it's unfeasible to accommodate such an
+# implementation in the XMM registers addressable in 32-bit mode, and
+# therefore a maximum of 6x is used instead...
+
+sub aesni_generate2       # 2x interleave: runs $inout0 and $inout1 side by side
+{ my $p=shift;            # "enc" or "dec"; selects aes${p}/aes${p}last and the emitted name
+
+    &function_begin_B("_aesni_${p}rypt2");
+       &$movekey       ($rndkey0,&QWP(0,$key));         # key material at offset 0
+       &shl            ($rounds,4);                     # $rounds *= 16
+       &$movekey       ($rndkey1,&QWP(16,$key));        # key material at offset 16
+       &xorps          ($inout0,$rndkey0);              # mix the offset-0 key into both blocks
+       &pxor           ($inout1,$rndkey0);
+       &$movekey       ($rndkey0,&QWP(32,$key));        # key material at offset 32
+       &lea            ($key,&DWP(32,$key,$rounds));    # $key += 32 + $rounds; loads below index back from here
+       &neg            ($rounds);
+       &add            ($rounds,16);                    # $rounds = 16 - (scaled $rounds); stepped by 32 in the loop
+
+    &set_label("${p}2_loop");                           # each pass applies $rndkey1 then $rndkey0 to both blocks
+       eval"&aes${p}   ($inout0,$rndkey1)";
+       eval"&aes${p}   ($inout1,$rndkey1)";
+       &$movekey       ($rndkey1,&QWP(0,$key,$rounds)); # refill $rndkey1 once both blocks have consumed it
+       &add            ($rounds,32);                    # advance the index; its zero/non-zero result drives jnz
+       eval"&aes${p}   ($inout0,$rndkey0)";
+       eval"&aes${p}   ($inout1,$rndkey0)";
+       &$movekey       ($rndkey0,&QWP(-16,$key,$rounds)); # 16 bytes past the $rndkey1 load above ($rounds already +32)
+       &jnz            (&label("${p}2_loop"));          # repeat until the add hit zero (assumes $movekey/aes${p} leave flags alone - confirm)
+    eval"&aes${p}      ($inout0,$rndkey1)";              # one more aes${p} step with the last $rndkey1 loaded
+    eval"&aes${p}      ($inout1,$rndkey1)";
+    eval"&aes${p}last  ($inout0,$rndkey0)";              # finish both blocks with the aes${p}last form
+    eval"&aes${p}last  ($inout1,$rndkey0)";
+    &ret();
+    &function_end_B("_aesni_${p}rypt2");
+}
  
  sub aesni_generate3
  { my $p=shift;
@@ -357,6 +390,8 @@ sub aesni_generate6
      &ret();
      &function_end_B("_aesni_${p}rypt6");
  }
+&aesni_generate2("enc") if ($PREFIX eq "aesni");
+&aesni_generate2("dec");
  &aesni_generate3("enc") if ($PREFIX eq "aesni");
  &aesni_generate3("dec");
  &aesni_generate4("enc") if ($PREFIX eq "aesni");
@@ -460,8 +495,7 @@ if ($PREFIX eq "aesni") {
         &jmp    (&label("ecb_ret"));
  
  &set_label("ecb_enc_two",16);
-       &xorps  ($inout2,$inout2);
-       &call   ("_aesni_encrypt3");
+       &call   ("_aesni_encrypt2");
         &movups (&QWP(0,$out),$inout0);
         &movups (&QWP(0x10,$out),$inout1);
         &jmp    (&label("ecb_ret"));
@@ -561,8 +595,7 @@ if ($PREFIX eq "aesni") {
         &jmp    (&label("ecb_ret"));
  
  &set_label("ecb_dec_two",16);
-       &xorps  ($inout2,$inout2);
-       &call   ("_aesni_decrypt3");
+       &call   ("_aesni_decrypt2");
         &movups (&QWP(0,$out),$inout0);
         &movups (&QWP(0x10,$out),$inout1);
         &jmp    (&label("ecb_ret"));
@@ -982,7 +1015,7 @@ if ($PREFIX eq "aesni") {
         &jmp    (&label("ctr32_ret"));
  
  &set_label("ctr32_two",16);
-       &call   ("_aesni_encrypt3");
+       &call   ("_aesni_encrypt2");
         &movups ($inout3,&QWP(0,$inp));
         &movups ($inout4,&QWP(0x10,$inp));
         &xorps  ($inout0,$inout3);
@@ -1253,9 +1286,8 @@ if ($PREFIX eq "aesni") {
         &lea    ($inp,&DWP(16*2,$inp));
         &xorps  ($inout0,$inout3);              # input^=tweak
         &xorps  ($inout1,$inout4);
-       &xorps  ($inout2,$inout2);
  
-       &call   ("_aesni_encrypt3");
+       &call   ("_aesni_encrypt2");
  
         &xorps  ($inout0,$inout3);              # output^=tweak
         &xorps  ($inout1,$inout4);
@@ -1596,7 +1628,7 @@ if ($PREFIX eq "aesni") {
         &xorps  ($inout0,$inout3);              # input^=tweak
         &xorps  ($inout1,$inout4);
  
-       &call   ("_aesni_decrypt3");
+       &call   ("_aesni_decrypt2");
  
         &xorps  ($inout0,$inout3);              # output^=tweak
         &xorps  ($inout1,$inout4);
@@ -1896,8 +1928,7 @@ if ($PREFIX eq "aesni") {
         &jmp    (&label("cbc_dec_tail_collected"));
  
  &set_label("cbc_dec_two",16);
-       &xorps  ($inout2,$inout2);
-       &call   ("_aesni_decrypt3");
+       &call   ("_aesni_decrypt2");
         &xorps  ($inout0,$ivec);
         &xorps  ($inout1,$in0);
         &movups (&QWP(0,$out),$inout0);
diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl

index 708fabd3de364c6dbc1f27fbd581f575df78138a..31c80ae6bc53a30995483712c2a850b76cd94ac4 100644 (file)
--- a/crypto/aes/asm/aesni-x86_64.pl
+++ b/crypto/aes/asm/aesni-x86_64.pl
@@ -288,10 +288,49 @@ ___
  # every *2nd* cycle. Thus 3x interleave was the one providing optimal
  # utilization, i.e. when subroutine's throughput is virtually same as
  # of non-interleaved subroutine [for number of input blocks up to 3].
-# This is why it makes no sense to implement 2x subroutine.
-# aes[enc|dec] latency in next processor generation is 8, but the
-# instructions can be scheduled every cycle. Optimal interleave for
-# new processor is therefore 8x...
+# This is why it originally made no sense to implement a 2x subroutine.
+# But times change, and it became appropriate to spend an extra 192
+# bytes on a 2x subroutine for the sake of Atom Silvermont. For
+# processors that can schedule aes[enc|dec] every cycle, the optimal
+# interleave factor equals the latency of those instructions. 8x is
+# optimal for * Bridge and "super-optimal" for other Intel CPUs...
+
+sub aesni_generate2 {	# 2x interleave: appends _aesni_${dir}rypt2 to $code
+my $dir=shift;		# "enc" or "dec"; picks aes${dir}/aes${dir}last and the symbol name
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-1] is cipher/clear text, transformed in place...
+$code.=<<___;
+.type  _aesni_${dir}rypt2,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt2:
+       $movkey ($key),$rndkey0
+       shl     \$4,$rounds
+       $movkey 16($key),$rndkey1
+       xorps   $rndkey0,$inout0
+       xorps   $rndkey0,$inout1
+       $movkey 32($key),$rndkey0
+       lea     32($key,$rounds),$key
+       neg     %rax                            # $rounds
+       add     \$16,%rax
+
+.L${dir}_loop2:
+       aes${dir}       $rndkey1,$inout0
+       aes${dir}       $rndkey1,$inout1
+       $movkey         ($key,%rax),$rndkey1
+       add             \$32,%rax
+       aes${dir}       $rndkey0,$inout0
+       aes${dir}       $rndkey0,$inout1
+       $movkey         -16($key,%rax),$rndkey0
+       jnz             .L${dir}_loop2
+
+       aes${dir}       $rndkey1,$inout0
+       aes${dir}       $rndkey1,$inout1
+       aes${dir}last   $rndkey0,$inout0
+       aes${dir}last   $rndkey0,$inout1
+       ret
+.size  _aesni_${dir}rypt2,.-_aesni_${dir}rypt2
+___
+}
  sub aesni_generate3 {
  my $dir=shift;
  # As already mentioned it takes in $key and $rounds, which are *not*
@@ -524,6 +563,8 @@ _aesni_${dir}rypt8:
  .size  _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
  ___
  }
+&aesni_generate2("enc") if ($PREFIX eq "aesni");
+&aesni_generate2("dec");
  &aesni_generate3("enc") if ($PREFIX eq "aesni");
  &aesni_generate3("dec");
  &aesni_generate4("enc") if ($PREFIX eq "aesni");
@@ -645,8 +686,7 @@ $code.=<<___;
         jmp     .Lecb_ret
  .align 16
  .Lecb_enc_two:
-       xorps   $inout2,$inout2
-       call    _aesni_encrypt3
+       call    _aesni_encrypt2
         movups  $inout0,($out)
         movups  $inout1,0x10($out)
         jmp     .Lecb_ret
@@ -782,8 +822,7 @@ $code.=<<___;
         jmp     .Lecb_ret
  .align 16
  .Lecb_dec_two:
-       xorps   $inout2,$inout2
-       call    _aesni_decrypt3
+       call    _aesni_decrypt2
         movups  $inout0,($out)
         movups  $inout1,0x10($out)
         jmp     .Lecb_ret
@@ -1875,7 +1914,7 @@ $code.=<<___;
         xorps   @tweak[0],$inout0
         xorps   @tweak[1],$inout1
  
-       call    _aesni_encrypt3
+       call    _aesni_encrypt2
  
         xorps   @tweak[0],$inout0
         movdqa  @tweak[2],@tweak[0]
@@ -2322,7 +2361,7 @@ $code.=<<___;
         xorps   @tweak[0],$inout0
         xorps   @tweak[1],$inout1
  
-       call    _aesni_decrypt3
+       call    _aesni_decrypt2
  
         xorps   @tweak[0],$inout0
         movdqa  @tweak[2],@tweak[0]
@@ -2831,8 +2870,7 @@ $code.=<<___;
  .align 16
  .Lcbc_dec_two:
         movaps  $inout1,$in1
-       xorps   $inout2,$inout2
-       call    _aesni_decrypt3
+       call    _aesni_decrypt2
         pxor    $iv,$inout0
         movaps  $in1,$iv
         pxor    $in0,$inout1
author	Andy Polyakov <appro@openssl.org>
	Fri, 21 Feb 2014 11:14:04 +0000 (12:14 +0100)
committer	Andy Polyakov <appro@openssl.org>
	Fri, 21 Feb 2014 11:15:07 +0000 (12:15 +0100)
crypto/aes/asm/aesni-x86.pl		patch \| blob \| history
crypto/aes/asm/aesni-x86_64.pl		patch \| blob \| history