From: Andy Polyakov Date: Fri, 21 Feb 2014 11:14:04 +0000 (+0100) Subject: aes/asm/aesni-x86[_64].pl: minor Atom-specific performance tweak. X-Git-Tag: OpenSSL_1_0_2-beta1~9 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=2d4d9623da229162ad4377174526af3c01b1707a;p=openssl aes/asm/aesni-x86[_64].pl: minor Atom-specific performance tweak. (cherry picked from commit 214368ffee5736836e2dbb80a16a4fbd85f0eaf9) --- diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl index c3df97db7b..3deb86aed6 100644 --- a/crypto/aes/asm/aesni-x86.pl +++ b/crypto/aes/asm/aesni-x86.pl @@ -207,12 +207,45 @@ sub aesni_generate1 # fully unrolled loop # every *2nd* cycle. Thus 3x interleave was the one providing optimal # utilization, i.e. when subroutine's throughput is virtually same as # of non-interleaved subroutine [for number of input blocks up to 3]. -# This is why it makes no sense to implement 2x subroutine. -# aes[enc|dec] latency in next processor generation is 8, but the -# instructions can be scheduled every cycle. Optimal interleave for -# new processor is therefore 8x, but it's unfeasible to accommodate it -# in XMM registers addreassable in 32-bit mode and therefore 6x is -# used instead... +# This is why it originally made no sense to implement 2x subroutine. +# But times change and it became appropriate to spend extra 192 bytes +# on 2x subroutine on account of Atom Silvermont. For processors that +# can schedule aes[enc|dec] every cycle optimal interleave factor +# equals the corresponding instruction latency. 8x is optimal for +# * Bridge, but it's unfeasible to accommodate such implementation +# in XMM registers addressable in 32-bit mode and therefore maximum +# of 6x is used instead... 
+ +sub aesni_generate2 +{ my $p=shift; + + &function_begin_B("_aesni_${p}rypt2"); + &$movekey ($rndkey0,&QWP(0,$key)); + &shl ($rounds,4); + &$movekey ($rndkey1,&QWP(16,$key)); + &xorps ($inout0,$rndkey0); + &pxor ($inout1,$rndkey0); + &$movekey ($rndkey0,&QWP(32,$key)); + &lea ($key,&DWP(32,$key,$rounds)); + &neg ($rounds); + &add ($rounds,16); + + &set_label("${p}2_loop"); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0,$key,$rounds)); + &add ($rounds,32); + eval"&aes${p} ($inout0,$rndkey0)"; + eval"&aes${p} ($inout1,$rndkey0)"; + &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); + &jnz (&label("${p}2_loop")); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p}last ($inout0,$rndkey0)"; + eval"&aes${p}last ($inout1,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt2"); +} sub aesni_generate3 { my $p=shift; @@ -357,6 +390,8 @@ sub aesni_generate6 &ret(); &function_end_B("_aesni_${p}rypt6"); } +&aesni_generate2("enc") if ($PREFIX eq "aesni"); +&aesni_generate2("dec"); &aesni_generate3("enc") if ($PREFIX eq "aesni"); &aesni_generate3("dec"); &aesni_generate4("enc") if ($PREFIX eq "aesni"); @@ -460,8 +495,7 @@ if ($PREFIX eq "aesni") { &jmp (&label("ecb_ret")); &set_label("ecb_enc_two",16); - &xorps ($inout2,$inout2); - &call ("_aesni_encrypt3"); + &call ("_aesni_encrypt2"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &jmp (&label("ecb_ret")); @@ -561,8 +595,7 @@ if ($PREFIX eq "aesni") { &jmp (&label("ecb_ret")); &set_label("ecb_dec_two",16); - &xorps ($inout2,$inout2); - &call ("_aesni_decrypt3"); + &call ("_aesni_decrypt2"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &jmp (&label("ecb_ret")); @@ -982,7 +1015,7 @@ if ($PREFIX eq "aesni") { &jmp (&label("ctr32_ret")); &set_label("ctr32_two",16); - &call ("_aesni_encrypt3"); + &call ("_aesni_encrypt2"); &movups ($inout3,&QWP(0,$inp)); &movups ($inout4,&QWP(0x10,$inp)); &xorps 
($inout0,$inout3); @@ -1253,9 +1286,8 @@ if ($PREFIX eq "aesni") { &lea ($inp,&DWP(16*2,$inp)); &xorps ($inout0,$inout3); # input^=tweak &xorps ($inout1,$inout4); - &xorps ($inout2,$inout2); - &call ("_aesni_encrypt3"); + &call ("_aesni_encrypt2"); &xorps ($inout0,$inout3); # output^=tweak &xorps ($inout1,$inout4); @@ -1596,7 +1628,7 @@ if ($PREFIX eq "aesni") { &xorps ($inout0,$inout3); # input^=tweak &xorps ($inout1,$inout4); - &call ("_aesni_decrypt3"); + &call ("_aesni_decrypt2"); &xorps ($inout0,$inout3); # output^=tweak &xorps ($inout1,$inout4); @@ -1896,8 +1928,7 @@ if ($PREFIX eq "aesni") { &jmp (&label("cbc_dec_tail_collected")); &set_label("cbc_dec_two",16); - &xorps ($inout2,$inout2); - &call ("_aesni_decrypt3"); + &call ("_aesni_decrypt2"); &xorps ($inout0,$ivec); &xorps ($inout1,$in0); &movups (&QWP(0,$out),$inout0); diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl index 708fabd3de..31c80ae6bc 100644 --- a/crypto/aes/asm/aesni-x86_64.pl +++ b/crypto/aes/asm/aesni-x86_64.pl @@ -288,10 +288,49 @@ ___ # every *2nd* cycle. Thus 3x interleave was the one providing optimal # utilization, i.e. when subroutine's throughput is virtually same as # of non-interleaved subroutine [for number of input blocks up to 3]. -# This is why it makes no sense to implement 2x subroutine. -# aes[enc|dec] latency in next processor generation is 8, but the -# instructions can be scheduled every cycle. Optimal interleave for -# new processor is therefore 8x... +# This is why it originally made no sense to implement 2x subroutine. +# But times change and it became appropriate to spend extra 192 bytes +# on 2x subroutine on account of Atom Silvermont. For processors that +# can schedule aes[enc|dec] every cycle optimal interleave factor +# equals the corresponding instruction latency. 8x is optimal for +# * Bridge and "super-optimal" for other Intel CPUs... 
+ +sub aesni_generate2 { +my $dir=shift; +# As already mentioned it takes in $key and $rounds, which are *not* +# preserved. $inout[0-1] is cipher/clear text... +$code.=<<___; +.type _aesni_${dir}rypt2,\@abi-omnipotent +.align 16 +_aesni_${dir}rypt2: + $movkey ($key),$rndkey0 + shl \$4,$rounds + $movkey 16($key),$rndkey1 + xorps $rndkey0,$inout0 + xorps $rndkey0,$inout1 + $movkey 32($key),$rndkey0 + lea 32($key,$rounds),$key + neg %rax # $rounds + add \$16,%rax + +.L${dir}_loop2: + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + aes${dir} $rndkey0,$inout0 + aes${dir} $rndkey0,$inout1 + $movkey -16($key,%rax),$rndkey0 + jnz .L${dir}_loop2 + + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + aes${dir}last $rndkey0,$inout0 + aes${dir}last $rndkey0,$inout1 + ret +.size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2 +___ +} sub aesni_generate3 { my $dir=shift; # As already mentioned it takes in $key and $rounds, which are *not* @@ -524,6 +563,8 @@ _aesni_${dir}rypt8: .size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8 ___ } +&aesni_generate2("enc") if ($PREFIX eq "aesni"); +&aesni_generate2("dec"); &aesni_generate3("enc") if ($PREFIX eq "aesni"); &aesni_generate3("dec"); &aesni_generate4("enc") if ($PREFIX eq "aesni"); @@ -645,8 +686,7 @@ $code.=<<___; jmp .Lecb_ret .align 16 .Lecb_enc_two: - xorps $inout2,$inout2 - call _aesni_encrypt3 + call _aesni_encrypt2 movups $inout0,($out) movups $inout1,0x10($out) jmp .Lecb_ret @@ -782,8 +822,7 @@ $code.=<<___; jmp .Lecb_ret .align 16 .Lecb_dec_two: - xorps $inout2,$inout2 - call _aesni_decrypt3 + call _aesni_decrypt2 movups $inout0,($out) movups $inout1,0x10($out) jmp .Lecb_ret @@ -1875,7 +1914,7 @@ $code.=<<___; xorps @tweak[0],$inout0 xorps @tweak[1],$inout1 - call _aesni_encrypt3 + call _aesni_encrypt2 xorps @tweak[0],$inout0 movdqa @tweak[2],@tweak[0] @@ -2322,7 +2361,7 @@ $code.=<<___; xorps @tweak[0],$inout0 xorps @tweak[1],$inout1 - call _aesni_decrypt3 + call 
_aesni_decrypt2 xorps @tweak[0],$inout0 movdqa @tweak[2],@tweak[0] @@ -2831,8 +2870,7 @@ $code.=<<___; .align 16 .Lcbc_dec_two: movaps $inout1,$in1 - xorps $inout2,$inout2 - call _aesni_decrypt3 + call _aesni_decrypt2 pxor $iv,$inout0 movaps $in1,$iv pxor $in0,$inout1