From: Andy Polyakov <appro@openssl.org>
Date: Fri, 21 Feb 2014 11:14:04 +0000 (+0100)
Subject: aes/asm/aesni-x86[_64].pl: minor Atom-specific performance tweak.
X-Git-Tag: OpenSSL_1_0_2-beta1~9
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=2d4d9623da229162ad4377174526af3c01b1707a;p=openssl

aes/asm/aesni-x86[_64].pl: minor Atom-specific performance tweak.
(cherry picked from commit 214368ffee5736836e2dbb80a16a4fbd85f0eaf9)
---

diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl
index c3df97db7b..3deb86aed6 100644
--- a/crypto/aes/asm/aesni-x86.pl
+++ b/crypto/aes/asm/aesni-x86.pl
@@ -207,12 +207,45 @@ sub aesni_generate1	# fully unrolled loop
 # every *2nd* cycle. Thus 3x interleave was the one providing optimal
 # utilization, i.e. when subroutine's throughput is virtually same as
 # of non-interleaved subroutine [for number of input blocks up to 3].
-# This is why it makes no sense to implement 2x subroutine.
-# aes[enc|dec] latency in next processor generation is 8, but the
-# instructions can be scheduled every cycle. Optimal interleave for
-# new processor is therefore 8x, but it's unfeasible to accommodate it
-# in XMM registers addreassable in 32-bit mode and therefore 6x is
-# used instead...
+# This is why it originally made no sense to implement 2x subroutine.
+# But times change and it became appropriate to spend extra 192 bytes
+# on 2x subroutine on Atom Silvermont account. For processors that
+# can schedule aes[enc|dec] every cycle optimal interleave factor
+# equals to corresponding instructions latency. 8x is optimal for
+# * Bridge, but it's unfeasible to accommodate such implementation
+# in XMM registers addreassable in 32-bit mode and therefore maximum
+# of 6x is used instead...
+
+sub aesni_generate2
+{ my $p=shift;
+
+    &function_begin_B("_aesni_${p}rypt2");
+	&$movekey	($rndkey0,&QWP(0,$key));
+	&shl		($rounds,4);
+	&$movekey	($rndkey1,&QWP(16,$key));
+	&xorps		($inout0,$rndkey0);
+	&pxor		($inout1,$rndkey0);
+	&$movekey	($rndkey0,&QWP(32,$key));
+	&lea		($key,&DWP(32,$key,$rounds));
+	&neg		($rounds);
+	&add		($rounds,16);
+
+    &set_label("${p}2_loop");
+	eval"&aes${p}	($inout0,$rndkey1)";
+	eval"&aes${p}	($inout1,$rndkey1)";
+	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
+	&add		($rounds,32);
+	eval"&aes${p}	($inout0,$rndkey0)";
+	eval"&aes${p}	($inout1,$rndkey0)";
+	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
+	&jnz		(&label("${p}2_loop"));
+    eval"&aes${p}	($inout0,$rndkey1)";
+    eval"&aes${p}	($inout1,$rndkey1)";
+    eval"&aes${p}last	($inout0,$rndkey0)";
+    eval"&aes${p}last	($inout1,$rndkey0)";
+    &ret();
+    &function_end_B("_aesni_${p}rypt2");
+}
 
 sub aesni_generate3
 { my $p=shift;
@@ -357,6 +390,8 @@ sub aesni_generate6
     &ret();
     &function_end_B("_aesni_${p}rypt6");
 }
+&aesni_generate2("enc") if ($PREFIX eq "aesni");
+&aesni_generate2("dec");
 &aesni_generate3("enc") if ($PREFIX eq "aesni");
 &aesni_generate3("dec");
 &aesni_generate4("enc") if ($PREFIX eq "aesni");
@@ -460,8 +495,7 @@ if ($PREFIX eq "aesni") {
 	&jmp	(&label("ecb_ret"));
 
 &set_label("ecb_enc_two",16);
-	&xorps	($inout2,$inout2);
-	&call	("_aesni_encrypt3");
+	&call	("_aesni_encrypt2");
 	&movups	(&QWP(0,$out),$inout0);
 	&movups	(&QWP(0x10,$out),$inout1);
 	&jmp	(&label("ecb_ret"));
@@ -561,8 +595,7 @@ if ($PREFIX eq "aesni") {
 	&jmp	(&label("ecb_ret"));
 
 &set_label("ecb_dec_two",16);
-	&xorps	($inout2,$inout2);
-	&call	("_aesni_decrypt3");
+	&call	("_aesni_decrypt2");
 	&movups	(&QWP(0,$out),$inout0);
 	&movups	(&QWP(0x10,$out),$inout1);
 	&jmp	(&label("ecb_ret"));
@@ -982,7 +1015,7 @@ if ($PREFIX eq "aesni") {
 	&jmp	(&label("ctr32_ret"));
 
 &set_label("ctr32_two",16);
-	&call	("_aesni_encrypt3");
+	&call	("_aesni_encrypt2");
 	&movups	($inout3,&QWP(0,$inp));
 	&movups	($inout4,&QWP(0x10,$inp));
 	&xorps	($inout0,$inout3);
@@ -1253,9 +1286,8 @@ if ($PREFIX eq "aesni") {
 	&lea	($inp,&DWP(16*2,$inp));
 	&xorps	($inout0,$inout3);		# input^=tweak
 	&xorps	($inout1,$inout4);
-	&xorps	($inout2,$inout2);
 
-	&call	("_aesni_encrypt3");
+	&call	("_aesni_encrypt2");
 
 	&xorps	($inout0,$inout3);		# output^=tweak
 	&xorps	($inout1,$inout4);
@@ -1596,7 +1628,7 @@ if ($PREFIX eq "aesni") {
 	&xorps	($inout0,$inout3);		# input^=tweak
 	&xorps	($inout1,$inout4);
 
-	&call	("_aesni_decrypt3");
+	&call	("_aesni_decrypt2");
 
 	&xorps	($inout0,$inout3);		# output^=tweak
 	&xorps	($inout1,$inout4);
@@ -1896,8 +1928,7 @@ if ($PREFIX eq "aesni") {
 	&jmp	(&label("cbc_dec_tail_collected"));
 
 &set_label("cbc_dec_two",16);
-	&xorps	($inout2,$inout2);
-	&call	("_aesni_decrypt3");
+	&call	("_aesni_decrypt2");
 	&xorps	($inout0,$ivec);
 	&xorps	($inout1,$in0);
 	&movups	(&QWP(0,$out),$inout0);
diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl
index 708fabd3de..31c80ae6bc 100644
--- a/crypto/aes/asm/aesni-x86_64.pl
+++ b/crypto/aes/asm/aesni-x86_64.pl
@@ -288,10 +288,49 @@ ___
 # every *2nd* cycle. Thus 3x interleave was the one providing optimal
 # utilization, i.e. when subroutine's throughput is virtually same as
 # of non-interleaved subroutine [for number of input blocks up to 3].
-# This is why it makes no sense to implement 2x subroutine.
-# aes[enc|dec] latency in next processor generation is 8, but the
-# instructions can be scheduled every cycle. Optimal interleave for
-# new processor is therefore 8x...
+# This is why it originally made no sense to implement 2x subroutine.
+# But times change and it became appropriate to spend extra 192 bytes
+# on 2x subroutine on Atom Silvermont account. For processors that
+# can schedule aes[enc|dec] every cycle optimal interleave factor
+# equals to corresponding instructions latency. 8x is optimal for
+# * Bridge and "super-optimal" for other Intel CPUs... 
+
+sub aesni_generate2 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-1] is cipher/clear text...
+$code.=<<___;
+.type	_aesni_${dir}rypt2,\@abi-omnipotent
+.align	16
+_aesni_${dir}rypt2:
+	$movkey	($key),$rndkey0
+	shl	\$4,$rounds
+	$movkey	16($key),$rndkey1
+	xorps	$rndkey0,$inout0
+	xorps	$rndkey0,$inout1
+	$movkey	32($key),$rndkey0
+	lea	32($key,$rounds),$key
+	neg	%rax				# $rounds
+	add	\$16,%rax
+
+.L${dir}_loop2:
+	aes${dir}	$rndkey1,$inout0
+	aes${dir}	$rndkey1,$inout1
+	$movkey		($key,%rax),$rndkey1
+	add		\$32,%rax
+	aes${dir}	$rndkey0,$inout0
+	aes${dir}	$rndkey0,$inout1
+	$movkey		-16($key,%rax),$rndkey0
+	jnz		.L${dir}_loop2
+
+	aes${dir}	$rndkey1,$inout0
+	aes${dir}	$rndkey1,$inout1
+	aes${dir}last	$rndkey0,$inout0
+	aes${dir}last	$rndkey0,$inout1
+	ret
+.size	_aesni_${dir}rypt2,.-_aesni_${dir}rypt2
+___
+}
 sub aesni_generate3 {
 my $dir=shift;
 # As already mentioned it takes in $key and $rounds, which are *not*
@@ -524,6 +563,8 @@ _aesni_${dir}rypt8:
 .size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
 ___
 }
+&aesni_generate2("enc") if ($PREFIX eq "aesni");
+&aesni_generate2("dec");
 &aesni_generate3("enc") if ($PREFIX eq "aesni");
 &aesni_generate3("dec");
 &aesni_generate4("enc") if ($PREFIX eq "aesni");
@@ -645,8 +686,7 @@ $code.=<<___;
 	jmp	.Lecb_ret
 .align	16
 .Lecb_enc_two:
-	xorps	$inout2,$inout2
-	call	_aesni_encrypt3
+	call	_aesni_encrypt2
 	movups	$inout0,($out)
 	movups	$inout1,0x10($out)
 	jmp	.Lecb_ret
@@ -782,8 +822,7 @@ $code.=<<___;
 	jmp	.Lecb_ret
 .align	16
 .Lecb_dec_two:
-	xorps	$inout2,$inout2
-	call	_aesni_decrypt3
+	call	_aesni_decrypt2
 	movups	$inout0,($out)
 	movups	$inout1,0x10($out)
 	jmp	.Lecb_ret
@@ -1875,7 +1914,7 @@ $code.=<<___;
 	xorps	@tweak[0],$inout0
 	xorps	@tweak[1],$inout1
 
-	call	_aesni_encrypt3
+	call	_aesni_encrypt2
 
 	xorps	@tweak[0],$inout0
 	movdqa	@tweak[2],@tweak[0]
@@ -2322,7 +2361,7 @@ $code.=<<___;
 	xorps	@tweak[0],$inout0
 	xorps	@tweak[1],$inout1
 
-	call	_aesni_decrypt3
+	call	_aesni_decrypt2
 
 	xorps	@tweak[0],$inout0
 	movdqa	@tweak[2],@tweak[0]
@@ -2831,8 +2870,7 @@ $code.=<<___;
 .align	16
 .Lcbc_dec_two:
 	movaps	$inout1,$in1
-	xorps	$inout2,$inout2
-	call	_aesni_decrypt3
+	call	_aesni_decrypt2
 	pxor	$iv,$inout0
 	movaps	$in1,$iv
 	pxor	$in0,$inout1