From: Andy Polyakov <appro@openssl.org>
Date: Wed, 14 Jul 2010 08:43:38 +0000 (+0000)
Subject: aes-s390x.pl: revisit buffer allocation and add performance data.
X-Git-Tag: OpenSSL-fips-2_0-rc1~1048
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=26064d7f77ac20b61a35d758046eeb2344745e12;p=openssl

aes-s390x.pl: revisit buffer allocation and add performance data.
---

diff --git a/crypto/aes/asm/aes-s390x.pl b/crypto/aes/asm/aes-s390x.pl
index 38e18b2452..5ffacb8cf4 100644
--- a/crypto/aes/asm/aes-s390x.pl
+++ b/crypto/aes/asm/aes-s390x.pl
@@ -44,7 +44,7 @@
 # Unlike previous version hardware support detection takes place only
 # at the moment of key schedule setup, which is denoted in key->rounds.
 # This is done, because deferred key setup can't be made MT-safe, not
-# for key lengthes longer than 128 bits.
+# for keys longer than 128 bits.
 #
 # Add AES_cbc_encrypt, which gives incredible performance improvement,
 # it was measured to be ~6.6x. It's less than previously mentioned 8x,
@@ -52,7 +52,13 @@
 
 # May 2010.
 #
-# Add AES_ctr32_encrypt.
+# Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
+# performance improvement over "generic" counter mode routine relying
+# on single-block, also hardware-assisted, AES_encrypt. "Up to" refers
+# to the fact that exact throughput value depends on current stack
+# frame alignment within 4KB page. In worst case you get ~75% of the
+# maximum, but *on average* it would be as much as ~98%. Meaning that
+# worst case is unlike, it's like hitting ravine on plateau.
 
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
@@ -1367,24 +1373,27 @@ $code.=<<___ if (!$softonly);
 	lg	$iv0,0($ivp)	# load ivec
 	lg	$ivp,8($ivp)
 
-	# prepare and allocate stack frame
-	lghi	$s0,-272	# guarantee at least 256-bytes buffer
+	# prepare and allocate stack frame at the top of 4K page
+	# with 1K reserved for eventual signal handling
+	lghi	$s0,-1024-256-16# guarantee at least 256-bytes buffer
 	lghi	$s1,-4096
-	lgr	$fp,$sp
 	algr	$s0,$sp
+	lgr	$fp,$sp
 	ngr	$s0,$s1		# align at page boundary
-	la	$sp,0($s0)	# alloca
-	stg	$fp,0($s0)	# back-chain
-
-	# calculate resultant buffer size
-	la	$s0,16($s0)	# buffer starts at offset of 16
-	slgr	$fp,$s0
-	srlg	$fp,$fp,4	# $fp is buffer length in blocks, minimum 16
+	slgr	$fp,$s0		# total buffer size
+	lgr	$s2,$sp
+	lghi	$s1,1024+16	# sl[g]fi is extended-immediate facility
+	slgr	$fp,$s1		# deduct reservation to get usable buffer size
+	# buffer size is at lest 256 and at most 3072+256-16
+
+	la	$sp,1024($s0)	# alloca
+	srlg	$fp,$fp,4	# convert bytes to blocks, minimum 16
+	stg	$s2,0($sp)	# back-chain
 	stg	$fp,8($sp)
 
 	slgr	$len,$fp
 	brc	1,.Lctr32_hw_loop	# not zero, no borrow
-	algr	$fp,$len
+	algr	$fp,$len	# input is shorter than allocated buffer
 	lghi	$len,0
 	stg	$fp,8($sp)