From 6c79faaa9dd288bfda72831a9ef22ca01fa482d4 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@openssl.org>
Date: Tue, 26 Mar 2013 14:29:18 +0100
Subject: [PATCH] aesni-x86_64.pl: optimize CTR even further.

Based on suggestions from Shay Gueron and Vlad Krasnov.
PR: 3021
---
 crypto/aes/asm/aesni-x86_64.pl | 656 ++++++++++++++++-----------------
 1 file changed, 313 insertions(+), 343 deletions(-)

diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl
index 1f3c7f848b..27bb47c326 100644
--- a/crypto/aes/asm/aesni-x86_64.pl
+++ b/crypto/aes/asm/aesni-x86_64.pl
@@ -130,7 +130,7 @@
 # Further data for other parallelizable modes:
 #
 # CBC decrypt				1.16	0.93	0.93
-# CTR					1.14	0.91	0.86
+# CTR					1.14	0.91	0.77
 #
 # Well, given 3x column it's probably inappropriate to call the limit
 # asymptotic, if it can be surpassed, isn't it? What happens there?
@@ -160,7 +160,7 @@
 ######################################################################
 # For reference, AMD Bulldozer spends 5.77 cycles per byte processed
 # with 128-bit key in CBC encrypt and 0.76 cycles in CBC decrypt, 0.70
-# in ECB, 0.73 in CTR, 0.95 in XTS... This means that aes[enc|dec]
+# in ECB, 0.71 in CTR, 0.95 in XTS... This means that aes[enc|dec]
 # instruction latency is 9 cycles and that they can be issued every
 # cycle.
 
@@ -1011,385 +1011,389 @@ ___
 #                         const char *ivec);
 #
 # Handles only complete blocks, operates on 32-bit counter and
-# does not update *ivec! (see engine/eng_aesni.c for details)
+# does not update *ivec! (see crypto/modes/ctr128.c for details)
 #
+# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
+# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
+# Keywords are full unroll and modulo-schedule counter calculations
+# with zero-round key xor.
 {
-my ($in0,$in1,$in2,$in3,$one,$ivec)=map("%xmm$_",(10..15));
-my $len_="%r9";
+my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));	# input staging registers: %xmm10..%xmm15
+my ($key0,$ctr)=("${key_}d","${ivp}d");	# "d"-suffixed (32-bit) names of the $key_ and $ivp registers: 0-round key LSB and counter
+my $frame_size = 0x80 + ($win64?160:0);	# 0x80 = eight 16-byte counter blocks at 0x00..0x70(%rsp); 160 = ten 16-byte %xmm6-15 save slots on Win64
 
 $code.=<<___;
 .globl	aesni_ctr32_encrypt_blocks
 .type	aesni_ctr32_encrypt_blocks,\@function,5
 .align	16
 aesni_ctr32_encrypt_blocks:
+	lea	(%rsp),%rax
+	push	%rbp
+	sub	\$$frame_size,%rsp
+	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
 ___
 $code.=<<___ if ($win64);
-	lea	-0xa8(%rsp),%rsp
-	movaps	%xmm6,0x00(%rsp)
-	movaps	%xmm7,0x10(%rsp)
-	movaps	%xmm8,0x20(%rsp)
-	movaps	%xmm9,0x30(%rsp)
-	movaps	%xmm10,0x40(%rsp)
-	movaps	%xmm11,0x50(%rsp)
-	movaps	%xmm12,0x60(%rsp)
-	movaps	%xmm13,0x70(%rsp)
-	movaps	%xmm14,0x80(%rsp)
-	movaps	%xmm15,0x90(%rsp)
+	movaps	%xmm6,-0xa8(%rax)
+	movaps	%xmm7,-0x98(%rax)
+	movaps	%xmm8,-0x88(%rax)
+	movaps	%xmm9,-0x78(%rax)
+	movaps	%xmm10,-0x68(%rax)
+	movaps	%xmm11,-0x58(%rax)
+	movaps	%xmm12,-0x48(%rax)
+	movaps	%xmm13,-0x38(%rax)
+	movaps	%xmm14,-0x28(%rax)
+	movaps	%xmm15,-0x18(%rax)
 .Lctr32_body:
 ___
 $code.=<<___;
+	lea	-8(%rax),%rbp
+
 	cmp	\$1,$len
 	je	.Lctr32_one_shortcut
 
-	movzb	15($ivp),%rax			# counter LSB
-	mov	$len,$len_			# backup $len
-	mov	240($key),$rnds_		# key->rounds
-	mov	$key,$key_			# backup $key
-	movdqu	($ivp),$ivec
-	neg	%rax
-	movdqa	.Lincrement1(%rip),$one
-	add	\$256,%rax			# steps to closest overflow
-
-.Lctr32_grandloop:
-	cmp	%rax,$len
-	cmova	%rax,$len
-	mov	$rnds_,$rounds			# restore $rounds
-	sub	$len,$len_
+	movdqu	($ivp),$inout0
+	movdqu	($key),$rndkey0
+	mov	12($ivp),$ctr			# counter LSB
+	pxor	$rndkey0,$inout0
+	mov	12($key),$key0			# 0-round key LSB
+	movdqa	$inout0,0x00(%rsp)		# populate counter block
+	bswap	$ctr
+	movdqa	$inout0,0x10(%rsp)
+	movdqa	$inout0,0x20(%rsp)
+	movdqa	$inout0,0x30(%rsp)
+	movdqa	$inout0,0x40(%rsp)
+	movdqa	$inout0,0x50(%rsp)
+	movdqa	$inout0,0x60(%rsp)
+	movdqa	$inout0,0x70(%rsp)
+
+	mov	240($key),$rounds		# key->rounds
+
+	lea	1($ctr),%r9
+	 lea	2($ctr),%r10
+	bswap	%r9d
+	 bswap	%r10d
+	xor	$key0,%r9d
+	 xor	$key0,%r10d
+	mov	%r9d,0x10+12(%rsp)
+	lea	3($ctr),%r9
+	 mov	%r10d,0x20+12(%rsp)
+	bswap	%r9d
+	 lea	4($ctr),%r10
+	xor	$key0,%r9d
+	 bswap	%r10d
+	mov	%r9d,0x30+12(%rsp)
+	 xor	$key0,%r10d
+	lea	5($ctr),%r9
+	 mov	%r10d,0x40+12(%rsp)
+	bswap	%r9d
+	 lea	6($ctr),%r10
+	xor	$key0,%r9d
+	 bswap	%r10d
+	mov	%r9d,0x50+12(%rsp)
+	 xor	$key0,%r10d
+	lea	7($ctr),%r9
+	 mov	%r10d,0x60+12(%rsp)
+	bswap	%r9d
+	xor	$key0,%r9d
+	mov	%r9d,0x70+12(%rsp)
+
+	$movkey	0x10($key),$rndkey1
+
+	movdqa	0x10(%rsp),$inout1
+	movdqa	0x20(%rsp),$inout2
+	movdqa	0x30(%rsp),$inout3
+	movdqa	0x40(%rsp),$inout4
+	movdqa	0x50(%rsp),$inout5
 
 	cmp	\$8,$len
 	jb	.Lctr32_tail
 
-	$movkey	($key_),$rndkey0
-	shr	\$1,$rounds
-	shr	\$1,$rnds_
-	movdqa	$rndkey0,$inout0
-	movdqa	$rndkey0,$inout1
-	movdqa	$rndkey0,$inout2
-	movdqa	$rndkey0,$inout3
-	movdqa	$rndkey0,$inout4
-	movdqa	$rndkey0,$inout5
-	movdqa	$rndkey0,$inout6
-	movdqa	$rndkey0,$inout7
-	$movkey	16($key_),$rndkey1
+	lea	0x80($key),$key		# size optimization
 	sub	\$8,$len
 	jmp	.Lctr32_loop8
 
-.align	16
+.align	32
 .Lctr32_loop8:
-	pxor		$ivec,$inout0
-	paddb		$one,$ivec
-	 aesenc		$rndkey1,$inout0
-	pxor		$ivec,$inout1
-	paddb		$one,$ivec
-	 lea		32($key_),$key
-	 aesenc		$rndkey1,$inout1
-	pxor		$ivec,$inout2
-	paddb		$one,$ivec
-	 aesenc		$rndkey1,$inout2
-	pxor		$ivec,$inout3
-	paddb		$one,$ivec
-	 aesenc		$rndkey1,$inout3
-	pxor		$ivec,$inout4
-	paddb		$one,$ivec
-	 aesenc		$rndkey1,$inout4
-	pxor		$ivec,$inout5
-	paddb		$one,$ivec
-	 aesenc		$rndkey1,$inout5
-	pxor		$ivec,$inout6
-	paddb		$one,$ivec
-	 $movkey	($key),$rndkey0
-	 aesenc		$rndkey1,$inout6
-	pxor		$ivec,$inout7
-	paddb		$one,$ivec
-	 dec		$rounds
-	 aesenc		$rndkey1,$inout7
-	 $movkey	16($key),$rndkey1
+	 add		\$8,$ctr
+	movdqa		0x60(%rsp),$inout6
+	aesenc		$rndkey1,$inout0
+	 mov		$ctr,%r9d
+	movdqa		0x70(%rsp),$inout7
+	aesenc		$rndkey1,$inout1
+	 bswap		%r9d
+	$movkey		0x20-0x80($key),$rndkey0
+	aesenc		$rndkey1,$inout2
+	 xor		$key0,%r9d
+	aesenc		$rndkey1,$inout3
+	 mov		%r9d,0x00+12(%rsp)
+	 lea		1($ctr),%r9
+	aesenc		$rndkey1,$inout4
+	aesenc		$rndkey1,$inout5
+	aesenc		$rndkey1,$inout6
+	aesenc		$rndkey1,$inout7
+	$movkey		0x30-0x80($key),$rndkey1
+___
+for($i=2;$i<8;$i++) {
+my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
+$code.=<<___;
+	aesenc		$rndkeyx,$inout0
+	aesenc		$rndkeyx,$inout1
+	 bswap		%r9d
+	aesenc		$rndkeyx,$inout2
+	 xor		$key0,%r9d
+	aesenc		$rndkeyx,$inout3
+	 mov		%r9d,`0x10*($i-1)`+12(%rsp)
+	 lea		$i($ctr),%r9
+	aesenc		$rndkeyx,$inout4
+	aesenc		$rndkeyx,$inout5
+	aesenc		$rndkeyx,$inout6
+	aesenc		$rndkeyx,$inout7
+	$movkey		`0x20+0x10*$i`-0x80($key),$rndkeyx
+___
+}
+$code.=<<___;
+	aesenc		$rndkey0,$inout0
+	aesenc		$rndkey0,$inout1
+	 bswap		%r9d
+	aesenc		$rndkey0,$inout2
+	 xor		$key0,%r9d
+	aesenc		$rndkey0,$inout3
+	 mov		%r9d,0x70+12(%rsp)
+	aesenc		$rndkey0,$inout4
+	aesenc		$rndkey0,$inout5
+	aesenc		$rndkey0,$inout6
+	 movdqu		0x00($inp),$in0
+	aesenc		$rndkey0,$inout7
+	$movkey		0xa0-0x80($key),$rndkey0
+
+	cmp		\$11,$rounds
+	jb		.Lctr32_enc_done
+
+	aesenc		$rndkey1,$inout0
+	aesenc		$rndkey1,$inout1
+	aesenc		$rndkey1,$inout2
+	aesenc		$rndkey1,$inout3
+	aesenc		$rndkey1,$inout4
+	aesenc		$rndkey1,$inout5
+	aesenc		$rndkey1,$inout6
+	aesenc		$rndkey1,$inout7
+	$movkey		0xb0-0x80($key),$rndkey1
 
 	aesenc		$rndkey0,$inout0
 	aesenc		$rndkey0,$inout1
-	lea		32($key),$key
 	aesenc		$rndkey0,$inout2
-	  movups	($inp),$in0		# load input
 	aesenc		$rndkey0,$inout3
-	  movups	0x10($inp),$in1
 	aesenc		$rndkey0,$inout4
-	  movups	0x20($inp),$in2
 	aesenc		$rndkey0,$inout5
-	  movups	0x30($inp),$in3
 	aesenc		$rndkey0,$inout6
-	  movups	0x40($inp),$one
 	aesenc		$rndkey0,$inout7
-	$movkey		($key),$rndkey0
+	$movkey		0xc0-0x80($key),$rndkey0
+	je		.Lctr32_enc_done
 
-.Lctr32_enc_loop8:
 	aesenc		$rndkey1,$inout0
 	aesenc		$rndkey1,$inout1
-	dec		$rounds
 	aesenc		$rndkey1,$inout2
 	aesenc		$rndkey1,$inout3
 	aesenc		$rndkey1,$inout4
 	aesenc		$rndkey1,$inout5
 	aesenc		$rndkey1,$inout6
 	aesenc		$rndkey1,$inout7
-	$movkey		16($key),$rndkey1
+	$movkey		0xd0-0x80($key),$rndkey1
 
 	aesenc		$rndkey0,$inout0
 	aesenc		$rndkey0,$inout1
-	lea		32($key),$key
 	aesenc		$rndkey0,$inout2
 	aesenc		$rndkey0,$inout3
 	aesenc		$rndkey0,$inout4
 	aesenc		$rndkey0,$inout5
 	aesenc		$rndkey0,$inout6
 	aesenc		$rndkey0,$inout7
-	$movkey		($key),$rndkey0
-	jnz		.Lctr32_enc_loop8
+	$movkey		0xe0-0x80($key),$rndkey0
 
+.Lctr32_enc_done:
 	aesenc		$rndkey1,$inout0
+	movdqu		0x10($inp),$in1
 	pxor		$rndkey0,$in0
 	aesenc		$rndkey1,$inout1
+	movdqu		0x20($inp),$in2
 	pxor		$rndkey0,$in1
 	aesenc		$rndkey1,$inout2
+	movdqu		0x30($inp),$in3
 	pxor		$rndkey0,$in2
 	aesenc		$rndkey1,$inout3
+	movdqu		0x40($inp),$in4
 	pxor		$rndkey0,$in3
 	aesenc		$rndkey1,$inout4
-	pxor		$rndkey0,$one
+	movdqu		0x50($inp),$in5
+	pxor		$rndkey0,$in4
 	aesenc		$rndkey1,$inout5
+	pxor		$rndkey0,$in5
 	aesenc		$rndkey1,$inout6
 	aesenc		$rndkey1,$inout7
-	movdqu		0x50($inp),$rndkey1
+	movdqu		0x60($inp),$rndkey1
+
 	aesenclast	$in0,$inout0
-	movdqu		0x60($inp),$in0
 	pxor		$rndkey0,$rndkey1
+	movdqu		0x70($inp),$in0
+	lea		0x80($inp),$inp
 	aesenclast	$in1,$inout1
-	movdqu		0x70($inp),$in1
 	pxor		$rndkey0,$in0
+	movdqa		0x00(%rsp),$in1		# load next counter block
 	aesenclast	$in2,$inout2
-	pxor		$rndkey0,$in1
-	$movkey		($key_),$rndkey0
+	movdqa		0x10(%rsp),$in2
 	aesenclast	$in3,$inout3
-	lea		0x80($inp),$inp
-	aesenclast	$one,$inout4
-	movdqa		.Lincrement1(%rip),$one
-	aesenclast	$rndkey1,$inout5
-	$movkey		16($key_),$rndkey1
-	aesenclast	$in0,$inout6
-	aesenclast	$in1,$inout7
+	movdqa		0x20(%rsp),$in3
+	aesenclast	$in4,$inout4
+	movdqa		0x30(%rsp),$in4
+	aesenclast	$in5,$inout5
+	movdqa		0x40(%rsp),$in5
+	aesenclast	$rndkey1,$inout6
+	movdqa		0x50(%rsp),$rndkey0
+	aesenclast	$in0,$inout7
+	$movkey		0x10-0x80($key),$rndkey1
 
 	movups		$inout0,($out)		# store output
-	movdqa		$rndkey0,$inout0
+	movdqa		$in1,$inout0
 	movups		$inout1,0x10($out)
-	movdqa		$rndkey0,$inout1
+	movdqa		$in2,$inout1
 	movups		$inout2,0x20($out)
-	movdqa		$rndkey0,$inout2
+	movdqa		$in3,$inout2
 	movups		$inout3,0x30($out)
-	movdqa		$rndkey0,$inout3
+	movdqa		$in4,$inout3
 	movups		$inout4,0x40($out)
-	movdqa		$rndkey0,$inout4
+	movdqa		$in5,$inout4
 	movups		$inout5,0x50($out)
 	movdqa		$rndkey0,$inout5
 	movups		$inout6,0x60($out)
-	movdqa		$rndkey0,$inout6
 	movups		$inout7,0x70($out)
-	movdqa		$rndkey0,$inout7
 	lea		0x80($out),$out
 	
-	mov	$rnds_,$rounds
 	sub	\$8,$len
 	jnc	.Lctr32_loop8
 
-	lea	1($rounds,$rounds),$rounds	# restore original value
-	lea	1($rnds_,$rnds_),$rnds_		# restore original value
 	add	\$8,$len
 	jz	.Lctr32_done
+	lea	-0x80($key),$key
 
 .Lctr32_tail:
-	mov	$key_,$key			# restore $key
-	movdqa	$ivec,$inout0
-	paddb	$one,$ivec
-	movups	($inp),$in0
-	cmp	\$2,$len
-	jb	.Lctr32_one
-
-	movdqa	$ivec,$inout1
-	paddb	$one,$ivec
-	movups	0x10($inp),$in1
-	je	.Lctr32_two
-
-	movdqa	$ivec,$inout2
-	paddb	$one,$ivec
-	movups	0x20($inp),$in2
+	lea	16($key),$key
 	cmp	\$4,$len
-	jb	.Lctr32_three
+	jbe	.Lctr32_loop4
 
-	movdqa	$ivec,$inout3
-	paddb	$one,$ivec
-	movups	0x30($inp),$in3
-	je	.Lctr32_four
+	movdqa		0x60(%rsp),$inout6
 
-	movdqa	$ivec,$inout4
-	paddb	$one,$ivec
-	cmp	\$6,$len
-	jb	.Lctr32_five
+	$movkey		16($key),$rndkey0
+	aesenc		$rndkey1,$inout0
+	lea		16($key),$key
+	aesenc		$rndkey1,$inout1
+	shr		\$1,$rounds
+	aesenc		$rndkey1,$inout2
+	dec		$rounds
+	aesenc		$rndkey1,$inout3
+	aesenc		$rndkey1,$inout4
+	aesenc		$rndkey1,$inout5
+	aesenc		$rndkey1,$inout6
+	pxor		$inout7,$inout7
+	$movkey		16($key),$rndkey1
 
-	movdqa	$ivec,$inout5
-	paddb	$one,$ivec
-	je	.Lctr32_six
+	call            .Lenc_loop8_enter
 
-	movdqa	$ivec,$inout6
-	paddb	$one,$ivec
-	xorps	$inout7,$inout7
+	movups	($inp),$in0
+	movups	0x10($inp),$in1
+	movups	0x20($inp),$in2
+	xorps	$in0,$inout0
+	movups	0x30($inp),$in3
+	xorps	$in1,$inout1
+	movups	0x40($inp),$in0
+	xorps	$in2,$inout2
+	movups	$inout0,($out)
+	xorps	$in3,$inout3
+	movups	$inout1,0x10($out)
+	xorps	$in0,$inout4
+	movups	$inout2,0x20($out)
+	movups	$inout3,0x30($out)
+	movups	$inout4,0x40($out)
+	cmp	\$6,$len
+	jb	.Lctr32_done
 
-	call	_aesni_encrypt8
+	movups	0x50($inp),$in1
+	xorps	$in1,$inout5
+	movups	$inout5,0x50($out)
+	je	.Lctr32_done
 
-	xorps		$in0,$inout0		# xor
-	movups		0x40($inp),$in0
-	xorps		$in1,$inout1
-	movups		0x50($inp),$in1
-	xorps		$in2,$inout2
-	movups		0x60($inp),$in2
-	lea		0x70($inp),$inp
-	xorps		$in3,$inout3
-	movups		$inout0,($out)		# store output
-	xorps		$in0,$inout4
-	movups		$inout1,0x10($out)
-	xorps		$in1,$inout5
-	movups		$inout2,0x20($out)
-	xorps		$in2,$inout6
-	movups		$inout3,0x30($out)
-	movups		$inout4,0x40($out)
-	movups		$inout5,0x50($out)
-	movups		$inout6,0x60($out)
-	lea		0x70($out),$out
+	movups	0x60($inp),$in2
+	xorps	$in2,$inout6
+	movups	$inout6,0x60($out)
 	jmp	.Lctr32_done
 
-.align	16
-.Lctr32_one_shortcut:
-	movups	($ivp),$inout0
-	xor	$len_,$len_
+.align	32
+.Lctr32_loop4:
+	aesenc		$rndkey1,$inout0
+	lea		16($key),$key
+	aesenc		$rndkey1,$inout1
+	aesenc		$rndkey1,$inout2
+	aesenc		$rndkey1,$inout3
+	$movkey		($key),$rndkey1
+	dec		$rounds
+	jnz		.Lctr32_loop4
+	aesenclast	$rndkey1,$inout0
+	aesenclast	$rndkey1,$inout1
+	aesenclast	$rndkey1,$inout2
+	aesenclast	$rndkey1,$inout3
+
 	movups	($inp),$in0
-	mov	240($key),$rounds		# key->rounds
-.Lctr32_one:
-___
-	&aesni_generate1("enc",$key,$rounds);
-$code.=<<___;
 	xorps	$in0,$inout0
-	lea	0x10($inp),$inp
 	movups	$inout0,($out)
-	lea	0x10($out),$out
-	jmp	.Lctr32_done
+	cmp	\$2,$len
+	jb	.Lctr32_done
 
-.align	16
-.Lctr32_two:
-	xorps	$inout2,$inout2
-	call	_aesni_encrypt3
-	xorps	$in0,$inout0		# xor
-	lea	0x20($inp),$inp
+	movups	0x10($inp),$in1
 	xorps	$in1,$inout1
-	movups	$inout0,($out)		# store output
 	movups	$inout1,0x10($out)
-	lea	0x20($out),$out
-	jmp	.Lctr32_done
+	je	.Lctr32_done
 
-.align	16
-.Lctr32_three:
-	call	_aesni_encrypt3
-	xorps	$in0,$inout0		# xor
-	lea	0x30($inp),$inp
-	xorps	$in1,$inout1
-	movups	$inout0,($out)		# store output
+	movups	0x20($inp),$in2
 	xorps	$in2,$inout2
-	movups	$inout1,0x10($out)
 	movups	$inout2,0x20($out)
-	lea	0x30($out),$out
-	jmp	.Lctr32_done
+	cmp	\$4,$len
+	jb	.Lctr32_done
 
-.align	16
-.Lctr32_four:
-	call	_aesni_encrypt4
-	xorps	$in0,$inout0		# xor
-	lea	0x40($inp),$inp
-	xorps	$in1,$inout1
-	movups	$inout0,($out)		# store output
-	xorps	$in2,$inout2
-	movups	$inout1,0x10($out)
+	movups	0x30($inp),$in3
 	xorps	$in3,$inout3
-	movups	$inout2,0x20($out)
 	movups	$inout3,0x30($out)
-	lea	0x40($out),$out
 	jmp	.Lctr32_done
 
 .align	16
-.Lctr32_five:
-	xorps	$inout5,$inout5
-	call	_aesni_encrypt6
-	xorps	$in0,$inout0		# xor
-	movups	0x40($inp),$in0
-	lea	0x50($inp),$inp
-	xorps	$in1,$inout1
-	movups	$inout0,($out)		# store output
-	xorps	$in2,$inout2
-	movups	$inout1,0x10($out)
-	xorps	$in3,$inout3
-	movups	$inout2,0x20($out)
-	xorps	$in0,$inout4
-	movups	$inout3,0x30($out)
-	movups	$inout4,0x40($out)
-	lea	0x50($out),$out
+.Lctr32_one_shortcut:
+	movups	($ivp),$inout0
+	movups	($inp),$in0
+	mov	240($key),$rounds		# key->rounds
+___
+	&aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+	xorps	$in0,$inout0
+	movups	$inout0,($out)
 	jmp	.Lctr32_done
 
 .align	16
-.Lctr32_six:
-	call	_aesni_encrypt6
-	xorps	$in0,$inout0		# xor
-	movups	0x40($inp),$in0
-	xorps	$in1,$inout1
-	movups	0x50($inp),$in1
-	lea	0x60($inp),$inp
-	xorps	$in2,$inout2
-	movups	$inout0,($out)		# store output
-	xorps	$in3,$inout3
-	movups	$inout1,0x10($out)
-	xorps	$in0,$inout4
-	movups	$inout2,0x20($out)
-	xorps	$in1,$inout5
-	movups	$inout3,0x30($out)
-	movups	$inout4,0x40($out)
-	movups	$inout5,0x50($out)
-	lea	0x60($out),$out
-
 .Lctr32_done:
-	test	$len_,$len_
-	jz	.Lctr32_really_done
-
-	movdqa	.Lbswap_mask(%rip),$rndkey1
-	pshufb	$rndkey1,$ivec
-	psrldq	\$14,$one		# 256
-	paddd	$one,$ivec
-	pslldq	\$14,$one
-	pshufb	$rndkey1,$ivec
-	mov	$len_,$len
-	mov	\$256,%rax
-	jmp	.Lctr32_grandloop
-
-.Lctr32_really_done:
 ___
 $code.=<<___ if ($win64);
-	movaps	0x00(%rsp),%xmm6
-	movaps	0x10(%rsp),%xmm7
-	movaps	0x20(%rsp),%xmm8
-	movaps	0x30(%rsp),%xmm9
-	movaps	0x40(%rsp),%xmm10
-	movaps	0x50(%rsp),%xmm11
-	movaps	0x60(%rsp),%xmm12
-	movaps	0x70(%rsp),%xmm13
-	movaps	0x80(%rsp),%xmm14
-	movaps	0x90(%rsp),%xmm15
-	lea	0xa8(%rsp),%rsp
+	movaps	-0xa0(%rbp),%xmm6
+	movaps	-0x90(%rbp),%xmm7
+	movaps	-0x80(%rbp),%xmm8
+	movaps	-0x70(%rbp),%xmm9
+	movaps	-0x60(%rbp),%xmm10
+	movaps	-0x50(%rbp),%xmm11
+	movaps	-0x40(%rbp),%xmm12
+	movaps	-0x30(%rbp),%xmm13
+	movaps	-0x20(%rbp),%xmm14
+	movaps	-0x10(%rbp),%xmm15
 ___
 $code.=<<___;
-.Lctr32_ret:
+	lea	(%rbp),%rsp
+	pop	%rbp
+.Lctr32_epilogue:
 	ret
 .size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
 ___
@@ -1417,16 +1421,16 @@ aesni_xts_encrypt:
 	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
 ___
 $code.=<<___ if ($win64);
-	movaps	%xmm6,0x60(%rsp)
-	movaps	%xmm7,0x70(%rsp)
-	movaps	%xmm8,0x80(%rsp)
-	movaps	%xmm9,0x90(%rsp)
-	movaps	%xmm10,0xa0(%rsp)
-	movaps	%xmm11,0xb0(%rsp)
-	movaps	%xmm12,0xc0(%rsp)
-	movaps	%xmm13,0xd0(%rsp)
-	movaps	%xmm14,0xe0(%rsp)
-	movaps	%xmm15,0xf0(%rsp)
+	movaps	%xmm6,-0xa8(%rax)
+	movaps	%xmm7,-0x98(%rax)
+	movaps	%xmm8,-0x88(%rax)
+	movaps	%xmm9,-0x78(%rax)
+	movaps	%xmm10,-0x68(%rax)
+	movaps	%xmm11,-0x58(%rax)
+	movaps	%xmm12,-0x48(%rax)
+	movaps	%xmm13,-0x38(%rax)
+	movaps	%xmm14,-0x28(%rax)
+	movaps	%xmm15,-0x18(%rax)
 .Lxts_enc_body:
 ___
 $code.=<<___;
@@ -1782,16 +1786,16 @@ $code.=<<___;
 .Lxts_enc_ret:
 ___
 $code.=<<___ if ($win64);
-	movaps	0x60(%rsp),%xmm6
-	movaps	0x70(%rsp),%xmm7
-	movaps	0x80(%rsp),%xmm8
-	movaps	0x90(%rsp),%xmm9
-	movaps	0xa0(%rsp),%xmm10
-	movaps	0xb0(%rsp),%xmm11
-	movaps	0xc0(%rsp),%xmm12
-	movaps	0xd0(%rsp),%xmm13
-	movaps	0xe0(%rsp),%xmm14
-	movaps	0xf0(%rsp),%xmm15
+	movaps	-0xa0(%rbp),%xmm6
+	movaps	-0x90(%rbp),%xmm7
+	movaps	-0x80(%rbp),%xmm8
+	movaps	-0x70(%rbp),%xmm9
+	movaps	-0x60(%rbp),%xmm10
+	movaps	-0x50(%rbp),%xmm11
+	movaps	-0x40(%rbp),%xmm12
+	movaps	-0x30(%rbp),%xmm13
+	movaps	-0x20(%rbp),%xmm14
+	movaps	-0x10(%rbp),%xmm15
 ___
 $code.=<<___;
 	lea	(%rbp),%rsp
@@ -1812,16 +1816,16 @@ aesni_xts_decrypt:
 	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
 ___
 $code.=<<___ if ($win64);
-	movaps	%xmm6,0x60(%rsp)
-	movaps	%xmm7,0x70(%rsp)
-	movaps	%xmm8,0x80(%rsp)
-	movaps	%xmm9,0x90(%rsp)
-	movaps	%xmm10,0xa0(%rsp)
-	movaps	%xmm11,0xb0(%rsp)
-	movaps	%xmm12,0xc0(%rsp)
-	movaps	%xmm13,0xd0(%rsp)
-	movaps	%xmm14,0xe0(%rsp)
-	movaps	%xmm15,0xf0(%rsp)
+	movaps	%xmm6,-0xa8(%rax)
+	movaps	%xmm7,-0x98(%rax)
+	movaps	%xmm8,-0x88(%rax)
+	movaps	%xmm9,-0x78(%rax)
+	movaps	%xmm10,-0x68(%rax)
+	movaps	%xmm11,-0x58(%rax)
+	movaps	%xmm12,-0x48(%rax)
+	movaps	%xmm13,-0x38(%rax)
+	movaps	%xmm14,-0x28(%rax)
+	movaps	%xmm15,-0x18(%rax)
 .Lxts_dec_body:
 ___
 $code.=<<___;
@@ -2213,16 +2217,16 @@ $code.=<<___;
 .Lxts_dec_ret:
 ___
 $code.=<<___ if ($win64);
-	movaps	0x60(%rsp),%xmm6
-	movaps	0x70(%rsp),%xmm7
-	movaps	0x80(%rsp),%xmm8
-	movaps	0x90(%rsp),%xmm9
-	movaps	0xa0(%rsp),%xmm10
-	movaps	0xb0(%rsp),%xmm11
-	movaps	0xc0(%rsp),%xmm12
-	movaps	0xd0(%rsp),%xmm13
-	movaps	0xe0(%rsp),%xmm14
-	movaps	0xf0(%rsp),%xmm15
+	movaps	-0xa0(%rbp),%xmm6
+	movaps	-0x90(%rbp),%xmm7
+	movaps	-0x80(%rbp),%xmm8
+	movaps	-0x70(%rbp),%xmm9
+	movaps	-0x60(%rbp),%xmm10
+	movaps	-0x50(%rbp),%xmm11
+	movaps	-0x40(%rbp),%xmm12
+	movaps	-0x30(%rbp),%xmm13
+	movaps	-0x20(%rbp),%xmm14
+	movaps	-0x10(%rbp),%xmm15
 ___
 $code.=<<___;
 	lea	(%rbp),%rsp
@@ -2914,45 +2918,9 @@ ccm64_se_handler:
 	jmp	.Lcommon_seh_tail
 .size	ccm64_se_handler,.-ccm64_se_handler
 
-.type	ctr32_se_handler,\@abi-omnipotent
+.type	ctr_xts_se_handler,\@abi-omnipotent
 .align	16
-ctr32_se_handler:
-	push	%rsi
-	push	%rdi
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
-	push	%r14
-	push	%r15
-	pushfq
-	sub	\$64,%rsp
-
-	mov	120($context),%rax	# pull context->Rax
-	mov	248($context),%rbx	# pull context->Rip
-
-	lea	.Lctr32_body(%rip),%r10
-	cmp	%r10,%rbx		# context->Rip<"prologue" label
-	jb	.Lcommon_seh_tail
-
-	mov	152($context),%rax	# pull context->Rsp
-
-	lea	.Lctr32_ret(%rip),%r10
-	cmp	%r10,%rbx
-	jae	.Lcommon_seh_tail
-
-	lea	(%rax),%rsi		# %xmm save area
-	lea	512($context),%rdi	# &context.Xmm6
-	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
-	.long	0xa548f3fc		# cld; rep movsq
-	lea	0xa8(%rax),%rax		# adjust stack pointer
-
-	jmp	.Lcommon_seh_tail
-.size	ctr32_se_handler,.-ctr32_se_handler
-
-.type	xts_se_handler,\@abi-omnipotent
-.align	16
-xts_se_handler:
+ctr_xts_se_handler:
 	push	%rsi
 	push	%rdi
 	push	%rbx
@@ -2982,13 +2950,14 @@ xts_se_handler:
 	cmp	%r10,%rbx		# context->Rip>=epilogue label
 	jae	.Lcommon_seh_tail
 
-	lea	0x60(%rax),%rsi		# %xmm save area
+	mov	160($context),%rax	# pull context->Rbp
+	lea	-0xa0(%rax),%rsi	# %xmm save area
 	lea	512($context),%rdi	# & context.Xmm6
 	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
 	.long	0xa548f3fc		# cld; rep movsq
 
 	jmp	.Lcommon_rbp_tail
-.size	xts_se_handler,.-xts_se_handler
+.size	ctr_xts_se_handler,.-ctr_xts_se_handler
 ___
 $code.=<<___;
 .type	cbc_se_handler,\@abi-omnipotent
@@ -3132,14 +3101,15 @@ $code.=<<___ if ($PREFIX eq "aesni");
 	.rva	.Lccm64_dec_body,.Lccm64_dec_ret	# HandlerData[]
 .LSEH_info_ctr32:
 	.byte	9,0,0,0
-	.rva	ctr32_se_handler
+	.rva	ctr_xts_se_handler
+	.rva	.Lctr32_body,.Lctr32_epilogue		# HandlerData[]
 .LSEH_info_xts_enc:
 	.byte	9,0,0,0
-	.rva	xts_se_handler
+	.rva	ctr_xts_se_handler
 	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
 .LSEH_info_xts_dec:
 	.byte	9,0,0,0
-	.rva	xts_se_handler
+	.rva	ctr_xts_se_handler
 	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
 ___
 $code.=<<___;
-- 
2.40.0