.globl AES_encrypt
.type AES_encrypt,\@function,3
.align 16
+.globl asm_AES_encrypt
+.hidden asm_AES_encrypt
+asm_AES_encrypt:
AES_encrypt:
push %rbx
push %rbp
.globl AES_decrypt
.type AES_decrypt,\@function,3
.align 16
+.globl asm_AES_decrypt
+.hidden asm_AES_decrypt
+asm_AES_decrypt:
AES_decrypt:
push %rbx
push %rbp
.type AES_cbc_encrypt,\@function,6
.align 16
.extern OPENSSL_ia32cap_P
+.globl asm_AES_cbc_encrypt
+.hidden asm_AES_cbc_encrypt
+asm_AES_cbc_encrypt:
AES_cbc_encrypt:
cmp \$0,%rdx # check length
je .Lcbc_epilogue
--- /dev/null
+#!/usr/bin/env perl
+
+###################################################################
+### AES-128 [originally in CTR mode] ###
+### bitsliced implementation for Intel Core 2 processors ###
+### requires support of SSE extensions up to SSSE3 ###
+### Author: Emilia Käsper and Peter Schwabe ###
+### Date: 2009-03-19 ###
+### Public domain ###
+### ###
+### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
+### further information. ###
+###################################################################
+#
+# September 2011.
+#
+# Started as a transliteration to "perlasm", the original code has
+# undergone the following changes:
+#
+# - code was made position-independent;
+# - rounds were folded into a loop resulting in >5x size reduction
+# from 12.5KB to 2.2KB;
+# - above was possible thanks to mixcolumns() modification that
+# allowed to feed its output back to aesenc[last], this was
+# achieved at cost of two additional inter-registers moves;
+# - some instruction reordering and interleaving;
+# - this module doesn't implement key setup subroutine, instead it
+# relies on conversion of "conventional" key schedule as returned
+# by AES_set_encrypt_key (see discussion below);
+# - first and last round keys are treated differently, which allowed
+# to skip one shiftrows(), reduce bit-sliced key schedule and
+# speed-up conversion by 22%;
+# - support for 192- and 256-bit keys was added;
+#
+# Resulting performance in CPU cycles spent to encrypt one byte out
+# of 4096-byte buffer with 128-bit key is:
+#
+# Emilia's this(*) difference
+#
+# Core 2 9.30 8.69 +7%
+# Nehalem(**) 7.63 6.98 +9%
+# Atom 17.1 17.4 -2%(***)
+#
+# (*) Comparison is not completely fair, because "this" is ECB,
+# i.e. no extra processing such as counter values calculation
+# and xor-ing input as in Emilia's CTR implementation is
+# performed. However, the CTR calculations stand for not more
+# than 1% of total time, so comparison is *rather* fair.
+#
+# (**) Results were collected on Westmere, which is considered to
+# be equivalent to Nehalem for this code.
+#
+# (***) Slowdown on Atom is rather strange per se, because original
+# implementation has a number of 9+-bytes instructions, which
+# are bad for Atom front-end, and which I eliminated completely.
+# In attempt to address deterioration sbox() was tested in FP
+# SIMD "domain" (movaps instead of movdqa, xorps instead of
+# pxor, etc.). While it resulted in nominal 4% improvement on
+# Atom, it hurt Westmere by more than a factor of 2.
+#
+# As for key schedule conversion subroutine. Interface to OpenSSL
+# relies on per-invocation on-the-fly conversion. This naturally
+# has impact on performance, especially for short inputs. Conversion
+# time in CPU cycles and its ratio to CPU cycles spent in 8x block
+# function is:
+#
+# conversion conversion/8x block
+# Core 2 410 0.37
+# Nehalem 310 0.35
+# Atom 570 0.26
+#
+# The ratio values mean that 128-byte blocks will be processed
+# 21-27% slower, 256-byte blocks - 12-16%, 384-byte blocks - 8-11%,
+# etc. Then keep in mind that input sizes not divisible by 128 are
+# *effectively* slower, especially shortest ones, e.g. consecutive
+# 144-byte blocks are processed 44% slower than one would expect,
+# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
+# it's still faster than ["hyper-threading-safe" code path in]
+# aes-x86_64.pl on all lengths above 64 bytes...
+#
+# October 2011.
+#
+# Add decryption procedure. Performance in CPU cycles spent to decrypt
+# one byte out of 4096-byte buffer with 128-bit key is:
+#
+# Core 2 11.0
+# Nehalem 9.16
+#
+# November 2011.
+#
+# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
+# suboptimal, but XTS is meant to be used with larger blocks...
+#
+# <appro@openssl.org>
+
+# Usage: <script> [flavour] output.  A single argument containing a dot is
+# treated as the output file name and the flavour is left undefined.
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+# Win64 conventions are selected by a nasm/masm/mingw64 flavour or by an
+# output file ending in .asm.
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+# Look for x86_64-xlate.pl next to this script, then in ../../perlasm/.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+# From here on STDOUT is a pipe into the translator.  Die if the pipe
+# cannot be opened rather than carrying on with nowhere to send the output.
+open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
+
+# File-level register names; the benchmarking-only routines further down
+# use them, while later scopes declare their own $inp/$out/$len/$key.
+# NOTE(review): the list supplies four values for five names, so $ivp
+# starts out undefined here -- confirm nothing relies on it.
+my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
+my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
+my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
+
+{
+my ($key,$rounds,$const)=("%rax","%r10d","%r11"); # registers used by the helpers and _bsaes_[en|de]crypt8 in this scope: key schedule pointer, round counter, constants table pointer
+
+sub Sbox {
+# Emits the S-box as three stages: InBasisChange, Inv_GF256, OutBasisChange.
+# Arguments: eight value registers followed by eight scratch registers.
+# in:  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# out: lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
+my @bits=@_[0..7];
+my @scratch=@_[8..15]; # t0-t3 followed by s0-s3, as Inv_GF256 expects
+ InBasisChange(@bits);
+ Inv_GF256(@bits[6,5,0,3,7,1,4,2],@scratch);
+ OutBasisChange(@bits[7,1,4,2,6,5,0,3]);
+}
+
+sub InBasisChange {
+# Appends the input basis change (a fixed pxor sequence) to $code.
+# in:  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# out: lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
+my @v=@_[0..7];
+$code.=<<___;
+ pxor @v[6], @v[5]
+ pxor @v[1], @v[2]
+ pxor @v[0], @v[3]
+ pxor @v[2], @v[6]
+ pxor @v[0], @v[5]
+
+ pxor @v[3], @v[6]
+ pxor @v[7], @v[3]
+ pxor @v[5], @v[7]
+ pxor @v[4], @v[3]
+ pxor @v[5], @v[4]
+ pxor @v[1], @v[3]
+
+ pxor @v[7], @v[2]
+ pxor @v[5], @v[1]
+___
+}
+
+sub OutBasisChange {
+# Appends the output basis change (a fixed pxor sequence) to $code.
+# in:  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# out: lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
+my @v=@_[0..7];
+$code.=<<___;
+ pxor @v[6], @v[0]
+ pxor @v[4], @v[1]
+ pxor @v[0], @v[2]
+ pxor @v[6], @v[4]
+ pxor @v[1], @v[6]
+
+ pxor @v[5], @v[1]
+ pxor @v[3], @v[5]
+ pxor @v[7], @v[3]
+ pxor @v[5], @v[7]
+ pxor @v[5], @v[2]
+
+ pxor @v[7], @v[4]
+___
+}
+
+sub InvSbox {
+# Emits the inverse S-box as three stages: InvInBasisChange, Inv_GF256,
+# InvOutBasisChange.  Arguments: eight value registers, eight scratch.
+# in:  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# out: lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
+my @bits=@_[0..7];
+my @scratch=@_[8..15]; # t0-t3 followed by s0-s3, as Inv_GF256 expects
+ InvInBasisChange(@bits);
+ Inv_GF256(@bits[5,1,2,6,3,7,0,4],@scratch);
+ InvOutBasisChange(@bits[3,7,0,4,5,1,2,6]);
+}
+
+sub InvInBasisChange { # the OutBasisChange sequence replayed backwards
+my @v=@_[5,1,2,6,3,7,0,4]; # permute the arguments before emitting
+$code.=<<___;
+ pxor @v[7], @v[4]
+
+ pxor @v[5], @v[7]
+ pxor @v[5], @v[2]
+ pxor @v[7], @v[3]
+ pxor @v[3], @v[5]
+ pxor @v[5], @v[1]
+
+ pxor @v[1], @v[6]
+ pxor @v[0], @v[2]
+ pxor @v[6], @v[4]
+ pxor @v[6], @v[0]
+ pxor @v[4], @v[1]
+___
+}
+
+sub InvOutBasisChange { # the InBasisChange sequence replayed backwards
+my @v=@_[2,5,7,3,6,1,0,4]; # permute the arguments before emitting
+$code.=<<___;
+ pxor @v[5], @v[1]
+ pxor @v[7], @v[2]
+
+ pxor @v[1], @v[3]
+ pxor @v[5], @v[4]
+ pxor @v[5], @v[7]
+ pxor @v[4], @v[3]
+ pxor @v[0], @v[5]
+ pxor @v[7], @v[3]
+ pxor @v[2], @v[6]
+ pxor @v[1], @v[2]
+ pxor @v[3], @v[6]
+
+ pxor @v[0], @v[3]
+ pxor @v[6], @v[5]
+___
+}
+
+sub Mul_GF4 {
+#;*************************************************************
+#;* Mul_GF4: operands u0-u1 and v0-v1, result left in u0-u1,  *
+#;*          v0-v1 are only read, one temporary register      *
+#;*************************************************************
+my ($u0,$u1,$v0,$v1,$tmp)=@_;
+$code.=<<___;
+ movdqa $v0, $tmp
+ pxor $v1, $tmp
+ pand $u0, $tmp
+ pxor $u1, $u0
+ pand $v0, $u1
+ pand $v1, $u0
+ pxor $u1, $u0
+ pxor $tmp, $u1
+___
+}
+
+sub Mul_GF4_N { # not used, Mul_GF4_N_GF4 carries an interleaved copy
+# multiply and scale by N; same operands as Mul_GF4, result in u0-u1
+my ($u0,$u1,$v0,$v1,$tmp)=@_;
+$code.=<<___;
+ movdqa $v0, $tmp
+ pxor $v1, $tmp
+ pand $u0, $tmp
+ pxor $u1, $u0
+ pand $v0, $u1
+ pand $v1, $u0
+ pxor $u0, $u1
+ pxor $tmp, $u0
+___
+}
+
+sub Mul_GF4_N_GF4 {
+# Mul_GF4_N on (u0,u1,v0,v1,tmp0) interleaved line by line with
+# Mul_GF4 on (u2,u3,v2,v3,tmp1)
+my ($u0,$u1,$v0,$v1,$tmp0,
+ $u2,$u3,$v2,$v3,$tmp1)=@_;
+$code.=<<___;
+ movdqa $v0, $tmp0
+ movdqa $v2, $tmp1
+ pxor $v1, $tmp0
+ pxor $v3, $tmp1
+ pand $u0, $tmp0
+ pand $u2, $tmp1
+ pxor $u1, $u0
+ pxor $u3, $u2
+ pand $v0, $u1
+ pand $v2, $u3
+ pand $v1, $u0
+ pand $v3, $u2
+ pxor $u0, $u1
+ pxor $u3, $u2
+ pxor $tmp0, $u0
+ pxor $tmp1, $u3
+___
+}
+sub Mul_GF16_2 {
+# Arguments: p0-p7 (updated in place), q0-q3, four temporaries w0-w3.
+# Built from Mul_GF4 and Mul_GF4_N_GF4.  q0/q1 are xor-ed with q2/q3
+# before the first Mul_GF4_N_GF4 and xor-ed again (which restores them)
+# before the final Mul_GF4.
+my @p=@_[0..7];
+my @q=@_[8..11];
+my @w=@_[12..15];
+$code.=<<___;
+ movdqa @p[0], @w[0]
+ movdqa @p[1], @w[1]
+___
+ Mul_GF4(@p[0], @p[1], @q[0], @q[1], @w[2]);
+$code.=<<___;
+ pxor @p[2], @w[0]
+ pxor @p[3], @w[1]
+ pxor @q[2], @q[0]
+ pxor @q[3], @q[1]
+___
+ Mul_GF4_N_GF4(@w[0], @w[1], @q[0], @q[1], @w[3],
+ @p[2], @p[3], @q[2], @q[3], @w[2]);
+$code.=<<___;
+ pxor @w[0], @p[0]
+ pxor @w[0], @p[2]
+ pxor @w[1], @p[1]
+ pxor @w[1], @p[3]
+
+ movdqa @p[4], @w[0]
+ movdqa @p[5], @w[1]
+ pxor @p[6], @w[0]
+ pxor @p[7], @w[1]
+___
+ Mul_GF4_N_GF4(@w[0], @w[1], @q[0], @q[1], @w[3],
+ @p[6], @p[7], @q[2], @q[3], @w[2]);
+$code.=<<___;
+ pxor @q[2], @q[0]
+ pxor @q[3], @q[1]
+___
+ Mul_GF4(@p[4], @p[5], @q[0], @q[1], @w[3]);
+$code.=<<___;
+ pxor @w[0], @p[4]
+ pxor @w[0], @p[6]
+ pxor @w[1], @p[5]
+ pxor @w[1], @p[7]
+___
+}
+sub Inv_GF256 { # emits the inversion step called by both Sbox and InvSbox
+#;********************************************************************
+#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
+#;********************************************************************
+my @x=@_[0..7]; # x0-x7, updated in place
+my @t=@_[8..11]; # temporaries t0-t3
+my @s=@_[12..15]; # temporaries s0-s3
+# direct optimizations from hardware
+$code.=<<___;
+ movdqa @x[4], @t[3]
+ movdqa @x[5], @t[2]
+ movdqa @x[1], @t[1]
+ movdqa @x[7], @s[1]
+ movdqa @x[0], @s[0]
+
+ pxor @x[6], @t[3]
+ pxor @x[7], @t[2]
+ pxor @x[3], @t[1]
+ movdqa @t[3], @s[2]
+ pxor @x[6], @s[1]
+ movdqa @t[2], @t[0]
+ pxor @x[2], @s[0]
+ movdqa @t[3], @s[3]
+
+ por @t[1], @t[2]
+ por @s[0], @t[3]
+ pxor @t[0], @s[3]
+ pand @s[0], @s[2]
+ pxor @t[1], @s[0]
+ pand @t[1], @t[0]
+ pand @s[0], @s[3]
+ movdqa @x[3], @s[0]
+ pxor @x[2], @s[0]
+ pand @s[0], @s[1]
+ pxor @s[1], @t[3]
+ pxor @s[1], @t[2]
+ movdqa @x[4], @s[1]
+ movdqa @x[1], @s[0]
+ pxor @x[5], @s[1]
+ pxor @x[0], @s[0]
+ movdqa @s[1], @t[1]
+ pand @s[0], @s[1]
+ por @s[0], @t[1]
+ pxor @s[1], @t[0]
+ pxor @s[3], @t[3]
+ pxor @s[2], @t[2]
+ pxor @s[3], @t[1]
+ movdqa @x[7], @s[0]
+ pxor @s[2], @t[0]
+ movdqa @x[6], @s[1]
+ pxor @s[2], @t[1]
+ movdqa @x[5], @s[2]
+ pand @x[3], @s[0]
+ movdqa @x[4], @s[3]
+ pand @x[2], @s[1]
+ pand @x[1], @s[2]
+ por @x[0], @s[3]
+ pxor @s[0], @t[3]
+ pxor @s[1], @t[2]
+ pxor @s[2], @t[1]
+ pxor @s[3], @t[0]
+
+ #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
+
+ # new smaller inversion
+
+ movdqa @t[3], @s[0]
+ pand @t[1], @t[3]
+ pxor @t[2], @s[0]
+
+ movdqa @t[0], @s[2]
+ movdqa @s[0], @s[3]
+ pxor @t[3], @s[2]
+ pand @s[2], @s[3]
+
+ movdqa @t[1], @s[1]
+ pxor @t[2], @s[3]
+ pxor @t[0], @s[1]
+
+ pxor @t[2], @t[3]
+
+ pand @t[3], @s[1]
+
+ movdqa @s[2], @t[2]
+ pxor @t[0], @s[1]
+
+ pxor @s[1], @t[2]
+ pxor @s[1], @t[1]
+
+ pand @t[0], @t[2]
+
+ pxor @t[2], @s[2]
+ pxor @t[2], @t[1]
+
+ pand @s[3], @s[2]
+
+ pxor @s[0], @s[2]
+___
+# output of the sequence above is in s3, s2, s1, t1
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
+ &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); # second operand = (s3,s2,s1,t1); scratch = (s0,t0,t2,t3)
+
+### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
+}
+
+# AES linear components
+
+sub ShiftRows {
+# Arguments: eight state registers followed by a shuffle-mask register.
+# Each state register is xor-ed with one 16-byte slot of the 0x80-byte
+# round key at $key and then shuffled with pshufb; $key is advanced by
+# 0x80 in the emitted code.
+my @row=@_[0..7];
+my $shuf=$_[-1]; # last argument is the pshufb mask
+$code.=<<___;
+ pxor 0x00($key),@row[0]
+ pxor 0x10($key),@row[1]
+ pshufb $shuf,@row[0]
+ pxor 0x20($key),@row[2]
+ pshufb $shuf,@row[1]
+ pxor 0x30($key),@row[3]
+ pshufb $shuf,@row[2]
+ pxor 0x40($key),@row[4]
+ pshufb $shuf,@row[3]
+ pxor 0x50($key),@row[5]
+ pshufb $shuf,@row[4]
+ pxor 0x60($key),@row[6]
+ pshufb $shuf,@row[5]
+ pxor 0x70($key),@row[7]
+ pshufb $shuf,@row[6]
+ lea 0x80($key),$key
+ pshufb $shuf,@row[7]
+___
+}
+
+sub MixColumns { # emits one MixColumns step over eight registers, using eight scratch registers
+# modified to emit output in order suitable for feeding back to aesenc[last]
+my @x=@_[0..7]; # state registers; _bsaes_encrypt8 passes them in Sbox output order
+my @t=@_[8..15]; # scratch registers, all eight are overwritten
+$code.=<<___;
+ pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
+ pshufd \$0x93, @x[1], @t[1]
+ pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
+ pshufd \$0x93, @x[2], @t[2]
+ pxor @t[1], @x[1]
+ pshufd \$0x93, @x[3], @t[3]
+ pxor @t[2], @x[2]
+ pshufd \$0x93, @x[4], @t[4]
+ pxor @t[3], @x[3]
+ pshufd \$0x93, @x[5], @t[5]
+ pxor @t[4], @x[4]
+ pshufd \$0x93, @x[6], @t[6]
+ pxor @t[5], @x[5]
+ pshufd \$0x93, @x[7], @t[7]
+ pxor @t[6], @x[6]
+ pxor @t[7], @x[7]
+
+ pxor @x[0], @t[1]
+ pxor @x[7], @t[0]
+ pxor @x[7], @t[1]
+ pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
+ pxor @x[1], @t[2]
+ pshufd \$0x4E, @x[1], @x[1]
+ pxor @x[4], @t[5]
+ pxor @t[0], @x[0]
+ pxor @x[5], @t[6]
+ pxor @t[1], @x[1]
+ pxor @x[3], @t[4]
+ pshufd \$0x4E, @x[4], @t[0]
+ pxor @x[6], @t[7]
+ pshufd \$0x4E, @x[5], @t[1]
+ pxor @x[2], @t[3]
+ pshufd \$0x4E, @x[3], @x[4]
+ pxor @x[7], @t[3]
+ pshufd \$0x4E, @x[7], @x[5]
+ pxor @x[7], @t[4]
+ pshufd \$0x4E, @x[6], @x[3]
+ pxor @t[4], @t[0]
+ pshufd \$0x4E, @x[2], @x[6]
+ pxor @t[5], @t[1]
+
+ pxor @t[3], @x[4]
+ pxor @t[7], @x[5]
+ pxor @t[6], @x[3]
+ movdqa @t[0], @x[2]
+ pxor @t[2], @x[6]
+ movdqa @t[1], @x[7]
+___
+}
+
+sub InvMixColumns { # emits InvMixColumns in four stages (0x0e, 0x0b, 0x0d, 0x09); the result is finally moved into @XMM[0..7]
+my @x=@_[0..7]; # state registers, in the order the caller supplies them
+my @t=@_[8..15]; # eight scratch registers
+
+$code.=<<___;
+ # multiplication by 0x0e
+ pshufd \$0x93, @x[7], @t[7]
+ movdqa @x[2], @t[2]
+ pxor @x[5], @x[7] # 7 5
+ pxor @x[5], @x[2] # 2 5
+ pshufd \$0x93, @x[0], @t[0]
+ movdqa @x[5], @t[5]
+ pxor @x[0], @x[5] # 5 0 [1]
+ pxor @x[1], @x[0] # 0 1
+ pshufd \$0x93, @x[1], @t[1]
+ pxor @x[2], @x[1] # 1 25
+ pxor @x[6], @x[0] # 01 6 [2]
+ pxor @x[3], @x[1] # 125 3 [4]
+ pshufd \$0x93, @x[3], @t[3]
+ pxor @x[0], @x[2] # 25 016 [3]
+ pxor @x[7], @x[3] # 3 75
+ pxor @x[6], @x[7] # 75 6 [0]
+ pshufd \$0x93, @x[6], @t[6]
+ movdqa @x[4], @t[4]
+ pxor @x[4], @x[6] # 6 4
+ pxor @x[3], @x[4] # 4 375 [6]
+ pxor @x[7], @x[3] # 375 756=36
+ pxor @t[5], @x[6] # 64 5 [7]
+ pxor @t[2], @x[3] # 36 2
+ pxor @t[4], @x[3] # 362 4 [5]
+ pshufd \$0x93, @t[5], @t[5]
+___
+ my @y = @x[7,5,0,2,1,3,4,6]; # y[n] is the register tagged [n] in the comments of the 0x0e stage above
+$code.=<<___;
+ # multiplication by 0x0b
+ pxor @y[0], @y[1]
+ pxor @t[0], @y[0]
+ pxor @t[1], @y[1]
+ pshufd \$0x93, @t[2], @t[2]
+ pxor @t[5], @y[0]
+ pxor @t[6], @y[1]
+ pxor @t[7], @y[0]
+ pshufd \$0x93, @t[4], @t[4]
+ pxor @t[6], @t[7] # clobber t[7]
+ pxor @y[0], @y[1]
+
+ pxor @t[0], @y[3]
+ pshufd \$0x93, @t[0], @t[0]
+ pxor @t[1], @y[2]
+ pxor @t[1], @y[4]
+ pxor @t[2], @y[2]
+ pshufd \$0x93, @t[1], @t[1]
+ pxor @t[2], @y[3]
+ pxor @t[2], @y[5]
+ pxor @t[7], @y[2]
+ pshufd \$0x93, @t[2], @t[2]
+ pxor @t[3], @y[3]
+ pxor @t[3], @y[6]
+ pxor @t[3], @y[4]
+ pshufd \$0x93, @t[3], @t[3]
+ pxor @t[4], @y[7]
+ pxor @t[4], @y[5]
+ pxor @t[7], @y[7]
+ pxor @t[5], @y[3]
+ pxor @t[4], @y[4]
+ pxor @t[5], @t[7] # clobber t[7] even more
+
+ pxor @t[7], @y[5]
+ pshufd \$0x93, @t[4], @t[4]
+ pxor @t[7], @y[6]
+ pxor @t[7], @y[4]
+
+ pxor @t[5], @t[7]
+ pshufd \$0x93, @t[5], @t[5]
+ pxor @t[6], @t[7] # restore t[7]
+
+ # multiplication by 0x0d
+ pxor @y[7], @y[4]
+ pxor @t[4], @y[7]
+ pshufd \$0x93, @t[6], @t[6]
+ pxor @t[0], @y[2]
+ pxor @t[5], @y[7]
+ pxor @t[2], @y[2]
+ pshufd \$0x93, @t[7], @t[7]
+
+ pxor @y[1], @y[3]
+ pxor @t[1], @y[1]
+ pxor @t[0], @y[0]
+ pxor @t[0], @y[3]
+ pxor @t[5], @y[1]
+ pxor @t[5], @y[0]
+ pxor @t[7], @y[1]
+ pshufd \$0x93, @t[0], @t[0]
+ pxor @t[6], @y[0]
+ pxor @y[1], @y[3]
+ pxor @t[1], @y[4]
+ pshufd \$0x93, @t[1], @t[1]
+
+ pxor @t[7], @y[7]
+ pxor @t[2], @y[4]
+ pxor @t[2], @y[5]
+ pshufd \$0x93, @t[2], @t[2]
+ pxor @t[6], @y[2]
+ pxor @t[3], @t[6] # clobber t[6]
+ pxor @y[7], @y[4]
+ pxor @t[6], @y[3]
+
+ pxor @t[6], @y[6]
+ pxor @t[5], @y[5]
+ pxor @t[4], @y[6]
+ pshufd \$0x93, @t[4], @t[4]
+ pxor @t[6], @y[5]
+ pxor @t[7], @y[6]
+ pxor @t[3], @t[6] # restore t[6]
+
+ pshufd \$0x93, @t[5], @t[5]
+ pshufd \$0x93, @t[6], @t[6]
+ pshufd \$0x93, @t[7], @t[7]
+ pshufd \$0x93, @t[3], @t[3]
+
+ # multiplication by 0x09
+ pxor @y[1], @y[4]
+ pxor @y[1], @t[1] # t[1]=y[1]
+ pxor @t[5], @t[0] # clobber t[0]
+ pxor @t[5], @t[1]
+ pxor @t[0], @y[3]
+ pxor @y[0], @t[0] # t[0]=y[0]
+ pxor @t[6], @t[1]
+ pxor @t[7], @t[6] # clobber t[6]
+ pxor @t[1], @y[4]
+ pxor @t[4], @y[7]
+ pxor @y[4], @t[4] # t[4]=y[4]
+ pxor @t[3], @y[6]
+ pxor @y[3], @t[3] # t[3]=y[3]
+ pxor @t[2], @y[5]
+ pxor @y[2], @t[2] # t[2]=y[2]
+ pxor @t[7], @t[3]
+ pxor @y[5], @t[5] # t[5]=y[5]
+ pxor @t[6], @t[2]
+ pxor @t[6], @t[5]
+ pxor @y[6], @t[6] # t[6]=y[6]
+ pxor @y[7], @t[7] # t[7]=y[7]
+
+ movdqa @t[0],@XMM[0]
+ movdqa @t[1],@XMM[1]
+ movdqa @t[2],@XMM[2]
+ movdqa @t[3],@XMM[3]
+ movdqa @t[4],@XMM[4]
+ movdqa @t[5],@XMM[5]
+ movdqa @t[6],@XMM[6]
+ movdqa @t[7],@XMM[7]
+___
+}
+
+sub aesenc { # not used
+# One full round: load the .LSR mask, then ShiftRows, Sbox and MixColumns.
+# Arguments: eight state registers followed by eight scratch registers.
+my @state=@_[0..7];
+my @tmp=@_[8..15];
+$code.=<<___;
+ movdqa 0x30($const),@tmp[0] # .LSR
+___
+ ShiftRows(@state,@tmp[0]);
+ Sbox(@state,@tmp);
+ MixColumns(@state[0,1,4,6,3,7,2,5],@tmp);
+}
+
+sub aesenclast { # not used
+# Final round: load the .LSRM0 mask, ShiftRows, Sbox, then xor the eight
+# 16-byte slots at $key into the state in Sbox output order.
+# Arguments: eight state registers followed by eight scratch registers.
+my @state=@_[0..7];
+my @tmp=@_[8..15];
+$code.=<<___;
+ movdqa 0x40($const),@tmp[0] # .LSRM0
+___
+ ShiftRows(@state,@tmp[0]);
+ Sbox(@state,@tmp);
+$code.=<<___;
+ pxor 0x00($key),@state[0]
+ pxor 0x10($key),@state[1]
+ pxor 0x20($key),@state[4]
+ pxor 0x30($key),@state[6]
+ pxor 0x40($key),@state[3]
+ pxor 0x50($key),@state[7]
+ pxor 0x60($key),@state[2]
+ pxor 0x70($key),@state[5]
+___
+}
+
+sub swapmove {
+# Emits: delta = ((hi >> bits) ^ lo) & sel; lo ^= delta; hi ^= delta << bits
+# Arguments: lo, hi, shift count, mask register, temporary register.
+my ($lo,$hi,$bits,$sel,$save)=@_;
+$code.=<<___;
+ movdqa $hi,$save
+ psrlq \$$bits,$hi
+ pxor $lo,$hi
+ pand $sel,$hi
+ pxor $hi,$lo
+ psllq \$$bits,$hi
+ pxor $save,$hi
+___
+}
+sub swapmove2x {
+# Two swapmove sequences, (lo0,hi0) and (lo1,hi1), interleaved; both use
+# the same shift count and mask and each has its own temporary register.
+my ($lo0,$hi0,$lo1,$hi1,$bits,$sel,$save0,$save1)=@_;
+$code.=<<___;
+ movdqa $hi0,$save0
+ psrlq \$$bits,$hi0
+ movdqa $hi1,$save1
+ psrlq \$$bits,$hi1
+ pxor $lo0,$hi0
+ pxor $lo1,$hi1
+ pand $sel,$hi0
+ pand $sel,$hi1
+ pxor $hi0,$lo0
+ psllq \$$bits,$hi0
+ pxor $hi1,$lo1
+ psllq \$$bits,$hi1
+ pxor $save0,$hi0
+ pxor $save1,$hi1
+___
+}
+
+sub bitslice {
+# Arguments: eight data registers followed by four scratch registers.
+# The data registers are handled in reversed order.  The first two scratch
+# registers receive the .LBS0/.LBS1 masks (the first is later reloaded with
+# .LBS2); the last two are swapmove2x temporaries.  Shift counts 1, 2 and 4
+# are applied in turn.
+my @r=@_[reverse 0..7];
+my ($maskA,$maskB,$tmpA,$tmpB)=@_[8..11];
+$code.=<<___;
+ movdqa 0x00($const),$maskA # .LBS0
+ movdqa 0x10($const),$maskB # .LBS1
+___
+ swapmove2x(@r[0,1,2,3],1,$maskA,$tmpA,$tmpB);
+ swapmove2x(@r[4,5,6,7],1,$maskA,$tmpA,$tmpB);
+$code.=<<___;
+ movdqa 0x20($const),$maskA # .LBS2
+___
+ swapmove2x(@r[0,2,1,3],2,$maskB,$tmpA,$tmpB);
+ swapmove2x(@r[4,6,5,7],2,$maskB,$tmpA,$tmpB);
+
+ swapmove2x(@r[0,4,1,5],4,$maskA,$tmpA,$tmpB);
+ swapmove2x(@r[2,6,3,7],4,$maskA,$tmpA,$tmpB);
+}
+
+$code.=<<___; # start of _bsaes_encrypt8: xor the round 0 key into @XMM[0..7] and shuffle each with .LM0SR
+.text
+
+.extern asm_AES_encrypt
+.extern asm_AES_decrypt
+
+.type _bsaes_encrypt8,\@abi-omnipotent
+.align 64
+_bsaes_encrypt8:
+ lea .LBS0(%rip), $const # constants table
+
+ movdqa ($key), @XMM[9] # round 0 key
+ lea 0x10($key), $key
+ movdqa 0x60($const), @XMM[8] # .LM0SR
+ pxor @XMM[9], @XMM[0] # xor with round0 key
+ pxor @XMM[9], @XMM[1]
+ pshufb @XMM[8], @XMM[0]
+ pxor @XMM[9], @XMM[2]
+ pshufb @XMM[8], @XMM[1]
+ pxor @XMM[9], @XMM[3]
+ pshufb @XMM[8], @XMM[2]
+ pxor @XMM[9], @XMM[4]
+ pshufb @XMM[8], @XMM[3]
+ pxor @XMM[9], @XMM[5]
+ pshufb @XMM[8], @XMM[4]
+ pxor @XMM[9], @XMM[6]
+ pshufb @XMM[8], @XMM[5]
+ pxor @XMM[9], @XMM[7]
+ pshufb @XMM[8], @XMM[6]
+ pshufb @XMM[8], @XMM[7]
+_bsaes_encrypt8_bitslice:
+___
+ &bitslice (@XMM[0..7, 8..11]); # @XMM[8..11] are the scratch registers
+$code.=<<___; # the first pass enters at .Lenc_sbox, skipping the ShiftRows at the loop head
+ dec $rounds
+ jmp .Lenc_sbox
+.align 16
+.Lenc_loop:
+___
+ &ShiftRows (@XMM[0..7, 8]); # @XMM[8] holds the shuffle mask loaded at the bottom of the loop
+$code.=".Lenc_sbox:\n"; # label between ShiftRows and Sbox
+ &Sbox (@XMM[0..7, 8..15]);
+$code.=<<___; # leave the loop after the Sbox once the round counter goes negative
+ dec $rounds
+ jl .Lenc_done
+___
+ &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); # registers passed in Sbox output order
+$code.=<<___; # choose the mask for the next ShiftRows: .LSR while the counter is non-zero, .LSRM0 once it is zero
+ movdqa 0x30($const), @XMM[8] # .LSR
+ jnz .Lenc_loop
+ movdqa 0x40($const), @XMM[8] # .LSRM0
+ jmp .Lenc_loop
+.align 16
+.Lenc_done:
+___
+ # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
+ &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
+$code.=<<___; # xor the last round key and return; the same heredoc then opens _bsaes_decrypt8
+ movdqa ($key), @XMM[8] # last round key
+ pxor @XMM[8], @XMM[4]
+ pxor @XMM[8], @XMM[6]
+ pxor @XMM[8], @XMM[3]
+ pxor @XMM[8], @XMM[7]
+ pxor @XMM[8], @XMM[2]
+ pxor @XMM[8], @XMM[5]
+ pxor @XMM[8], @XMM[0]
+ pxor @XMM[8], @XMM[1]
+ ret
+.size _bsaes_encrypt8,.-_bsaes_encrypt8
+
+.type _bsaes_decrypt8,\@abi-omnipotent
+.align 64
+_bsaes_decrypt8:
+ lea .LBS0(%rip), $const # constants table
+
+ movdqa ($key), @XMM[9] # round 0 key
+ lea 0x10($key), $key
+ movdqa -0x30($const), @XMM[8] # .LM0ISR
+ pxor @XMM[9], @XMM[0] # xor with round0 key
+ pxor @XMM[9], @XMM[1]
+ pshufb @XMM[8], @XMM[0]
+ pxor @XMM[9], @XMM[2]
+ pshufb @XMM[8], @XMM[1]
+ pxor @XMM[9], @XMM[3]
+ pshufb @XMM[8], @XMM[2]
+ pxor @XMM[9], @XMM[4]
+ pshufb @XMM[8], @XMM[3]
+ pxor @XMM[9], @XMM[5]
+ pshufb @XMM[8], @XMM[4]
+ pxor @XMM[9], @XMM[6]
+ pshufb @XMM[8], @XMM[5]
+ pxor @XMM[9], @XMM[7]
+ pshufb @XMM[8], @XMM[6]
+ pshufb @XMM[8], @XMM[7]
+___
+ &bitslice (@XMM[0..7, 8..11]); # @XMM[8..11] are the scratch registers
+$code.=<<___; # the first pass enters at .Ldec_sbox, skipping the ShiftRows at the loop head
+ dec $rounds
+ jmp .Ldec_sbox
+.align 16
+.Ldec_loop:
+___
+ &ShiftRows (@XMM[0..7, 8]); # same helper as for encryption; the mask in @XMM[8] differs
+$code.=".Ldec_sbox:\n"; # label between ShiftRows and InvSbox
+ &InvSbox (@XMM[0..7, 8..15]);
+$code.=<<___; # leave the loop after the InvSbox once the round counter goes negative
+ dec $rounds
+ jl .Ldec_done
+___
+ &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]); # registers passed in InvSbox output order
+$code.=<<___; # choose the mask for the next ShiftRows: .LISR while the counter is non-zero, .LISRM0 once it is zero
+ movdqa -0x10($const), @XMM[8] # .LISR
+ jnz .Ldec_loop
+ movdqa -0x20($const), @XMM[8] # .LISRM0
+ jmp .Ldec_loop
+.align 16
+.Ldec_done:
+___
+ &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
+$code.=<<___; # xor the last round key into all eight registers and return
+ movdqa ($key), @XMM[8] # last round key
+ pxor @XMM[8], @XMM[6]
+ pxor @XMM[8], @XMM[4]
+ pxor @XMM[8], @XMM[2]
+ pxor @XMM[8], @XMM[7]
+ pxor @XMM[8], @XMM[3]
+ pxor @XMM[8], @XMM[5]
+ pxor @XMM[8], @XMM[0]
+ pxor @XMM[8], @XMM[1]
+ ret
+.size _bsaes_decrypt8,.-_bsaes_decrypt8
+___
+}
+{
+my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11"); # registers of _bsaes_key_convert: output schedule pointer, input key pointer, round counter, constants pointer
+
+sub bitslice_key { # reduced variant of bitslice: some swapmove steps are replaced by plain register copies
+my @x=reverse(@_[0..7]); # eight data registers, handled in reversed order
+my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; # three mask registers (the caller loads .LBS0/.LBS1/.LBS2 into them) and two temporaries
+
+ &swapmove (@x[0,1],1,$bs0,$t2,$t3);
+$code.=<<___; # note: the "#&swapmove" line below is inside the heredoc, so it is interpolated and emitted as an assembler comment
+ #&swapmove(@x[2,3],1,$t0,$t2,$t3);
+ movdqa @x[0], @x[2]
+ movdqa @x[1], @x[3]
+___
+ #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+
+ &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
+$code.=<<___; # as above, the "#&swapmove2x" line is part of the emitted text
+ #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+ movdqa @x[0], @x[4]
+ movdqa @x[2], @x[6]
+ movdqa @x[1], @x[5]
+ movdqa @x[3], @x[7]
+___
+ &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
+ &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
+}
+
+$code.=<<___; # _bsaes_key_convert: copy the round 0 key as is, then enter a loop that converts one round key per iteration
+.type _bsaes_key_convert,\@abi-omnipotent
+.align 16
+_bsaes_key_convert:
+ lea .LBS1(%rip), $const
+ movdqu ($inp), %xmm7 # load round 0 key
+ movdqa -0x10($const), %xmm8 # .LBS0
+ movdqa 0x00($const), %xmm9 # .LBS1
+ movdqa 0x10($const), %xmm10 # .LBS2
+ movdqa 0x40($const), %xmm13 # .LM0
+ movdqa 0x60($const), %xmm14 # .LNOT
+
+ movdqu 0x10($inp), %xmm6 # load round 1 key
+ lea 0x10($inp), $inp
+ movdqa %xmm7, ($out) # save round 0 key
+ lea 0x10($out), $out
+ dec $rounds
+ jmp .Lkey_loop
+.align 16
+.Lkey_loop:
+ pshufb %xmm13, %xmm6 # .LM0
+ movdqa %xmm6, %xmm7
+___
+ &bitslice_key (map("%xmm$_",(0..7, 8..12))); # data in %xmm0-7, masks in %xmm8-10, temporaries %xmm11-12
+$code.=<<___; # store the 0x80-byte bit-sliced round key and load the next one; the last round key is returned in %xmm6, not stored
+ pxor %xmm14, %xmm5 # "pnot"
+ pxor %xmm14, %xmm6
+ pxor %xmm14, %xmm0
+ pxor %xmm14, %xmm1
+ lea 0x10($inp), $inp
+ movdqa %xmm0, 0x00($out) # write bit-sliced round key
+ movdqa %xmm1, 0x10($out)
+ movdqa %xmm2, 0x20($out)
+ movdqa %xmm3, 0x30($out)
+ movdqa %xmm4, 0x40($out)
+ movdqa %xmm5, 0x50($out)
+ movdqa %xmm6, 0x60($out)
+ movdqa %xmm7, 0x70($out)
+ lea 0x80($out),$out
+ movdqu ($inp), %xmm6 # load next round key
+ dec $rounds
+ jnz .Lkey_loop
+
+ movdqa 0x70($const), %xmm7 # .L63
+ #movdqa %xmm6, ($out) # don't save last round key
+ ret
+.size _bsaes_key_convert,.-_bsaes_key_convert
+___
+}
+
+if (0 && !$win64) { # never emitted (constant 0): the following four functions are an unsupported interface
+ # used for benchmarking; both block functions fix the round count at 10
+$code.=<<___;
+.globl bsaes_enc_key_convert
+.type bsaes_enc_key_convert,\@function,2
+.align 16
+bsaes_enc_key_convert:
+ mov 240($inp),%r10d # pass rounds
+ mov $inp,%rcx # pass key
+ mov $out,%rax # pass key schedule
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7 # fix up last round key
+ movdqa %xmm7,(%rax) # save last round key
+ ret
+.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
+
+.globl bsaes_encrypt_128
+.type bsaes_encrypt_128,\@function,4
+.align 16
+bsaes_encrypt_128:
+.Lenc128_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ movdqu 0x60($inp), @XMM[6]
+ movdqu 0x70($inp), @XMM[7]
+ mov $key, %rax # pass the $key
+ lea 0x80($inp), $inp
+ mov \$10,%r10d
+
+ call _bsaes_encrypt8
+
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[2], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$0x80,$len
+ ja .Lenc128_loop
+ ret
+.size bsaes_encrypt_128,.-bsaes_encrypt_128
+
+.globl bsaes_dec_key_convert
+.type bsaes_dec_key_convert,\@function,2
+.align 16
+bsaes_dec_key_convert:
+ mov 240($inp),%r10d # pass rounds
+ mov $inp,%rcx # pass key
+ mov $out,%rax # pass key schedule
+ call _bsaes_key_convert
+ pxor ($out),%xmm7 # fix up round 0 key
+ movdqa %xmm6,(%rax) # save last round key
+ movdqa %xmm7,($out)
+ ret
+.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
+
+.globl bsaes_decrypt_128
+.type bsaes_decrypt_128,\@function,4
+.align 16
+bsaes_decrypt_128:
+.Ldec128_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ movdqu 0x60($inp), @XMM[6]
+ movdqu 0x70($inp), @XMM[7]
+ mov $key, %rax # pass the $key
+ lea 0x80($inp), $inp
+ mov \$10,%r10d
+
+ call _bsaes_decrypt8
+
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$0x80,$len
+ ja .Ldec128_loop
+ ret
+.size bsaes_decrypt_128,.-bsaes_decrypt_128
+___
+}
+{
+######################################################################
+#
+# OpenSSL interface
+#
+my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d") # argument registers when $win64 is set
+ : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); # argument registers otherwise
+my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15"); # registers the interface routines copy ("backup") their first four arguments into
+
+if ($ecb) {
+$code.=<<___; # bsaes_ecb_encrypt_blocks prologue: push six general-purpose registers and reserve 0x48 bytes of stack
+.globl bsaes_ecb_encrypt_blocks
+.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
+.align 16
+bsaes_ecb_encrypt_blocks:
+ mov %rsp, %rax
+.Lecb_enc_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp),%rsp
+___
+$code.=<<___ if ($win64); # Win64 only: reserve 0xa0 more bytes and save %xmm6-%xmm15 there
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lecb_enc_body:
+___
+# Main body of bsaes_ecb_encrypt_blocks.  Inputs of 8 or more blocks get a
+# bit-sliced key schedule built on the stack and are processed 8 blocks at
+# a time by _bsaes_encrypt8, with a 1..7 block tail; shorter inputs go
+# block by block through asm_AES_encrypt.  On exit the area between %rsp
+# and the frame pointer kept in %rbp is zeroed to wipe the key schedule.
+# The wipe loop must continue while %rax is still below %rbp, hence "ja"
+# after "cmp %rax,%rbp" (flags from %rbp-%rax); "jb" would stop after the
+# first 32 bytes when a schedule exists and never stop on the short path.
+# NOTE(review): a block count of zero is not handled on the short path
+# ("dec" wraps before "jnz") -- callers presumably never pass 0; confirm.
+$code.=<<___;
+ mov %rsp,%rbp # backup %rsp
+ mov 240($arg4),%eax # rounds
+ mov $arg1,$inp # backup arguments
+ mov $arg2,$out
+ mov $arg3,$len
+ mov $arg4,$key
+ cmp \$8,$arg3
+ jb .Lecb_enc_short
+
+ mov %eax,%ebx # backup rounds
+ shl \$7,%rax # 128 bytes per inner round key
+ sub \$`128-32`,%rax # size of bit-sliced key schedule
+ sub %rax,%rsp
+ mov %rsp,%rax # pass key schedule
+ mov $key,%rcx # pass key
+ mov %ebx,%r10d # pass rounds
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7 # fix up last round key
+ movdqa %xmm7,(%rax) # save last round key
+
+ sub \$8,$len
+.Lecb_enc_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ mov %rsp, %rax # pass key schedule
+ movdqu 0x60($inp), @XMM[6]
+ mov %ebx,%r10d # pass rounds
+ movdqu 0x70($inp), @XMM[7]
+ lea 0x80($inp), $inp
+
+ call _bsaes_encrypt8
+
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[2], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$8,$len
+ jnc .Lecb_enc_loop
+
+ add \$8,$len
+ jz .Lecb_enc_done
+
+ movdqu 0x00($inp), @XMM[0] # load input
+ mov %rsp, %rax # pass key schedule
+ mov %ebx,%r10d # pass rounds
+ cmp \$2,$len
+ jb .Lecb_enc_one
+ movdqu 0x10($inp), @XMM[1]
+ je .Lecb_enc_two
+ movdqu 0x20($inp), @XMM[2]
+ cmp \$4,$len
+ jb .Lecb_enc_three
+ movdqu 0x30($inp), @XMM[3]
+ je .Lecb_enc_four
+ movdqu 0x40($inp), @XMM[4]
+ cmp \$6,$len
+ jb .Lecb_enc_five
+ movdqu 0x50($inp), @XMM[5]
+ je .Lecb_enc_six
+ movdqu 0x60($inp), @XMM[6]
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[2], 0x60($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_six:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_five:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_four:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_three:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_two:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_one:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_short:
+ lea ($inp), $arg1
+ lea ($out), $arg2
+ lea ($key), $arg3
+ call asm_AES_encrypt
+ lea 16($inp), $inp
+ lea 16($out), $out
+ dec $len
+ jnz .Lecb_enc_short
+
+.Lecb_enc_done:
+ lea (%rsp),%rax
+ pxor %xmm0, %xmm0
+.Lecb_enc_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ ja .Lecb_enc_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64); # Win64 only: reload %xmm6-%xmm15 and step %rsp past the 0xa0-byte save area
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___; # reload the six registers pushed by the prologue and return; the same heredoc then opens bsaes_ecb_decrypt_blocks with an identical prologue
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lecb_enc_epilogue:
+ ret
+.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
+
+.globl bsaes_ecb_decrypt_blocks
+.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
+.align 16
+bsaes_ecb_decrypt_blocks:
+ mov %rsp, %rax
+.Lecb_dec_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp),%rsp
+___
+$code.=<<___ if ($win64); # Win64 only: reserve 0xa0 more bytes and save %xmm6-%xmm15 for the decrypt routine
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lecb_dec_body:
+___
+$code.=<<___;
+ mov %rsp,%rbp # backup %rsp
+ mov 240($arg4),%eax # rounds
+ mov $arg1,$inp # backup arguments
+ mov $arg2,$out
+ mov $arg3,$len
+ mov $arg4,$key
+ cmp \$8,$arg3
+ jb .Lecb_dec_short
+
+ mov %eax,%ebx # backup rounds
+ shl \$7,%rax # 128 bytes per inner round key
+ sub \$`128-32`,%rax # size of bit-sliced key schedule
+ sub %rax,%rsp
+ mov %rsp,%rax # pass key schedule
+ mov $key,%rcx # pass key
+ mov %ebx,%r10d # pass rounds
+ call _bsaes_key_convert
+ pxor (%rsp),%xmm7 # fix up 0 round key
+ movdqa %xmm6,(%rax) # save last round key
+ movdqa %xmm7,(%rsp)
+
+ sub \$8,$len
+.Lecb_dec_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ mov %rsp, %rax # pass key schedule
+ movdqu 0x60($inp), @XMM[6]
+ mov %ebx,%r10d # pass rounds
+ movdqu 0x70($inp), @XMM[7]
+ lea 0x80($inp), $inp
+
+ call _bsaes_decrypt8
+
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$8,$len
+ jnc .Lecb_dec_loop
+
+ add \$8,$len
+ jz .Lecb_dec_done
+
+ movdqu 0x00($inp), @XMM[0] # load input
+ mov %rsp, %rax # pass key schedule
+ mov %ebx,%r10d # pass rounds
+ cmp \$2,$len
+ jb .Lecb_dec_one
+ movdqu 0x10($inp), @XMM[1]
+ je .Lecb_dec_two
+ movdqu 0x20($inp), @XMM[2]
+ cmp \$4,$len
+ jb .Lecb_dec_three
+ movdqu 0x30($inp), @XMM[3]
+ je .Lecb_dec_four
+ movdqu 0x40($inp), @XMM[4]
+ cmp \$6,$len
+ jb .Lecb_dec_five
+ movdqu 0x50($inp), @XMM[5]
+ je .Lecb_dec_six
+ movdqu 0x60($inp), @XMM[6]
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_six:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_five:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_four:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_three:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_two:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_one:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_short:
+ lea ($inp), $arg1
+ lea ($out), $arg2
+ lea ($key), $arg3
+ call asm_AES_decrypt
+ lea 16($inp), $inp
+ lea 16($out), $out
+ dec $len
+ jnz .Lecb_dec_short
+
+.Lecb_dec_done:
+ lea (%rsp),%rax
+ pxor %xmm0, %xmm0
+.Lecb_dec_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ jb .Lecb_dec_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lecb_dec_epilogue:
+ ret
+.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
+___
+}
+######################################################################
+# bsaes_cbc_encrypt: CBC entry point taking six arguments, used below as
+# input pointer, output pointer, length in bytes, AES key, IV pointer
+# and direction flag.
+#
+# Only bulk decryption is implemented here: a non-zero direction flag or
+# a length below 128 bytes tail-jumps to asm_AES_cbc_encrypt with the
+# arguments untouched. Otherwise the key is converted to a bit-sliced
+# schedule on the stack, blocks are decrypted 8 at a time with
+# _bsaes_decrypt8 (a single leftover block goes through asm_AES_decrypt),
+# the last ciphertext block is written back through the IV pointer, and
+# the stack area is zeroed before returning.
+#
+$code.=<<___;
+.extern asm_AES_cbc_encrypt
+.globl bsaes_cbc_encrypt
+.type bsaes_cbc_encrypt,\@abi-omnipotent
+.align 16
+bsaes_cbc_encrypt:
+___
+$code.=<<___ if ($win64);
+ mov 48(%rsp),$arg6 # pull direction flag
+___
+$code.=<<___;
+ cmp \$0,$arg6
+ jne asm_AES_cbc_encrypt
+ cmp \$128,$arg3
+ jb asm_AES_cbc_encrypt
+
+ mov %rsp, %rax
+.Lcbc_dec_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+ mov 0xa0(%rsp),$arg5 # pull ivp
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lcbc_dec_body:
+___
+$code.=<<___;
+ mov %rsp, %rbp # backup %rsp
+ mov 240($arg4), %eax # rounds
+ mov $arg1, $inp # backup arguments
+ mov $arg2, $out
+ mov $arg3, $len
+ mov $arg4, $key
+ mov $arg5, %rbx
+ shr \$4, $len # bytes to blocks
+
+ # The round count is kept in the edx register for the whole routine.
+ mov %eax, %edx # rounds
+ shl \$7, %rax # 128 bytes per inner round key
+ sub \$`128-32`, %rax # size of bit-sliced key schedule
+ sub %rax, %rsp
+
+ mov %rsp, %rax # pass key schedule
+ mov $key, %rcx # pass key
+ mov %edx, %r10d # pass rounds
+ call _bsaes_key_convert
+ pxor (%rsp),%xmm7 # fix up 0 round key
+ movdqa %xmm6,(%rax) # save last round key
+ movdqa %xmm7,(%rsp)
+
+ movdqu (%rbx), @XMM[15] # load IV
+ sub \$8,$len
+.Lcbc_dec_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ mov %rsp, %rax # pass key schedule
+ movdqu 0x60($inp), @XMM[6]
+ mov %edx,%r10d # pass rounds
+ movdqu 0x70($inp), @XMM[7]
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+
+ call _bsaes_decrypt8
+
+ # Chain: the first block is xored with the saved IV, each later block
+ # with the preceding ciphertext block, which is re-read from the input.
+ # Results are taken from registers in the order 0,1,6,4,2,7,3,5.
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[10], @XMM[4]
+ movdqu 0x40($inp), @XMM[12]
+ pxor @XMM[11], @XMM[2]
+ movdqu 0x50($inp), @XMM[13]
+ pxor @XMM[12], @XMM[7]
+ movdqu 0x60($inp), @XMM[14]
+ pxor @XMM[13], @XMM[3]
+ movdqu 0x70($inp), @XMM[15] # IV
+ pxor @XMM[14], @XMM[5]
+ movdqu @XMM[0], 0x00($out) # write output
+ lea 0x80($inp), $inp
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$8,$len
+ jnc .Lcbc_dec_loop
+
+ add \$8,$len
+ jz .Lcbc_dec_done
+
+ # Tail of 1..7 blocks: load what remains and dispatch on the count.
+ # The flags of each cmp are reused by the je that follows the load.
+ movdqu 0x00($inp), @XMM[0] # load input
+ mov %rsp, %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+ cmp \$2,$len
+ jb .Lcbc_dec_one
+ movdqu 0x10($inp), @XMM[1]
+ je .Lcbc_dec_two
+ movdqu 0x20($inp), @XMM[2]
+ cmp \$4,$len
+ jb .Lcbc_dec_three
+ movdqu 0x30($inp), @XMM[3]
+ je .Lcbc_dec_four
+ movdqu 0x40($inp), @XMM[4]
+ cmp \$6,$len
+ jb .Lcbc_dec_five
+ movdqu 0x50($inp), @XMM[5]
+ je .Lcbc_dec_six
+ movdqu 0x60($inp), @XMM[6]
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[10], @XMM[4]
+ movdqu 0x40($inp), @XMM[12]
+ pxor @XMM[11], @XMM[2]
+ movdqu 0x50($inp), @XMM[13]
+ pxor @XMM[12], @XMM[7]
+ movdqu 0x60($inp), @XMM[15] # IV
+ pxor @XMM[13], @XMM[3]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_six:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[10], @XMM[4]
+ movdqu 0x40($inp), @XMM[12]
+ pxor @XMM[11], @XMM[2]
+ movdqu 0x50($inp), @XMM[15] # IV
+ pxor @XMM[12], @XMM[7]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_five:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[10], @XMM[4]
+ movdqu 0x40($inp), @XMM[15] # IV
+ pxor @XMM[11], @XMM[2]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_four:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[15] # IV
+ pxor @XMM[10], @XMM[4]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_three:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[15] # IV
+ pxor @XMM[9], @XMM[6]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_two:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[15] # IV
+ pxor @XMM[8], @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_one:
+ # Single leftover block: decrypt it into the frame buffer with the
+ # one-block routine, xor the IV into it, and keep the ciphertext block
+ # loaded before the dispatch as the IV to return.
+ lea ($inp), $arg1
+ lea 0x20(%rbp), $arg2 # buffer output
+ lea ($key), $arg3
+ call asm_AES_decrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[15] # ^= IV
+ movdqu @XMM[15], ($out) # write output
+ movdqa @XMM[0], @XMM[15] # IV
+
+.Lcbc_dec_done:
+ movdqu @XMM[15], (%rbx) # return IV
+ lea (%rsp), %rax
+ pxor %xmm0, %xmm0
+.Lcbc_dec_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ ja .Lcbc_dec_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lcbc_dec_epilogue:
+ ret
+.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+
+# bsaes_ctr32_encrypt_blocks: CTR mode over whole 16-byte blocks in which
+# only the last 32-bit word of the counter block is incremented.
+# The five arguments are used below as: input pointer, output pointer,
+# block count, AES key (round count at offset 240) and pointer to the
+# 16-byte counter block, a copy of which lives at offset 0x20 of the frame.
+# Counts below 8 are served one block at a time through asm_AES_encrypt;
+# otherwise 8 counter blocks per iteration are encrypted with the
+# bit-sliced code and xored into the input. The stack area is zeroed
+# before returning.
+# NOTE(review): a block count of zero is not special-cased here.
+.globl bsaes_ctr32_encrypt_blocks
+.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
+.align 16
+bsaes_ctr32_encrypt_blocks:
+ mov %rsp, %rax
+.Lctr_enc_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+ mov 0xa0(%rsp),$arg5 # pull ivp
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lctr_enc_body:
+___
+$code.=<<___;
+ mov %rsp, %rbp # backup %rsp
+ movdqu ($arg5), %xmm0 # load counter
+ mov 240($arg4), %eax # rounds
+ mov $arg1, $inp # backup arguments
+ mov $arg2, $out
+ mov $arg3, $len
+ mov $arg4, $key
+ movdqa %xmm0, 0x20(%rbp) # copy counter
+ cmp \$8, $arg3
+ jb .Lctr_enc_short
+
+ mov %eax, %ebx # rounds
+ shl \$7, %rax # 128 bytes per inner round key
+ sub \$`128-32`, %rax # size of bit-sliced key schedule
+ sub %rax, %rsp
+
+ mov %rsp, %rax # pass key schedule
+ mov $key, %rcx # pass key
+ mov %ebx, %r10d # pass rounds
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7 # fix up last round key
+ movdqa %xmm7,(%rax) # save last round key
+
+ movdqa (%rsp), @XMM[9] # load round0 key
+ lea .LADD1(%rip), %r11
+ movdqa 0x20(%rbp), @XMM[0] # counter copy
+ movdqa -0x20(%r11), @XMM[8] # .LSWPUP
+ pshufb @XMM[8], @XMM[9] # byte swap upper part
+ pshufb @XMM[8], @XMM[0]
+ movdqa @XMM[9], (%rsp) # save adjusted round0 key
+ jmp .Lctr_enc_loop
+.align 16
+.Lctr_enc_loop:
+ movdqa @XMM[0], 0x20(%rbp) # save counter
+ movdqa @XMM[0], @XMM[1] # prepare 8 counter values
+ movdqa @XMM[0], @XMM[2]
+ paddd 0x00(%r11), @XMM[1] # .LADD1
+ movdqa @XMM[0], @XMM[3]
+ paddd 0x10(%r11), @XMM[2] # .LADD2
+ movdqa @XMM[0], @XMM[4]
+ paddd 0x20(%r11), @XMM[3] # .LADD3
+ movdqa @XMM[0], @XMM[5]
+ paddd 0x30(%r11), @XMM[4] # .LADD4
+ movdqa @XMM[0], @XMM[6]
+ paddd 0x40(%r11), @XMM[5] # .LADD5
+ movdqa @XMM[0], @XMM[7]
+ paddd 0x50(%r11), @XMM[6] # .LADD6
+ paddd 0x60(%r11), @XMM[7] # .LADD7
+
+ # Borrow prologue from _bsaes_encrypt8 to use the opportunity
+ # to flip byte order in 32-bit counter
+ movdqa (%rsp), @XMM[9] # round 0 key
+ lea 0x10(%rsp), %rax # pass key schedule
+ movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
+ pxor @XMM[9], @XMM[0] # xor with round0 key
+ pxor @XMM[9], @XMM[1]
+ pshufb @XMM[8], @XMM[0]
+ pxor @XMM[9], @XMM[2]
+ pshufb @XMM[8], @XMM[1]
+ pxor @XMM[9], @XMM[3]
+ pshufb @XMM[8], @XMM[2]
+ pxor @XMM[9], @XMM[4]
+ pshufb @XMM[8], @XMM[3]
+ pxor @XMM[9], @XMM[5]
+ pshufb @XMM[8], @XMM[4]
+ pxor @XMM[9], @XMM[6]
+ pshufb @XMM[8], @XMM[5]
+ pxor @XMM[9], @XMM[7]
+ pshufb @XMM[8], @XMM[6]
+ lea .LBS0(%rip), %r11 # constants table
+ pshufb @XMM[8], @XMM[7]
+ mov %ebx,%r10d # pass rounds
+
+ call _bsaes_encrypt8_bitslice
+
+ # The zero flag of this sub is consumed by the jnz at the bottom of
+ # the loop; nothing in between modifies the flags.
+ sub \$8,$len
+ jc .Lctr_enc_loop_done
+
+ # Keystream blocks come back in registers 0,1,4,6,3,7,2,5.
+ movdqu 0x00($inp), @XMM[8] # load input
+ movdqu 0x10($inp), @XMM[9]
+ movdqu 0x20($inp), @XMM[10]
+ movdqu 0x30($inp), @XMM[11]
+ movdqu 0x40($inp), @XMM[12]
+ movdqu 0x50($inp), @XMM[13]
+ movdqu 0x60($inp), @XMM[14]
+ movdqu 0x70($inp), @XMM[15]
+ lea 0x80($inp),$inp
+ pxor @XMM[0], @XMM[8]
+ movdqa 0x20(%rbp), @XMM[0] # load counter
+ pxor @XMM[9], @XMM[1]
+ movdqu @XMM[8], 0x00($out) # write output
+ pxor @XMM[10], @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor @XMM[11], @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor @XMM[12], @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ pxor @XMM[13], @XMM[7]
+ movdqu @XMM[3], 0x40($out)
+ pxor @XMM[14], @XMM[2]
+ movdqu @XMM[7], 0x50($out)
+ pxor @XMM[15], @XMM[5]
+ movdqu @XMM[2], 0x60($out)
+ lea .LADD1(%rip), %r11
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ paddd 0x70(%r11), @XMM[0] # .LADD8
+ jnz .Lctr_enc_loop
+
+ jmp .Lctr_enc_done
+.align 16
+.Lctr_enc_loop_done:
+ # Reached when the sub above borrowed, so the counter of remaining
+ # blocks holds remainder minus 8. Restore the true 1..7 remainder
+ # first; without this the unsigned compares below never match and the
+ # tail would always process 7 blocks, overrunning both buffers.
+ add \$8, $len
+ movdqu 0x00($inp), @XMM[8] # load input
+ pxor @XMM[8], @XMM[0]
+ movdqu @XMM[0], 0x00($out) # write output
+ cmp \$2,$len
+ jb .Lctr_enc_done
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[9], @XMM[1]
+ movdqu @XMM[1], 0x10($out)
+ je .Lctr_enc_done
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[10], @XMM[4]
+ movdqu @XMM[4], 0x20($out)
+ cmp \$4,$len
+ jb .Lctr_enc_done
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[11], @XMM[6]
+ movdqu @XMM[6], 0x30($out)
+ je .Lctr_enc_done
+ movdqu 0x40($inp), @XMM[12]
+ pxor @XMM[12], @XMM[3]
+ movdqu @XMM[3], 0x40($out)
+ cmp \$6,$len
+ jb .Lctr_enc_done
+ movdqu 0x50($inp), @XMM[13]
+ pxor @XMM[13], @XMM[7]
+ movdqu @XMM[7], 0x50($out)
+ je .Lctr_enc_done
+ movdqu 0x60($inp), @XMM[14]
+ pxor @XMM[14], @XMM[2]
+ movdqu @XMM[2], 0x60($out)
+ jmp .Lctr_enc_done
+
+.align 16
+.Lctr_enc_short:
+ # Fewer than 8 blocks in total: encrypt the counter copy at 0x20 of the
+ # frame into 0x30 of the frame, xor it with one input block, then bump
+ # the last counter word (stored big-endian, hence the two bswaps).
+ # The stack pointer still equals the frame pointer on this path.
+ lea 0x20(%rbp), $arg1
+ lea 0x30(%rbp), $arg2
+ lea ($key), $arg3
+ call asm_AES_encrypt
+ movdqu ($inp), @XMM[1]
+ lea 16($inp), $inp
+ mov 0x2c(%rbp), %eax # load 32-bit counter
+ bswap %eax
+ pxor 0x30(%rbp), @XMM[1]
+ inc %eax # increment
+ movdqu @XMM[1], ($out)
+ bswap %eax
+ lea 16($out), $out
+ mov %eax, 0x2c(%rbp) # save 32-bit counter
+ dec $len
+ jnz .Lctr_enc_short
+
+.Lctr_enc_done:
+ lea (%rsp), %rax
+ pxor %xmm0, %xmm0
+.Lctr_enc_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ ja .Lctr_enc_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lctr_enc_epilogue:
+ ret
+.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+___
+######################################################################
+# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
+# Outline of the encrypt routine below:
+# - the initial tweak is produced by running asm_AES_encrypt over iv
+#   with key2, into the 16-byte buffer at 0x20 of the frame;
+# - key1 is converted to a bit-sliced schedule on the stack, and a
+#   further 0x80 bytes below it hold the tweaks of the current batch;
+# - len is rounded down to whole blocks; full batches of 8 blocks go
+#   through _bsaes_encrypt8, a tail of 2..7 blocks reuses it with fewer
+#   inputs, and a single block goes through asm_AES_encrypt;
+# - if len is not a multiple of 16, the trailing bytes are handled by
+#   swapping them with the head of the last full output block and
+#   re-encrypting that block with the next tweak;
+# - the stack area is zeroed before returning.
+#
+# The three registers named here are scratch for advancing the tweak.
+# NOTE(review): .Lxts_magic is defined outside this section; the
+# paddq/pcmpgtd/pshufd/pand/pxor sequence presumably doubles the tweak
+# with carry reduction - confirm against the constant's definition.
+#
+my ($twmask,$twres,$twtmp)=@XMM[13..15];
+$code.=<<___;
+.globl bsaes_xts_encrypt
+.type bsaes_xts_encrypt,\@abi-omnipotent
+.align 16
+bsaes_xts_encrypt:
+ mov %rsp, %rax
+.Lxts_enc_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+ mov 0xa0(%rsp),$arg5 # pull key2
+ mov 0xa8(%rsp),$arg6 # pull ivp
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lxts_enc_body:
+___
+$code.=<<___;
+ mov %rsp, %rbp # backup %rsp
+ mov $arg1, $inp # backup arguments
+ mov $arg2, $out
+ mov $arg3, $len
+ mov $arg4, $key
+
+ lea ($arg6), $arg1
+ lea 0x20(%rbp), $arg2
+ lea ($arg5), $arg3
+ call asm_AES_encrypt # generate initial tweak
+
+ mov 240($key), %eax # rounds
+ mov $len, %rbx # backup $len
+
+ mov %eax, %edx # rounds
+ shl \$7, %rax # 128 bytes per inner round key
+ sub \$`128-32`, %rax # size of bit-sliced key schedule
+ sub %rax, %rsp
+
+ mov %rsp, %rax # pass key schedule
+ mov $key, %rcx # pass key
+ mov %edx, %r10d # pass rounds
+ call _bsaes_key_convert
+ pxor %xmm6, %xmm7 # fix up last round key
+ movdqa %xmm7, (%rax) # save last round key
+
+ and \$-16, $len
+ sub \$0x80, %rsp # place for tweak[8]
+ movdqa 0x20(%rbp), @XMM[7] # initial tweak
+
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+
+ sub \$0x80, $len
+ jc .Lxts_enc_short
+ jmp .Lxts_enc_loop
+
+.align 16
+.Lxts_enc_loop:
+___
+ # Emit seven tweak steps: each saves the current tweak both in a data
+ # register and in the stack slot, advances the tweak, and interleaves
+ # the input loads and input^tweak xors of the preceding blocks.
+ for ($i=0;$i<7;$i++) {
+ $code.=<<___;
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ movdqa @XMM[7], @XMM[$i]
+ movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+___
+ $code.=<<___ if ($i>=1);
+ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
+___
+ $code.=<<___ if ($i>=2);
+ pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+ }
+$code.=<<___;
+ movdqu 0x60($inp), @XMM[8+6]
+ pxor @XMM[8+5], @XMM[5]
+ movdqu 0x70($inp), @XMM[8+7]
+ lea 0x80($inp), $inp
+ movdqa @XMM[7], 0x70(%rsp)
+ pxor @XMM[8+6], @XMM[6]
+ lea 0x80(%rsp), %rax # pass key schedule
+ pxor @XMM[8+7], @XMM[7]
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ # Results are taken from registers in the order 0,1,4,6,3,7,2,5 and
+ # xored with the tweaks saved on the stack.
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor 0x40(%rsp), @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[3], 0x40($out)
+ pxor 0x60(%rsp), @XMM[2]
+ movdqu @XMM[7], 0x50($out)
+ pxor 0x70(%rsp), @XMM[5]
+ movdqu @XMM[2], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+
+ movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+
+ sub \$0x80,$len
+ jnc .Lxts_enc_loop
+
+.Lxts_enc_short:
+ add \$0x80, $len
+ jz .Lxts_enc_done
+___
+ # Same tweak schedule as the main loop, but after each input load the
+ # remaining byte count is compared so that control leaves for the
+ # handler of the exact number of blocks left (1 through 6); falling
+ # through means 7 blocks remain.
+ for ($i=0;$i<7;$i++) {
+ $code.=<<___;
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ movdqa @XMM[7], @XMM[$i]
+ movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+___
+ $code.=<<___ if ($i>=1);
+ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
+ cmp \$`0x10*$i`,$len
+ je .Lxts_enc_$i
+___
+ $code.=<<___ if ($i>=2);
+ pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+ }
+$code.=<<___;
+ movdqu 0x60($inp), @XMM[8+6]
+ pxor @XMM[8+5], @XMM[5]
+ movdqa @XMM[7], 0x70(%rsp)
+ lea 0x70($inp), $inp
+ pxor @XMM[8+6], @XMM[6]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor 0x40(%rsp), @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[3], 0x40($out)
+ pxor 0x60(%rsp), @XMM[2]
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[2], 0x60($out)
+ lea 0x70($out), $out
+
+ movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_6:
+ pxor @XMM[8+4], @XMM[4]
+ lea 0x60($inp), $inp
+ pxor @XMM[8+5], @XMM[5]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor 0x40(%rsp), @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ lea 0x60($out), $out
+
+ movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_5:
+ pxor @XMM[8+3], @XMM[3]
+ lea 0x50($inp), $inp
+ pxor @XMM[8+4], @XMM[4]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor 0x40(%rsp), @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ lea 0x50($out), $out
+
+ movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_4:
+ pxor @XMM[8+2], @XMM[2]
+ lea 0x40($inp), $inp
+ pxor @XMM[8+3], @XMM[3]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ lea 0x40($out), $out
+
+ movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_3:
+ pxor @XMM[8+1], @XMM[1]
+ lea 0x30($inp), $inp
+ pxor @XMM[8+2], @XMM[2]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ lea 0x30($out), $out
+
+ movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_2:
+ pxor @XMM[8+0], @XMM[0]
+ lea 0x20($inp), $inp
+ pxor @XMM[8+1], @XMM[1]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ lea 0x20($out), $out
+
+ movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_1:
+ # One block left: xor it with its tweak, encrypt it in place in the
+ # frame buffer with the one-block routine, and xor the tweak again.
+ # The commented-out lines are the bit-sliced alternative.
+ pxor @XMM[0], @XMM[8]
+ lea 0x10($inp), $inp
+ movdqa @XMM[8], 0x20(%rbp)
+ lea 0x20(%rbp), $arg1
+ lea 0x20(%rbp), $arg2
+ lea ($key), $arg3
+ call asm_AES_encrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
+ #pxor @XMM[8], @XMM[0]
+ #lea 0x80(%rsp), %rax # pass key schedule
+ #mov %edx, %r10d # pass rounds
+ #call _bsaes_encrypt8
+ #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ movdqu @XMM[0], 0x00($out) # write output
+ lea 0x10($out), $out
+
+ movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
+
+.Lxts_enc_done:
+ # The ebx register still holds the original byte length; its low four
+ # bits are the number of trailing bytes beyond the last whole block.
+ and \$15, %ebx
+ jz .Lxts_enc_ret
+ mov $out, %rdx
+
+.Lxts_enc_steal:
+ # Byte by byte: move a byte of the last full output block to the
+ # partial output position and replace it with the next input byte.
+ movzb ($inp), %eax
+ movzb -16(%rdx), %ecx
+ lea 1($inp), $inp
+ mov %al, -16(%rdx)
+ mov %cl, 0(%rdx)
+ lea 1(%rdx), %rdx
+ sub \$1,%ebx
+ jnz .Lxts_enc_steal
+
+ # Re-encrypt the patched block with the next tweak.
+ movdqu -16($out), @XMM[0]
+ lea 0x20(%rbp), $arg1
+ pxor @XMM[7], @XMM[0]
+ lea 0x20(%rbp), $arg2
+ movdqa @XMM[0], 0x20(%rbp)
+ lea ($key), $arg3
+ call asm_AES_encrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[7]
+ movdqu @XMM[7], -16($out)
+
+.Lxts_enc_ret:
+ lea (%rsp), %rax
+ pxor %xmm0, %xmm0
+.Lxts_enc_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ ja .Lxts_enc_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lxts_enc_epilogue:
+ ret
+.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl bsaes_xts_decrypt
+.type bsaes_xts_decrypt,\@abi-omnipotent
+.align 16
+bsaes_xts_decrypt:
+ mov %rsp, %rax
+.Lxts_dec_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+ mov 0xa0(%rsp),$arg5 # pull key2
+ mov 0xa8(%rsp),$arg6 # pull ivp
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lxts_dec_body:
+___
+$code.=<<___;
+ mov %rsp, %rbp # backup %rsp
+ mov $arg1, $inp # backup arguments
+ mov $arg2, $out
+ mov $arg3, $len
+ mov $arg4, $key
+
+ lea ($arg6), $arg1
+ lea 0x20(%rbp), $arg2
+ lea ($arg5), $arg3
+ call asm_AES_encrypt # generate initial tweak
+
+ mov 240($key), %eax # rounds
+ mov $len, %rbx # backup $len
+
+ mov %eax, %edx # rounds
+ shl \$7, %rax # 128 bytes per inner round key
+ sub \$`128-32`, %rax # size of bit-sliced key schedule
+ sub %rax, %rsp
+
+ mov %rsp, %rax # pass key schedule
+ mov $key, %rcx # pass key
+ mov %edx, %r10d # pass rounds
+ call _bsaes_key_convert
+ pxor (%rsp), %xmm7 # fix up round 0 key
+ movdqa %xmm6, (%rax) # save last round key
+ movdqa %xmm7, (%rsp)
+
+ xor %eax, %eax # if ($len%16) len-=16;
+ and \$-16, $len
+ test \$15, %ebx
+ setnz %al
+ shl \$4, %rax
+ sub %rax, $len
+
+ sub \$0x80, %rsp # place for tweak[8]
+ movdqa 0x20(%rbp), @XMM[7] # initial tweak
+
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+
+ sub \$0x80, $len
+ jc .Lxts_dec_short
+ jmp .Lxts_dec_loop
+
+.align 16
+.Lxts_dec_loop:
+___
+ for ($i=0;$i<7;$i++) {
+ $code.=<<___;
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ movdqa @XMM[7], @XMM[$i]
+ movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+___
+ $code.=<<___ if ($i>=1);
+ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
+___
+ $code.=<<___ if ($i>=2);
+ pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+ }
+$code.=<<___;
+ movdqu 0x60($inp), @XMM[8+6]
+ pxor @XMM[8+5], @XMM[5]
+ movdqu 0x70($inp), @XMM[8+7]
+ lea 0x80($inp), $inp
+ movdqa @XMM[7], 0x70(%rsp)
+ pxor @XMM[8+6], @XMM[6]
+ lea 0x80(%rsp), %rax # pass key schedule
+ pxor @XMM[8+7], @XMM[7]
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ pxor 0x40(%rsp), @XMM[2]
+ movdqu @XMM[4], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[2], 0x40($out)
+ pxor 0x60(%rsp), @XMM[3]
+ movdqu @XMM[7], 0x50($out)
+ pxor 0x70(%rsp), @XMM[5]
+ movdqu @XMM[3], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+
+ movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+
+ sub \$0x80,$len
+ jnc .Lxts_dec_loop
+
+.Lxts_dec_short:
+ add \$0x80, $len
+ jz .Lxts_dec_done
+___
+ for ($i=0;$i<7;$i++) {
+ $code.=<<___;
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ movdqa @XMM[7], @XMM[$i]
+ movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+___
+ $code.=<<___ if ($i>=1);
+ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
+ cmp \$`0x10*$i`,$len
+ je .Lxts_dec_$i
+___
+ $code.=<<___ if ($i>=2);
+ pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+ }
+$code.=<<___;
+ movdqu 0x60($inp), @XMM[8+6]
+ pxor @XMM[8+5], @XMM[5]
+ movdqa @XMM[7], 0x70(%rsp)
+ lea 0x70($inp), $inp
+ pxor @XMM[8+6], @XMM[6]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ pxor 0x40(%rsp), @XMM[2]
+ movdqu @XMM[4], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[2], 0x40($out)
+ pxor 0x60(%rsp), @XMM[3]
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ lea 0x70($out), $out
+
+ movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_6:
+ pxor @XMM[8+4], @XMM[4]
+ lea 0x60($inp), $inp
+ pxor @XMM[8+5], @XMM[5]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ pxor 0x40(%rsp), @XMM[2]
+ movdqu @XMM[4], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ lea 0x60($out), $out
+
+ movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_5:
+ pxor @XMM[8+3], @XMM[3]
+ lea 0x50($inp), $inp
+ pxor @XMM[8+4], @XMM[4]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ pxor 0x40(%rsp), @XMM[2]
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ lea 0x50($out), $out
+
+ movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_4:
+ pxor @XMM[8+2], @XMM[2]
+ lea 0x40($inp), $inp
+ pxor @XMM[8+3], @XMM[3]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ lea 0x40($out), $out
+
+ movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_3:
+ pxor @XMM[8+1], @XMM[1]
+ lea 0x30($inp), $inp
+ pxor @XMM[8+2], @XMM[2]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ lea 0x30($out), $out
+
+ movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_2:
+ pxor @XMM[8+0], @XMM[0]
+ lea 0x20($inp), $inp
+ pxor @XMM[8+1], @XMM[1]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ lea 0x20($out), $out
+
+ movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_1:
+ pxor @XMM[0], @XMM[8]
+ lea 0x10($inp), $inp
+ movdqa @XMM[8], 0x20(%rbp)
+ lea 0x20(%rbp), $arg1
+ lea 0x20(%rbp), $arg2
+ lea ($key), $arg3
+ call asm_AES_decrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
+ #pxor @XMM[8], @XMM[0]
+ #lea 0x80(%rsp), %rax # pass key schedule
+ #mov %edx, %r10d # pass rounds
+ #call _bsaes_decrypt8
+ #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ movdqu @XMM[0], 0x00($out) # write output
+ lea 0x10($out), $out
+
+ movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
+
+.Lxts_dec_done:
+ and \$15, %ebx
+ jz .Lxts_dec_ret
+
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp
+ pshufd \$0x13, $twtmp, $twres
+ movdqa @XMM[7], @XMM[6]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ movdqu ($inp), @XMM[0]
+ pxor $twres, @XMM[7]
+
+ lea 0x20(%rbp), $arg1
+ pxor @XMM[7], @XMM[0]
+ lea 0x20(%rbp), $arg2
+ movdqa @XMM[0], 0x20(%rbp)
+ lea ($key), $arg3
+ call asm_AES_decrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[7]
+ mov $out, %rdx
+ movdqu @XMM[7], ($out)
+
+.Lxts_dec_steal:
+ movzb 16($inp), %eax
+ movzb (%rdx), %ecx
+ lea 1($inp), $inp
+ mov %al, (%rdx)
+ mov %cl, 16(%rdx)
+ lea 1(%rdx), %rdx
+ sub \$1,%ebx
+ jnz .Lxts_dec_steal
+
+ movdqu ($out), @XMM[0]
+ lea 0x20(%rbp), $arg1
+ pxor @XMM[6], @XMM[0]
+ lea 0x20(%rbp), $arg2
+ movdqa @XMM[0], 0x20(%rbp)
+ lea ($key), $arg3
+ call asm_AES_decrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[6]
+ movdqu @XMM[6], ($out)
+
+.Lxts_dec_ret:
+ lea (%rsp), %rax
+ pxor %xmm0, %xmm0
+.Lxts_dec_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ ja .Lxts_dec_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lxts_dec_epilogue:
+ ret
+.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
+___
+}
+$code.=<<___;
+.type _bsaes_const,\@object # constant pool: 16-byte vectors plus an ID string
+.align 64 # pool starts on a 64-byte boundary
+_bsaes_const:
+.LM0ISR: # InvShiftRows constants
+ .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISRM0:
+ .quad 0x01040b0e0205080f, 0x0306090c00070a0d
+.LISR:
+ .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
+.LBS0: # bit-slice constants
+ .quad 0x5555555555555555, 0x5555555555555555
+.LBS1:
+ .quad 0x3333333333333333, 0x3333333333333333
+.LBS2:
+ .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
+.LSR: # shiftrows constants
+ .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+ .quad 0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0:
+ .quad 0x02060a0e03070b0f, 0x0004080c0105090d
+.LM0SR:
+ .quad 0x0a0e02060f03070b, 0x0004080c05090d01
+.LNOT: # magic constants
+ .quad 0xffffffffffffffff, 0xffffffffffffffff
+.L63:
+ .quad 0x6363636363636363, 0x6363636363636363
+.LSWPUP: # byte-swap upper dword
+ .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
+.LSWPUPM0SR:
+ .quad 0x0a0d02060c03070b, 0x0004080f05090e01
+.LADD1: # counter increment constants (N sits in the top 32-bit lane)
+ .quad 0x0000000000000000, 0x0000000100000000
+.LADD2:
+ .quad 0x0000000000000000, 0x0000000200000000
+.LADD3:
+ .quad 0x0000000000000000, 0x0000000300000000
+.LADD4:
+ .quad 0x0000000000000000, 0x0000000400000000
+.LADD5:
+ .quad 0x0000000000000000, 0x0000000500000000
+.LADD6:
+ .quad 0x0000000000000000, 0x0000000600000000
+.LADD7:
+ .quad 0x0000000000000000, 0x0000000700000000
+.LADD8:
+ .quad 0x0000000000000000, 0x0000000800000000
+.Lxts_magic: # loaded as the tweak mask by the XTS tail code
+ .long 0x87,0,1,0
+.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
+.align 64
+.size _bsaes_const,.-_bsaes_const
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp) -- emitted for Win64 builds only
+if ($win64) { # SEH handler plus the .pdata/.xdata tables that reference it
+$rec="%rcx"; # the four handler arguments, in argument-register order
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp # scratch; 32..56(%rsp) carry args 5-8 of the call below
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lin_prologue # rax still holds context->Rax here
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lin_prologue # past the epilogue label: Rsp is kept as is
+
+ mov 160($context),%rax # pull context->Rbp
+
+ lea 0x40(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0xa0(%rax),%rax # adjust stack pointer
+
+ mov 0x70(%rax),%rbp # same 0x48..0x70 slots the routine epilogue reloads
+ mov 0x68(%rax),%rbx
+ mov 0x60(%rax),%r12
+ mov 0x58(%rax),%r13
+ mov 0x50(%rax),%r14
+ mov 0x48(%rax),%r15
+ lea 0x78(%rax),%rax # adjust stack pointer
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lin_prologue:
+ mov %rax,152($context) # restore context->Rsp
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$`1232/8`,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata # one (start, end, info) RVA triple per routine
+.align 4
+___
+$code.=<<___ if ($ecb); # ECB entries only when the ECB routines are generated
+ .rva .Lecb_enc_prologue
+ .rva .Lecb_enc_epilogue
+ .rva .Lecb_enc_info
+
+ .rva .Lecb_dec_prologue
+ .rva .Lecb_dec_epilogue
+ .rva .Lecb_dec_info
+___
+$code.=<<___;
+ .rva .Lcbc_dec_prologue
+ .rva .Lcbc_dec_epilogue
+ .rva .Lcbc_dec_info
+
+ .rva .Lctr_enc_prologue
+ .rva .Lctr_enc_epilogue
+ .rva .Lctr_enc_info
+
+ .rva .Lxts_enc_prologue
+ .rva .Lxts_enc_epilogue
+ .rva .Lxts_enc_info
+
+ .rva .Lxts_dec_prologue
+ .rva .Lxts_dec_epilogue
+ .rva .Lxts_dec_info
+
+.section .xdata # per-routine info records referenced from .pdata
+.align 8
+___
+$code.=<<___ if ($ecb);
+.Lecb_enc_info:
+ .byte 9,0,0,0 # presumably UNWIND_INFO v1 + UNW_FLAG_EHANDLER, no codes -- confirm
+ .rva se_handler
+ .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
+.Lecb_dec_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
+___
+$code.=<<___;
+.Lcbc_dec_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
+.Lctr_enc_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
+.Lxts_enc_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
+.Lxts_dec_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
+___
+}
+
+# Evaluate every backtick-quoted arithmetic placeholder left in the
+# generated text, then emit the result.
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+# STDOUT is expected to be a pipe into the perlasm translator; close()
+# on a pipe reports a failed child, so do not let that pass silently.
+close STDOUT or die "error closing STDOUT: $!";
--- /dev/null
+#!/usr/bin/env perl
+
+######################################################################
+## Constant-time SSSE3 AES core implementation.
+## version 0.1
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
+
+######################################################################
+# September 2011.
+#
+# Interface to OpenSSL as "almost" drop-in replacement for
+# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
+# doesn't handle partial vectors (doesn't have to if called from
+# EVP only). "Drop-in" implies that this module doesn't share key
+# schedule structure with the original nor does it make assumption
+# about its alignment...
+#
+# Performance summary. aes-x86_64.pl column lists large-block CBC
+# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
+# byte processed with 128-bit key, and vpaes-x86_64.pl column -
+# [also large-block CBC] encrypt/decrypt.
+#
+# aes-x86_64.pl vpaes-x86_64.pl
+#
+# Core 2(**) 30.5/43.7/14.3 21.8/25.7(***)
+# Nehalem 30.5/42.2/14.6 9.8/11.8
+# Atom 63.9/79.0/32.1 64.0/84.8(***)
+#
+# (*) "Hyper-threading" in the context refers rather to cache shared
+# among multiple cores, than to specifically Intel HTT. As vast
+# majority of contemporary cores share cache, slower code path
+# is commonplace. In other words "with-hyper-threading-off"
+# results are presented mostly for reference purposes.
+#
+# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
+#
+# (***) Less impressive improvement on Core 2 and Atom is due to slow
+# pshufb, yet it's respectable +40%/78% improvement on Core 2
+# (as implied, over "hyper-threading-safe" code path).
+#
+# <appro@openssl.org>
+
+# Arguments: [flavour] output.  A lone argument containing a dot is taken
+# as the output file name and the flavour is left undefined.
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+# Win64 conventions apply for nasm/masm/mingw64 flavours or .asm output.
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+# Look for the translator next to this script, then in ../../perlasm.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+# Everything printed from here on is piped through the translator; fail
+# loudly if the pipe cannot be set up instead of printing into the void.
+open STDOUT,"| $^X $xlate $flavour $output"
+    or die "can't call $xlate: $!";
+
+$PREFIX="vpaes";
+
+$code.=<<___;
+.text
+
+##
+## _vpaes_encrypt_core (a.k.a. _aes_encrypt_core)
+##
+## AES-encrypt %xmm0.
+##
+## Inputs:
+## %xmm0 = input
+## %xmm9-%xmm15 as in _vpaes_preheat
+## (%rdx) = scheduled keys
+##
+## Output in %xmm0
+## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
+## Preserves %xmm6 - %xmm8 so you get some local vectors
+##
+## Round count is read from 240(%rdx); round keys are read with movdqu,
+## so the key schedule itself need not be 16-byte aligned.
+.type _vpaes_encrypt_core,\@abi-omnipotent
+.align 16
+_vpaes_encrypt_core:
+ mov %rdx, %r9 # r9 = running round-key pointer
+ mov \$16, %r11 # r11 = offset into the mc/sr tables, kept within 0x00..0x30
+ mov 240(%rdx),%eax # eax = round count
+ movdqa %xmm9, %xmm1
+ movdqa .Lk_ipt(%rip), %xmm2 # iptlo
+ pandn %xmm0, %xmm1
+ movdqu (%r9), %xmm5 # round0 key
+ psrld \$4, %xmm1
+ pand %xmm9, %xmm0
+ pshufb %xmm0, %xmm2
+ movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi
+ pshufb %xmm1, %xmm0
+ pxor %xmm5, %xmm2
+ pxor %xmm2, %xmm0
+ add \$16, %r9 # also the last flag-setting op before the first jnz below
+ lea .Lk_mc_backward(%rip),%r10
+ jmp .Lenc_entry
+
+.align 16
+.Lenc_loop:
+ # middle of middle round
+ movdqa %xmm13, %xmm4 # 4 : sb1u
+ pshufb %xmm2, %xmm4 # 4 = sb1u
+ pxor %xmm5, %xmm4 # 4 = sb1u + k
+ movdqa %xmm12, %xmm0 # 0 : sb1t
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = A
+ movdqa %xmm15, %xmm5 # 5 : sb2u
+ pshufb %xmm2, %xmm5 # 5 = sb2u
+ movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
+ movdqa %xmm14, %xmm2 # 2 : sb2t
+ pshufb %xmm3, %xmm2 # 2 = sb2t
+ pxor %xmm5, %xmm2 # 2 = 2A
+ movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
+ movdqa %xmm0, %xmm3 # 3 = A
+ pshufb %xmm1, %xmm0 # 0 = B
+ add \$16, %r9 # next key
+ pxor %xmm2, %xmm0 # 0 = 2A+B
+ pshufb %xmm4, %xmm3 # 3 = D
+ add \$16, %r11 # next mc
+ pxor %xmm0, %xmm3 # 3 = 2A+B+D
+ pshufb %xmm1, %xmm0 # 0 = 2B+C
+ and \$0x30, %r11 # ... mod 4
+ pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D
+ sub \$1,%rax # nr--, flags consumed by the jnz at the end of the round top
+
+.Lenc_entry:
+ # top of round
+ movdqa %xmm9, %xmm1 # 1 : i
+ pandn %xmm0, %xmm1 # 1 = i<<4
+ psrld \$4, %xmm1 # 1 = i
+ pand %xmm9, %xmm0 # 0 = k
+ movdqa %xmm11, %xmm5 # 5 : a/k
+ pshufb %xmm0, %xmm5 # 5 = a/k
+ pxor %xmm1, %xmm0 # 0 = j
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ pxor %xmm0, %xmm2 # 2 = io
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ movdqu (%r9), %xmm5 # 5 = round key for this round
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ pxor %xmm1, %xmm3 # 3 = jo
+ jnz .Lenc_loop # SSE ops above leave the integer flags untouched
+
+ # middle of last round
+ movdqa -0x60(%r10), %xmm4 # 4 : sbou .Lk_sbo
+ movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ pxor %xmm5, %xmm4 # 4 = sb1u + k
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
+ pxor %xmm4, %xmm0 # 0 = A
+ pshufb %xmm1, %xmm0
+ ret
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+##
+## Decryption core
+##
+## Same API as encryption core.
+## i.e. block in/out via %xmm0, key schedule at (%rdx), rounds at 240(%rdx).
+.type _vpaes_decrypt_core,\@abi-omnipotent
+.align 16
+_vpaes_decrypt_core:
+ mov %rdx, %r9 # load key
+ mov 240(%rdx),%eax # eax = round count
+ movdqa %xmm9, %xmm1
+ movdqa .Lk_dipt(%rip), %xmm2 # iptlo
+ pandn %xmm0, %xmm1
+ mov %rax, %r11
+ psrld \$4, %xmm1
+ movdqu (%r9), %xmm5 # round0 key
+ shl \$4, %r11 # r11 = ((rounds<<4) ^ 0x30) & 0x30, finished below
+ pand %xmm9, %xmm0
+ pshufb %xmm0, %xmm2
+ movdqa .Lk_dipt+16(%rip), %xmm0 # ipthi
+ xor \$0x30, %r11
+ lea .Lk_dsbd(%rip),%r10
+ pshufb %xmm1, %xmm0
+ and \$0x30, %r11
+ pxor %xmm5, %xmm2
+ movdqa .Lk_mc_forward+48(%rip), %xmm5
+ pxor %xmm2, %xmm0
+ add \$16, %r9
+ add %r10, %r11 # r11 = .Lk_dsbd + offset; last flag-setting op before the first jnz
+ jmp .Ldec_entry
+
+.align 16
+.Ldec_loop:
+##
+## Inverse mix columns
+##
+ movdqa -0x20(%r10),%xmm4 # 4 : sb9u
+ pshufb %xmm2, %xmm4 # 4 = sb9u
+ pxor %xmm0, %xmm4
+ movdqa -0x10(%r10),%xmm0 # 0 : sb9t
+ pshufb %xmm3, %xmm0 # 0 = sb9t
+ pxor %xmm4, %xmm0 # 0 = ch
+ add \$16, %r9 # next round key
+
+ pshufb %xmm5, %xmm0 # MC ch
+ movdqa 0x00(%r10),%xmm4 # 4 : sbdu
+ pshufb %xmm2, %xmm4 # 4 = sbdu
+ pxor %xmm0, %xmm4 # 4 = ch
+ movdqa 0x10(%r10),%xmm0 # 0 : sbdt
+ pshufb %xmm3, %xmm0 # 0 = sbdt
+ pxor %xmm4, %xmm0 # 0 = ch
+ sub \$1,%rax # nr--, flags consumed by the jnz at the end of the round top
+
+ pshufb %xmm5, %xmm0 # MC ch
+ movdqa 0x20(%r10),%xmm4 # 4 : sbbu
+ pshufb %xmm2, %xmm4 # 4 = sbbu
+ pxor %xmm0, %xmm4 # 4 = ch
+ movdqa 0x30(%r10),%xmm0 # 0 : sbbt
+ pshufb %xmm3, %xmm0 # 0 = sbbt
+ pxor %xmm4, %xmm0 # 0 = ch
+
+ pshufb %xmm5, %xmm0 # MC ch
+ movdqa 0x40(%r10),%xmm4 # 4 : sbeu
+ pshufb %xmm2, %xmm4 # 4 = sbeu
+ pxor %xmm0, %xmm4 # 4 = ch
+ movdqa 0x50(%r10),%xmm0 # 0 : sbet
+ pshufb %xmm3, %xmm0 # 0 = sbet
+ pxor %xmm4, %xmm0 # 0 = ch
+
+ palignr \$12, %xmm5, %xmm5 # rotate the MC shuffle mask bytes for the next round
+
+.Ldec_entry:
+ # top of round
+ movdqa %xmm9, %xmm1 # 1 : i
+ pandn %xmm0, %xmm1 # 1 = i<<4
+ psrld \$4, %xmm1 # 1 = i
+ pand %xmm9, %xmm0 # 0 = k
+ movdqa %xmm11, %xmm2 # 2 : a/k
+ pshufb %xmm0, %xmm2 # 2 = a/k
+ pxor %xmm1, %xmm0 # 0 = j
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ pxor %xmm0, %xmm2 # 2 = io
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ pxor %xmm1, %xmm3 # 3 = jo
+ movdqu (%r9), %xmm0 # 0 = round key for this round
+ jnz .Ldec_loop
+
+ # middle of last round
+ movdqa 0x60(%r10), %xmm4 # 4 : sbou
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ pxor %xmm0, %xmm4 # 4 = sb1u + k
+ movdqa 0x70(%r10), %xmm0 # 0 : sbot
+ movdqa .Lk_sr-.Lk_dsbd(%r11), %xmm2 # r11 is .Lk_dsbd-relative, so this reads .Lk_sr[offset]
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = A
+ pshufb %xmm2, %xmm0
+ ret
+.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+########################################################
+## ##
+## AES key schedule ##
+## ##
+########################################################
+.type _vpaes_schedule_core,\@abi-omnipotent
+.align 16
+_vpaes_schedule_core:
+ # rdi = key
+ # rsi = size in bits
+ # rdx = buffer
+ # rcx = direction. 0=encrypt, 1=decrypt; r8 = starting offset into .Lk_sr (set by the caller)
+
+ call _vpaes_preheat # load the tables
+ movdqa .Lk_rcon(%rip), %xmm8 # load rcon
+ movdqu (%rdi), %xmm0 # load key (unaligned)
+
+ # input transform
+ movdqa %xmm0, %xmm3 # 3 = untransformed copy, used on the decrypt path below
+ lea .Lk_ipt(%rip), %r11
+ call _vpaes_schedule_transform
+ movdqa %xmm0, %xmm7
+
+ lea .Lk_sr(%rip),%r10 # r10 = .Lk_sr base, indexed by r8
+ test %rcx, %rcx
+ jnz .Lschedule_am_decrypting
+
+ # encrypting, output zeroth round key after transform
+ movdqu %xmm0, (%rdx)
+ jmp .Lschedule_go
+
+.Lschedule_am_decrypting:
+ # decrypting, output zeroth round key after shiftrows
+ movdqa (%r8,%r10),%xmm1
+ pshufb %xmm1, %xmm3
+ movdqu %xmm3, (%rdx)
+ xor \$0x30, %r8
+
+.Lschedule_go:
+ cmp \$192, %esi
+ ja .Lschedule_256
+ je .Lschedule_192
+ # 128: fall through
+
+##
+## .schedule_128
+##
+## 128-bit specific part of key schedule.
+##
+## This schedule is really simple, because all its parts
+## are accomplished by the subroutines.
+##
+.Lschedule_128:
+ mov \$10, %esi # loop count: 10 schedule rounds
+
+.Loop_schedule_128:
+ call _vpaes_schedule_round
+ dec %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle # write output
+ jmp .Loop_schedule_128
+
+##
+## .aes_schedule_192
+##
+## 192-bit specific part of key schedule.
+##
+## The main body of this schedule is the same as the 128-bit
+## schedule, but with more smearing. The long, high side is
+## stored in %xmm7 as before, and the short, low side is in
+## the high bits of %xmm6.
+##
+## This schedule is somewhat nastier, however, because each
+## round produces 192 bits of key material, or 1.5 round keys.
+## Therefore, on each cycle we do 2 rounds and produce 3 round
+## keys.
+##
+.align 16
+.Lschedule_192:
+ movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
+ call _vpaes_schedule_transform # input transform
+ movdqa %xmm0, %xmm6 # save short part
+ pxor %xmm4, %xmm4 # clear 4
+ movhlps %xmm4, %xmm6 # clobber low side with zeros
+ mov \$4, %esi # 4 passes of the loop below
+
+.Loop_schedule_192:
+ call _vpaes_schedule_round
+ palignr \$8,%xmm6,%xmm0
+ call _vpaes_schedule_mangle # save key n
+ call _vpaes_schedule_192_smear
+ call _vpaes_schedule_mangle # save key n+1
+ call _vpaes_schedule_round
+ dec %rsi
+ jz .Lschedule_mangle_last # final pass: last key goes through the last-round mangler
+ call _vpaes_schedule_mangle # save key n+2
+ call _vpaes_schedule_192_smear
+ jmp .Loop_schedule_192
+
+##
+## .aes_schedule_256
+##
+## 256-bit specific part of key schedule.
+##
+## The structure here is very similar to the 128-bit
+## schedule, but with an additional "low side" in
+## %xmm6. The low side's rounds are the same as the
+## high side's, except no rcon and no rotation.
+##
+.align 16
+.Lschedule_256:
+ movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
+ call _vpaes_schedule_transform # input transform
+ mov \$7, %esi # loop count: 7 high rounds
+
+.Loop_schedule_256:
+ call _vpaes_schedule_mangle # output low result
+ movdqa %xmm0, %xmm6 # save cur_lo in xmm6
+
+ # high round
+ call _vpaes_schedule_round
+ dec %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle
+
+ # low round. swap xmm7 and xmm6
+ pshufd \$0xFF, %xmm0, %xmm0
+ movdqa %xmm7, %xmm5
+ movdqa %xmm6, %xmm7
+ call _vpaes_schedule_low_round
+ movdqa %xmm5, %xmm7
+
+ jmp .Loop_schedule_256
+
+
+##
+## .aes_schedule_mangle_last
+##
+## Mangler for last round of key schedule
+## Mangles %xmm0
+## when encrypting, outputs out(%xmm0) ^ 63
+## when decrypting, outputs unskew(%xmm0)
+##
+## Always called right before return... jumps to cleanup and exits
+##
+.align 16
+.Lschedule_mangle_last:
+ # schedule last round key from xmm0
+ lea .Lk_deskew(%rip),%r11 # prepare to deskew
+ test %rcx, %rcx
+ jnz .Lschedule_mangle_last_dec
+
+ # encrypting
+ movdqa (%r8,%r10),%xmm1
+ pshufb %xmm1, %xmm0 # output permute
+ lea .Lk_opt(%rip), %r11 # prepare to output transform
+ add \$32, %rdx # net +16 once the shared -16 below is applied
+
+.Lschedule_mangle_last_dec:
+ add \$-16, %rdx
+ pxor .Lk_s63(%rip), %xmm0
+ call _vpaes_schedule_transform # output transform
+ movdqu %xmm0, (%rdx) # save last key
+
+ # cleanup: zero %xmm0-%xmm7 before returning
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ pxor %xmm4, %xmm4
+ pxor %xmm5, %xmm5
+ pxor %xmm6, %xmm6
+ pxor %xmm7, %xmm7
+ ret
+.size _vpaes_schedule_core,.-_vpaes_schedule_core
+
+##
+## .aes_schedule_192_smear
+##
+## Smear the short, low side in the 192-bit key schedule.
+##
+## Inputs:
+## %xmm7: high side, b a x y
+## %xmm6: low side, d c 0 0
+## %xmm13: 0 (not referenced below; the code zeroes %xmm1 itself)
+##
+## Outputs:
+## %xmm6: b+c+d b+c 0 0
+## %xmm0: b+c+d b+c b a
+## Also overwrites %xmm1 (left zeroed).
+.type _vpaes_schedule_192_smear,\@abi-omnipotent
+.align 16
+_vpaes_schedule_192_smear:
+ pshufd \$0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0
+ pxor %xmm0, %xmm6 # -> c+d c 0 0
+ pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
+ pxor %xmm0, %xmm6 # -> b+c+d b+c b a
+ movdqa %xmm6, %xmm0 # 0 = full smeared value (second output)
+ pxor %xmm1, %xmm1 # 1 = zero source for the movhlps below
+ movhlps %xmm1, %xmm6 # clobber low side with zeros
+ ret
+.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+##
+## .aes_schedule_round
+##
+## Runs one main round of the key schedule on %xmm0, %xmm7
+##
+## Specifically, runs subbytes on the high dword of %xmm0
+## then rotates it by one byte and xors into the low dword of
+## %xmm7.
+##
+## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+## next rcon.
+##
+## Smears the dwords of %xmm7 by xoring the low into the
+## second low, result into third, result into highest.
+##
+## Returns results in %xmm7 = %xmm0.
+## Clobbers %xmm1-%xmm4, %r11.
+## Reads %xmm9-%xmm13 and .Lk_s63; _vpaes_schedule_low_round is a second entry point.
+.type _vpaes_schedule_round,\@abi-omnipotent
+.align 16
+_vpaes_schedule_round:
+ # extract rcon from xmm8
+ pxor %xmm1, %xmm1
+ palignr \$15, %xmm8, %xmm1
+ palignr \$15, %xmm8, %xmm8
+ pxor %xmm1, %xmm7
+
+ # rotate
+ pshufd \$0xFF, %xmm0, %xmm0
+ palignr \$1, %xmm0, %xmm0
+
+ # fall through...
+
+ # low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+ # smear xmm7
+ movdqa %xmm7, %xmm1
+ pslldq \$4, %xmm7
+ pxor %xmm1, %xmm7
+ movdqa %xmm7, %xmm1
+ pslldq \$8, %xmm7
+ pxor %xmm1, %xmm7
+ pxor .Lk_s63(%rip), %xmm7
+
+ # subbytes
+ movdqa %xmm9, %xmm1
+ pandn %xmm0, %xmm1
+ psrld \$4, %xmm1 # 1 = i
+ pand %xmm9, %xmm0 # 0 = k
+ movdqa %xmm11, %xmm2 # 2 : a/k
+ pshufb %xmm0, %xmm2 # 2 = a/k
+ pxor %xmm1, %xmm0 # 0 = j
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ pxor %xmm0, %xmm2 # 2 = io
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ pxor %xmm1, %xmm3 # 3 = jo
+ movdqa %xmm13, %xmm4 # 4 : sbou
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ movdqa %xmm12, %xmm0 # 0 : sbot
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = sbox output
+
+ # add in smeared stuff
+ pxor %xmm7, %xmm0
+ movdqa %xmm0, %xmm7 # result is returned in both xmm0 and xmm7
+ ret
+.size _vpaes_schedule_round,.-_vpaes_schedule_round
+
+##
+## .aes_schedule_transform
+##
+## Linear-transform %xmm0 according to tables at (%r11)
+## (two 16-byte tables: low-nibble lookup at 0, high-nibble lookup at 16)
+## Requires that %xmm9 = 0x0F0F... as in preheat
+## Output in %xmm0
+## Clobbers %xmm1, %xmm2
+##
+.type _vpaes_schedule_transform,\@abi-omnipotent
+.align 16
+_vpaes_schedule_transform:
+ movdqa %xmm9, %xmm1
+ pandn %xmm0, %xmm1 # 1 = high-nibble bits of the input
+ psrld \$4, %xmm1 # 1 = high nibbles moved down to the low position
+ pand %xmm9, %xmm0 # 0 = low nibbles
+ movdqa (%r11), %xmm2 # lo
+ pshufb %xmm0, %xmm2 # 2 = lo-table lookup by low nibble
+ movdqa 16(%r11), %xmm0 # hi
+ pshufb %xmm1, %xmm0 # 0 = hi-table lookup by high nibble
+ pxor %xmm2, %xmm0 # combine the two lookups
+ ret
+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+##
+## .aes_schedule_mangle
+##
+## Mangle xmm0 from (basis-transformed) standard version
+## to our version.
+##
+## On encrypt,
+## xor with 0x63
+## multiply by circulant 0,1,1,1
+## apply shiftrows transform
+##
+## On decrypt,
+## xor with 0x63
+## multiply by "inverse mixcolumns" circulant E,B,D,9
+## deskew
+## apply shiftrows transform
+##
+## Direction comes from %rcx (0 = encrypt); %r10 is the base for the (%r8,%r10) lookup.
+## Writes out to (%rdx), and increments or decrements it
+## Keeps track of round number mod 4 in %r8
+## Preserves xmm0
+## Clobbers xmm1-xmm5
+## (the decrypt path also overwrites %r11)
+.type _vpaes_schedule_mangle,\@abi-omnipotent
+.align 16
+_vpaes_schedule_mangle:
+ movdqa %xmm0, %xmm4 # save xmm0 for later
+ movdqa .Lk_mc_forward(%rip),%xmm5
+ test %rcx, %rcx
+ jnz .Lschedule_mangle_dec
+
+ # encrypting
+ add \$16, %rdx # output pointer moves forward on encrypt
+ pxor .Lk_s63(%rip),%xmm4
+ pshufb %xmm5, %xmm4
+ movdqa %xmm4, %xmm3
+ pshufb %xmm5, %xmm4
+ pxor %xmm4, %xmm3
+ pshufb %xmm5, %xmm4
+ pxor %xmm4, %xmm3
+
+ jmp .Lschedule_mangle_both
+.align 16
+.Lschedule_mangle_dec:
+ # inverse mix columns
+ lea .Lk_dksd(%rip),%r11 # eight 16-byte tables are read from here, 0x00..0x70
+ movdqa %xmm9, %xmm1
+ pandn %xmm4, %xmm1
+ psrld \$4, %xmm1 # 1 = hi
+ pand %xmm9, %xmm4 # 4 = lo
+
+ movdqa 0x00(%r11), %xmm2
+ pshufb %xmm4, %xmm2
+ movdqa 0x10(%r11), %xmm3
+ pshufb %xmm1, %xmm3
+ pxor %xmm2, %xmm3
+ pshufb %xmm5, %xmm3
+
+ movdqa 0x20(%r11), %xmm2
+ pshufb %xmm4, %xmm2
+ pxor %xmm3, %xmm2
+ movdqa 0x30(%r11), %xmm3
+ pshufb %xmm1, %xmm3
+ pxor %xmm2, %xmm3
+ pshufb %xmm5, %xmm3
+
+ movdqa 0x40(%r11), %xmm2
+ pshufb %xmm4, %xmm2
+ pxor %xmm3, %xmm2
+ movdqa 0x50(%r11), %xmm3
+ pshufb %xmm1, %xmm3
+ pxor %xmm2, %xmm3
+ pshufb %xmm5, %xmm3
+
+ movdqa 0x60(%r11), %xmm2
+ pshufb %xmm4, %xmm2
+ pxor %xmm3, %xmm2
+ movdqa 0x70(%r11), %xmm3
+ pshufb %xmm1, %xmm3
+ pxor %xmm2, %xmm3
+
+ add \$-16, %rdx # output pointer moves backward on decrypt
+
+.Lschedule_mangle_both:
+ movdqa (%r8,%r10),%xmm1
+ pshufb %xmm1,%xmm3
+ add \$-16, %r8 # step the table offset back by one entry ...
+ and \$0x30, %r8 # ... wrapping within 0x00..0x30
+ movdqu %xmm3, (%rdx)
+ ret
+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+#
+# Interface to OpenSSL
+# set_encrypt_key: args are key bytes (rdi), size in bits (esi), AES_KEY buffer (rdx); returns 0
+.globl ${PREFIX}_set_encrypt_key
+.type ${PREFIX}_set_encrypt_key,\@function,3
+.align 16
+${PREFIX}_set_encrypt_key:
+___
+$code.=<<___ if ($win64); # Win64 only: preserve xmm6-xmm15 around the schedule code
+ lea -0xb8(%rsp),%rsp # frame holding the ten saves at 0x10..0xa0
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Lenc_key_body:
+___
+$code.=<<___;
+ mov %esi,%eax
+ shr \$5,%eax
+ add \$5,%eax
+ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+
+ mov \$0,%ecx # direction = encrypt
+ mov \$0x30,%r8d # starting table offset expected in r8 by the schedule code
+ call _vpaes_schedule_core
+___
+$code.=<<___ if ($win64); # Win64 only: restore xmm6-xmm15 and release the frame
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Lenc_key_epilogue:
+___
+$code.=<<___;
+ xor %eax,%eax # always returns 0
+ ret
+.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
+
+.globl ${PREFIX}_set_decrypt_key
+.type ${PREFIX}_set_decrypt_key,\@function,3
+.align 16
+${PREFIX}_set_decrypt_key:
+___
+$code.=<<___ if ($win64); # Win64 only: preserve xmm6-xmm15 around the schedule code
+ lea -0xb8(%rsp),%rsp # frame holding the ten saves at 0x10..0xa0
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Ldec_key_body:
+___
+$code.=<<___;
+ mov %esi,%eax
+ shr \$5,%eax
+ add \$5,%eax
+ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+ shl \$4,%eax
+ lea 16(%rdx,%rax),%rdx # rdx = buffer + 16*rounds + 16; the decrypt mangler steps it downward
+
+ mov \$1,%ecx # direction = decrypt
+ mov %esi,%r8d
+ shr \$1,%r8d
+ and \$32,%r8d
+ xor \$32,%r8d # nbits==192?0:32
+ call _vpaes_schedule_core
+___
+$code.=<<___ if ($win64); # Win64 only: restore xmm6-xmm15 and release the frame
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Ldec_key_epilogue:
+___
+$code.=<<___;
+ xor %eax,%eax # always returns 0
+ ret
+.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
+
+.globl ${PREFIX}_encrypt
+.type ${PREFIX}_encrypt,\@function,3
+.align 16
+${PREFIX}_encrypt:
+___
+$code.=<<___ if ($win64); # Win64 only: preserve xmm6-xmm15 around the core
+ lea -0xb8(%rsp),%rsp # frame holding the ten saves at 0x10..0xa0
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Lenc_body:
+___
+$code.=<<___;
+ movdqu (%rdi),%xmm0 # load the 16-byte input block (unaligned load)
+ call _vpaes_preheat # set up the constant registers the core expects
+ call _vpaes_encrypt_core # key schedule is taken from (%rdx), the third argument
+ movdqu %xmm0,(%rsi) # store the result block
+___
+$code.=<<___ if ($win64); # Win64 only: restore xmm6-xmm15 and release the frame
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Lenc_epilogue:
+___
+$code.=<<___;
+ ret
+.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
+
+.globl ${PREFIX}_decrypt
+.type ${PREFIX}_decrypt,\@function,3
+.align 16
+${PREFIX}_decrypt:
+___
+$code.=<<___ if ($win64); # Win64 only: preserve xmm6-xmm15 around the core
+ lea -0xb8(%rsp),%rsp # frame holding the ten saves at 0x10..0xa0
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Ldec_body:
+___
+$code.=<<___;
+ movdqu (%rdi),%xmm0 # load the 16-byte input block (unaligned load)
+ call _vpaes_preheat # set up the constant registers the core expects
+ call _vpaes_decrypt_core # key schedule is taken from (%rdx), the third argument
+ movdqu %xmm0,(%rsi) # store the result block
+___
+$code.=<<___ if ($win64); # Win64 only: restore xmm6-xmm15 and release the frame
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Ldec_epilogue:
+___
+$code.=<<___;
+ ret
+.size ${PREFIX}_decrypt,.-${PREFIX}_decrypt
+___
+{
+my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
+# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
+# size_t length, const AES_KEY *key,
+# unsigned char *ivp,const int enc);
+#
+# CBC-encrypts (enc != 0) or CBC-decrypts (enc == 0) whole 16-byte blocks
+# from inp to out and writes the chaining value back to ivp.  A trailing
+# partial block is ignored.  A length below 16 returns at once without
+# touching out or ivp; previously one block was still processed and the
+# wrapped-around counter kept the loop running past the buffers.
+#
+# The cores read the key schedule from (%rdx), so key and length swap
+# registers on entry; the Perl-side names are swapped to match.
+$code.=<<___;
+.globl ${PREFIX}_cbc_encrypt
+.type ${PREFIX}_cbc_encrypt,\@function,6
+.align 16
+${PREFIX}_cbc_encrypt:
+ xchg $key,$len
+___
+($len,$key)=($key,$len);
+$code.=<<___;
+ sub \$16,$len # pre-decrement: the loops count down by one block
+ jc .Lcbc_abort # fewer than 16 bytes: nothing to do
+___
+$code.=<<___ if ($win64);
+ lea -0xb8(%rsp),%rsp
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Lcbc_body:
+___
+$code.=<<___;
+ movdqu ($ivp),%xmm6 # load IV
+ sub $inp,$out # out now holds out-inp, so one index walks both buffers
+ call _vpaes_preheat
+ cmp \$0,${enc}d
+ je .Lcbc_dec_loop
+ jmp .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+ movdqu ($inp),%xmm0
+ pxor %xmm6,%xmm0 # chain: plaintext ^ previous ciphertext (or IV)
+ call _vpaes_encrypt_core
+ movdqa %xmm0,%xmm6 # ciphertext becomes the next chaining value
+ movdqu %xmm0,($out,$inp)
+ lea 16($inp),$inp
+ sub \$16,$len
+ jnc .Lcbc_enc_loop # stop once the remaining length borrows
+ jmp .Lcbc_done
+.align 16
+.Lcbc_dec_loop:
+ movdqu ($inp),%xmm0
+ movdqa %xmm0,%xmm7 # keep the ciphertext; it is the next chaining value
+ call _vpaes_decrypt_core
+ pxor %xmm6,%xmm0 # unchain with the previous ciphertext (or IV)
+ movdqa %xmm7,%xmm6
+ movdqu %xmm0,($out,$inp)
+ lea 16($inp),$inp
+ sub \$16,$len
+ jnc .Lcbc_dec_loop # stop once the remaining length borrows
+.Lcbc_done:
+ movdqu %xmm6,($ivp) # save IV
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Lcbc_epilogue:
+___
+$code.=<<___;
+.Lcbc_abort: # reached with the stack exactly as on entry
+ ret
+.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
+___
+}
+$code.=<<___;
+##
+## _vpaes_preheat
+##
+## Points %r10 at .Lk_s0F inside _vpaes_consts (RIP-relative, so the
+## code stays position-independent) and loads %xmm9-%xmm15 as specified below.
+##
+.type _vpaes_preheat,\@abi-omnipotent
+.align 16
+_vpaes_preheat:
+ lea .Lk_s0F(%rip), %r10
+ movdqa -0x20(%r10), %xmm10 # .Lk_inv
+ movdqa -0x10(%r10), %xmm11 # .Lk_inv+16
+ movdqa 0x00(%r10), %xmm9 # .Lk_s0F
+ movdqa 0x30(%r10), %xmm13 # .Lk_sb1
+ movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16
+ movdqa 0x50(%r10), %xmm15 # .Lk_sb2
+ movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16
+ ret
+.size _vpaes_preheat,.-_vpaes_preheat
+########################################################
+## ##
+## Constants ##
+## ##
+########################################################
+.type _vpaes_consts,\@object
+.align 64
+_vpaes_consts:
+.Lk_inv: # inv, inva
+ .quad 0x0E05060F0D080180, 0x040703090A0B0C02
+ .quad 0x01040A060F0B0780, 0x030D0E0C02050809
+
+.Lk_s0F: # s0F
+ .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
+
+.Lk_ipt: # input transform (lo, hi)
+ .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
+ .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+
+.Lk_sb1: # sb1u, sb1t
+ .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+ .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.Lk_sb2: # sb2u, sb2t
+ .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
+ .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
+.Lk_sbo: # sbou, sbot
+ .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
+ .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+
+.Lk_mc_forward: # mc_forward
+ .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
+ .quad 0x080B0A0904070605, 0x000302010C0F0E0D
+ .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
+ .quad 0x000302010C0F0E0D, 0x080B0A0904070605
+
+.Lk_mc_backward:# mc_backward
+ .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
+ .quad 0x020100030E0D0C0F, 0x0A09080B06050407
+ .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
+ .quad 0x0A09080B06050407, 0x020100030E0D0C0F
+
+.Lk_sr: # sr
+ .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
+ .quad 0x030E09040F0A0500, 0x0B06010C07020D08
+ .quad 0x0F060D040B020900, 0x070E050C030A0108
+ .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
+
+.Lk_rcon: # rcon
+ .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_s63: # s63: all equal to 0x63 transformed
+ .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
+
+.Lk_opt: # output transform
+ .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
+ .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+
+.Lk_deskew: # deskew tables: inverts the sbox's "skew"
+ .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+ .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+##
+## Decryption stuff
+## Key schedule constants
+##
+.Lk_dksd: # decryption key schedule: invskew x*D
+ .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+ .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb: # decryption key schedule: invskew x*B
+ .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
+ .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse: # decryption key schedule: invskew x*E + 0x63
+ .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
+ .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9: # decryption key schedule: invskew x*9
+ .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
+ .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+##
+## Decryption stuff
+## Round function constants
+##
+.Lk_dipt: # decryption input transform
+ .quad 0x0F505B040B545F00, 0x154A411E114E451A
+ .quad 0x86E383E660056500, 0x12771772F491F194
+
+.Lk_dsb9: # decryption sbox output *9*u, *9*t
+ .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
+ .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd: # decryption sbox output *D*u, *D*t
+ .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+ .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb: # decryption sbox output *B*u, *B*t
+ .quad 0xD022649296B44200, 0x602646F6B0F2D404
+ .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe: # decryption sbox output *E*u, *E*t
+ .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
+ .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+.Lk_dsbo: # decryption sbox final output
+ .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+ .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+# Identification banner embedded in the object file, followed by
+# padding to a 64-byte boundary so the table size is a multiple of 64.
+.asciz "Vector Permutation AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
+.align 64
+.size _vpaes_consts,.-_vpaes_consts
+___
+
+if ($win64) {
+# Win64 structured-exception unwind support for the routines above.
+#
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+#
+# Every .xdata record below names se_handler and carries two RVAs in
+# HandlerData[]: the function's *_body label (end of prologue) and its
+# *_epilogue label.  If the faulting Rip lies in [body, epilogue) the
+# handler copies the ten saved XMM registers from 16(Rsp) back into
+# the CONTEXT (starting at context.Xmm6) and steps Rsp over the
+# 0xb8-byte frame.  In all cases it then reloads Rdi/Rsi from 8(Rsp)
+# and 16(Rsp), copies the CONTEXT to disp->ContextRecord, calls
+# RtlVirtualUnwind and returns ExceptionContinueSearch.
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lin_prologue
+
+ lea 16(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0xb8(%rax),%rax # adjust stack pointer
+
+.Lin_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$`1232/8`,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_${PREFIX}_set_encrypt_key
+ .rva .LSEH_end_${PREFIX}_set_encrypt_key
+ .rva .LSEH_info_${PREFIX}_set_encrypt_key
+
+ .rva .LSEH_begin_${PREFIX}_set_decrypt_key
+ .rva .LSEH_end_${PREFIX}_set_decrypt_key
+ .rva .LSEH_info_${PREFIX}_set_decrypt_key
+
+ .rva .LSEH_begin_${PREFIX}_encrypt
+ .rva .LSEH_end_${PREFIX}_encrypt
+ .rva .LSEH_info_${PREFIX}_encrypt
+
+ .rva .LSEH_begin_${PREFIX}_decrypt
+ .rva .LSEH_end_${PREFIX}_decrypt
+ .rva .LSEH_info_${PREFIX}_decrypt
+
+ .rva .LSEH_begin_${PREFIX}_cbc_encrypt
+ .rva .LSEH_end_${PREFIX}_cbc_encrypt
+ .rva .LSEH_info_${PREFIX}_cbc_encrypt
+
+.section .xdata
+.align 8
+.LSEH_info_${PREFIX}_set_encrypt_key:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[]
+.LSEH_info_${PREFIX}_set_decrypt_key:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Ldec_key_body,.Ldec_key_epilogue # HandlerData[]
+.LSEH_info_${PREFIX}_encrypt:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lenc_body,.Lenc_epilogue # HandlerData[]
+.LSEH_info_${PREFIX}_decrypt:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Ldec_body,.Ldec_epilogue # HandlerData[]
+.LSEH_info_${PREFIX}_cbc_encrypt:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[]
+___
+}
+
+# Replace every `expr` placeholder in the generated text with the
+# value of the Perl expression between the backticks.
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+# Closing flushes the buffered output; check the result so that a
+# failed write (or a failing consumer, if STDOUT is a pipe) aborts the
+# build instead of silently leaving truncated assembly behind.
+close STDOUT or die "error closing STDOUT: $!";
--- /dev/null
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
+# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
+# the time being... Except that it has two code paths: code suitable
+# for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and
+# later. Improvement varies from one benchmark and µ-arch to another.
+# Vanilla code path is at most 20% faster than compiler-generated code
+# [not very impressive], while PCLMULQDQ - whole 85%-160% better on
+# 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that
+# these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not
+# all CPU time is burnt in it...
+
+# Command line: [flavour] output.  A single argument containing a dot
+# is taken to be the output file name, with no flavour given.
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+# Win64 calling convention is selected by flavour or by a .asm output.
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+# Locate the perlasm translator next to this script or in ../../perlasm.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+# Everything printed from here on is piped through the translator;
+# fail loudly if the pipe cannot be started.
+open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
+
+# Register assignment for _mul_1x1.  Note that $a aliases $lo (%rax).
+($lo,$hi)=("%rax","%rdx"); $a=$lo;
+($i0,$i1)=("%rsi","%rdi");
+($t0,$t1)=("%rbx","%rcx");
+($b,$mask)=("%rbp","%r8");
+# Six table-building temporaries live in %r9..%r14; %r15 is not used.
+($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..14));
+($R,$Tx)=("%xmm0","%xmm1");
+
+# _mul_1x1: one 64x64 -> 128-bit polynomial (carry-less) multiplication.
+#
+# In:  $a (%rax) and $b (%rbp) are the operands; $mask (%r8) is the
+#      nibble mask, which the caller in this file sets to 0xf.
+# Out: $lo (%rax) = low 64 bits, $hi (%rdx) = high 64 bits.
+#
+# Method: the top three bits of a are handled up front by broadcasting
+# each bit and masking b with it.  The remaining 61 bits (a1) are used
+# to build a 16-entry table tab[i] = i*a1 on the stack, and b is then
+# consumed four bits at a time.  Lookups alternate between the integer
+# accumulators ($lo/$hi) and the SSE2 accumulator $R, which is folded
+# in at the end.  Clobbers $i0,$i1,$t0,$t1,$b, %r9-%r14, $R and $Tx.
+$code.=<<___;
+.text
+
+.type _mul_1x1,\@abi-omnipotent
+.align 16
+_mul_1x1:
+ sub \$128+8,%rsp
+ mov \$-1,$a1
+ lea ($a,$a),$i0
+ shr \$3,$a1
+ lea (,$a,4),$i1
+ and $a,$a1 # a1=a&0x1fffffffffffffff
+ lea (,$a,8),$a8
+ sar \$63,$a # broadcast 63rd bit
+ lea ($a1,$a1),$a2
+ sar \$63,$i0 # broadcast 62nd bit
+ lea (,$a1,4),$a4
+ and $b,$a
+ sar \$63,$i1 # broadcast 61st bit
+ mov $a,$hi # $a is $lo
+ shl \$63,$lo
+ and $b,$i0
+ shr \$1,$hi
+ mov $i0,$t1
+ shl \$62,$i0
+ and $b,$i1
+ shr \$2,$t1
+ xor $i0,$lo
+ mov $i1,$t0
+ shl \$61,$i1
+ xor $t1,$hi
+ shr \$3,$t0
+ xor $i1,$lo
+ xor $t0,$hi
+
+ mov $a1,$a12
+ movq \$0,0(%rsp) # tab[0]=0
+ xor $a2,$a12 # a1^a2
+ mov $a1,8(%rsp) # tab[1]=a1
+ mov $a4,$a48
+ mov $a2,16(%rsp) # tab[2]=a2
+ xor $a8,$a48 # a4^a8
+ mov $a12,24(%rsp) # tab[3]=a1^a2
+
+ xor $a4,$a1
+ mov $a4,32(%rsp) # tab[4]=a4
+ xor $a4,$a2
+ mov $a1,40(%rsp) # tab[5]=a1^a4
+ xor $a4,$a12
+ mov $a2,48(%rsp) # tab[6]=a2^a4
+ xor $a48,$a1 # a1^a4^a4^a8=a1^a8
+ mov $a12,56(%rsp) # tab[7]=a1^a2^a4
+ xor $a48,$a2 # a2^a4^a4^a8=a2^a8
+
+ mov $a8,64(%rsp) # tab[8]=a8
+ xor $a48,$a12 # a1^a2^a4^a4^a8=a1^a2^a8
+ mov $a1,72(%rsp) # tab[9]=a1^a8
+ xor $a4,$a1 # a1^a8^a4
+ mov $a2,80(%rsp) # tab[10]=a2^a8
+ xor $a4,$a2 # a2^a8^a4
+ mov $a12,88(%rsp) # tab[11]=a1^a2^a8
+
+ xor $a4,$a12 # a1^a2^a8^a4
+ mov $a48,96(%rsp) # tab[12]=a4^a8
+ mov $mask,$i0
+ mov $a1,104(%rsp) # tab[13]=a1^a4^a8
+ and $b,$i0
+ mov $a2,112(%rsp) # tab[14]=a2^a4^a8
+ shr \$4,$b
+ mov $a12,120(%rsp) # tab[15]=a1^a2^a4^a8
+ mov $mask,$i1
+ and $b,$i1
+ shr \$4,$b
+
+ movq (%rsp,$i0,8),$R # half of calculations is done in SSE2
+ mov $mask,$i0
+ and $b,$i0
+ shr \$4,$b
+___
+ # Seven unrolled steps, each consuming two nibbles of b: the one
+ # indexed by $i1 is shifted by 8*n-4 bits into $lo/$hi, the one
+ # indexed by $i0 is shifted by n bytes and accumulated in $R.
+ for ($n=1;$n<8;$n++) {
+ $code.=<<___;
+ mov (%rsp,$i1,8),$t1
+ mov $mask,$i1
+ mov $t1,$t0
+ shl \$`8*$n-4`,$t1
+ and $b,$i1
+ movq (%rsp,$i0,8),$Tx
+ shr \$`64-(8*$n-4)`,$t0
+ xor $t1,$lo
+ pslldq \$$n,$Tx
+ mov $mask,$i0
+ shr \$4,$b
+ xor $t0,$hi
+ and $b,$i0
+ shr \$4,$b
+ pxor $Tx,$R
+___
+ }
+# Tail: $n is 8 here, so the last nibble is shifted by 60 bits; the
+# 128-bit SSE2 accumulator is then folded into $lo/$hi.
+$code.=<<___;
+ mov (%rsp,$i1,8),$t1
+ mov $t1,$t0
+ shl \$`8*$n-4`,$t1
+ movq $R,$i0
+ shr \$`64-(8*$n-4)`,$t0
+ xor $t1,$lo
+ psrldq \$8,$R
+ xor $t0,$hi
+ movq $R,$i1
+ xor $i0,$lo
+ xor $i1,$hi
+
+ add \$128+8,%rsp
+ ret
+.Lend_mul_1x1:
+.size _mul_1x1,.-_mul_1x1
+___
+
+# bn_GF2m_mul_2x2: 128x128 -> 256-bit polynomial multiplication.
+#
+# Five arguments in ABI order: $rp (four-word result buffer), $a1, $a0,
+# $b1, $b0.  On Win64 the fifth argument lives on the stack, hence the
+# 40(%rsp) loads below.  The product is formed from three 64x64
+# multiplications: a1*b1, a0*b0 and (a0+a1)*(b0+b1).
+#
+# Bit 33 of OPENSSL_ia32cap_P selects the PCLMULQDQ path; otherwise the
+# "vanilla" path calls _mul_1x1 three times using a 17-word stack frame
+# that holds the partial products, the saved arguments and the saved
+# callee-saved registers.
+#
+# NOTE: $a1 is re-bound here and no longer names the _mul_1x1 temporary.
+($rp,$a1,$a0,$b1,$b0) = $win64? ("%rcx","%rdx","%r8", "%r9","%r10") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx","%r8"); # Unix order
+
+$code.=<<___;
+.extern OPENSSL_ia32cap_P
+.globl bn_GF2m_mul_2x2
+.type bn_GF2m_mul_2x2,\@abi-omnipotent
+.align 16
+bn_GF2m_mul_2x2:
+ mov OPENSSL_ia32cap_P(%rip),%rax
+ bt \$33,%rax
+ jnc .Lvanilla_mul_2x2
+
+ movq $a1,%xmm0
+ movq $b1,%xmm1
+ movq $a0,%xmm2
+___
+# b0 comes from the stack on Win64 and from a register elsewhere.
+$code.=<<___ if ($win64);
+ movq 40(%rsp),%xmm3
+___
+$code.=<<___ if (!$win64);
+ movq $b0,%xmm3
+___
+$code.=<<___;
+ movdqa %xmm0,%xmm4
+ movdqa %xmm1,%xmm5
+ pclmulqdq \$0,%xmm1,%xmm0 # a1·b1
+ pxor %xmm2,%xmm4
+ pxor %xmm3,%xmm5
+ pclmulqdq \$0,%xmm3,%xmm2 # a0·b0
+ pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1)
+ xorps %xmm0,%xmm4
+ xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1
+ movdqa %xmm4,%xmm5
+ pslldq \$8,%xmm4
+ psrldq \$8,%xmm5
+ pxor %xmm4,%xmm2
+ pxor %xmm5,%xmm0
+ movdqu %xmm2,0($rp)
+ movdqu %xmm0,16($rp)
+ ret
+
+.align 16
+.Lvanilla_mul_2x2:
+ lea -8*17(%rsp),%rsp
+___
+# Win64 only: fetch b0 from the caller's frame (offset grown by the
+# 17-word frame just allocated) and save %rdi/%rsi, which _mul_1x1
+# uses as scratch.
+$code.=<<___ if ($win64);
+ mov `8*17+40`(%rsp),$b0
+ mov %rdi,8*15(%rsp)
+ mov %rsi,8*16(%rsp)
+___
+$code.=<<___;
+ mov %r14,8*10(%rsp)
+ mov %r13,8*11(%rsp)
+ mov %r12,8*12(%rsp)
+ mov %rbp,8*13(%rsp)
+ mov %rbx,8*14(%rsp)
+.Lbody_mul_2x2:
+ mov $rp,32(%rsp) # save the arguments
+ mov $a1,40(%rsp)
+ mov $a0,48(%rsp)
+ mov $b1,56(%rsp)
+ mov $b0,64(%rsp)
+
+ mov \$0xf,$mask
+ mov $a1,$a
+ mov $b1,$b
+ call _mul_1x1 # a1·b1
+ mov $lo,16(%rsp)
+ mov $hi,24(%rsp)
+
+ mov 48(%rsp),$a
+ mov 64(%rsp),$b
+ call _mul_1x1 # a0·b0
+ mov $lo,0(%rsp)
+ mov $hi,8(%rsp)
+
+ mov 40(%rsp),$a
+ mov 56(%rsp),$b
+ xor 48(%rsp),$a
+ xor 64(%rsp),$b
+ call _mul_1x1 # (a0+a1)·(b0+b1)
+___
+ # @r holds the two earlier partial products reloaded from the frame:
+ # r[0],r[1] = a0*b0 (lo,hi) and r[2],r[3] = a1*b1 (lo,hi).
+ @r=("%rbx","%rcx","%rdi","%rsi");
+$code.=<<___;
+ mov 0(%rsp),@r[0]
+ mov 8(%rsp),@r[1]
+ mov 16(%rsp),@r[2]
+ mov 24(%rsp),@r[3]
+ mov 32(%rsp),%rbp
+
+ xor $hi,$lo
+ xor @r[1],$hi
+ xor @r[0],$lo
+ mov @r[0],0(%rbp)
+ xor @r[2],$hi
+ mov @r[3],24(%rbp)
+ xor @r[3],$lo
+ xor @r[3],$hi
+ xor $hi,$lo
+ mov $hi,16(%rbp)
+ mov $lo,8(%rbp)
+
+ mov 8*10(%rsp),%r14
+ mov 8*11(%rsp),%r13
+ mov 8*12(%rsp),%r12
+ mov 8*13(%rsp),%rbp
+ mov 8*14(%rsp),%rbx
+___
+$code.=<<___ if ($win64);
+ mov 8*15(%rsp),%rdi
+ mov 8*16(%rsp),%rsi
+___
+$code.=<<___;
+ lea 8*17(%rsp),%rsp
+ ret
+.Lend_mul_2x2:
+.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+.asciz "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 16
+___
+
+# Win64 structured-exception unwind support.
+#
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+#
+# se_handler serves the vanilla path of bn_GF2m_mul_2x2 only.  When
+# the faulting Rip is at or past .Lbody_mul_2x2 it reloads the saved
+# registers from the 17-word frame into the CONTEXT; in every case it
+# then steps Rsp over that frame, copies the CONTEXT to
+# disp->ContextRecord, calls RtlVirtualUnwind and returns
+# ExceptionContinueSearch.  _mul_1x1 needs no handler: its fixed stack
+# adjustment is described by plain unwind codes in .LSEH_info_1x1.
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 152($context),%rax # pull context->Rsp
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lbody_mul_2x2(%rip),%r10
+ cmp %r10,%rbx # context->Rip<"prologue" label
+ jb .Lin_prologue
+
+ mov 8*10(%rax),%r14 # mimic epilogue
+ mov 8*11(%rax),%r13
+ mov 8*12(%rax),%r12
+ mov 8*13(%rax),%rbp
+ mov 8*14(%rax),%rbx
+ mov 8*15(%rax),%rdi
+ mov 8*16(%rax),%rsi
+
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+
+.Lin_prologue:
+ lea 8*17(%rax),%rax
+ mov %rax,152($context) # restore context->Rsp
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT) in quadwords (1232/8)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+ .rva _mul_1x1
+ .rva .Lend_mul_1x1
+ .rva .LSEH_info_1x1
+
+ .rva .Lvanilla_mul_2x2
+ .rva .Lend_mul_2x2
+ .rva .LSEH_info_2x2
+.section .xdata
+.align 8
+.LSEH_info_1x1:
+ .byte 0x01,0x07,0x02,0x00
+ .byte 0x07,0x01,0x11,0x00 # sub rsp,128+8
+.LSEH_info_2x2:
+ .byte 9,0,0,0
+ .rva se_handler
+___
+}
+
+# Replace every `expr` placeholder in the generated text with the
+# value of the Perl expression between the backticks, then emit it.
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+print $code;
+# STDOUT is a pipe into x86_64-xlate.pl: close reports both write
+# errors and a non-zero exit of the translator, so check it rather
+# than silently producing truncated or missing assembly.
+close STDOUT or die "error closing STDOUT: $!";
#!/usr/bin/env perl
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# respectful 50%. It remains to be seen if loop unrolling and
# dedicated squaring routine can provide further improvement...
+# July 2011.
+#
+# Add dedicated squaring procedure. Performance improvement varies
+# from platform to platform, but in average it's ~5%/15%/25%/33%
+# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
+
+# August 2011.
+#
+# Unroll and modulo-schedule inner loops in such manner that they
+# are "fallen through" for input lengths of 8, which is critical for
+# 1024-bit RSA *sign*. Average performance improvement in comparison
+# to *initial* version of this module from 2005 is ~0%/30%/40%/45%
+# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
+
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$num="%r9"; # int num);
$lo0="%r10";
$hi0="%r11";
-$bp="%r12"; # reassign $bp
$hi1="%r13";
$i="%r14";
$j="%r15";
.type bn_mul_mont,\@function,6
.align 16
bn_mul_mont:
+ test \$3,${num}d
+ jnz .Lmul_enter
+ cmp \$8,${num}d
+ jb .Lmul_enter
+ cmp $ap,$bp
+ jne .Lmul4x_enter
+ jmp .Lsqr4x_enter
+
+.align 16
+.Lmul_enter:
push %rbx
push %rbp
push %r12
and \$-1024,%rsp # minimize TLB usage
mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
-.Lprologue:
- mov %rdx,$bp # $bp reassigned, remember?
-
+.Lmul_body:
+ mov $bp,%r12 # reassign $bp
+___
+ $bp="%r12";
+$code.=<<___;
mov ($n0),$n0 # pull n0[0] value
+ mov ($bp),$m0 # m0=bp[0]
+ mov ($ap),%rax
xor $i,$i # i=0
xor $j,$j # j=0
- mov ($bp),$m0 # m0=bp[0]
- mov ($ap),%rax
+ mov $n0,$m1
mulq $m0 # ap[0]*bp[0]
mov %rax,$lo0
- mov %rdx,$hi0
+ mov ($np),%rax
- imulq $n0,%rax # "tp[0]"*n0
- mov %rax,$m1
+ imulq $lo0,$m1 # "tp[0]"*n0
+ mov %rdx,$hi0
- mulq ($np) # np[0]*m1
- add $lo0,%rax # discarded
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
adc \$0,%rdx
mov %rdx,$hi1
lea 1($j),$j # j++
+ jmp .L1st_enter
+
+.align 16
.L1st:
+ add %rax,$hi1
mov ($ap,$j,8),%rax
- mulq $m0 # ap[j]*bp[0]
- add $hi0,%rax
adc \$0,%rdx
- mov %rax,$lo0
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
+ mov $lo0,$hi0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+.L1st_enter:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$hi0
mov ($np,$j,8),%rax
- mov %rdx,$hi0
+ adc \$0,%rdx
+ lea 1($j),$j # j++
+ mov %rdx,$lo0
mulq $m1 # np[j]*m1
- add $hi1,%rax
- lea 1($j),$j # j++
+ cmp $num,$j
+ jne .L1st
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
adc \$0,%rdx
- add $lo0,%rax # np[j]*m1+ap[j]*bp[0]
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
- mov %rax,-16(%rsp,$j,8) # tp[j-1]
- cmp $num,$j
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$hi1
- jl .L1st
+ mov $lo0,$hi0
xor %rdx,%rdx
add $hi0,$hi1
mov %rdx,(%rsp,$num,8) # store upmost overflow bit
lea 1($i),$i # i++
-.align 4
+ jmp .Louter
+.align 16
.Louter:
- xor $j,$j # j=0
-
mov ($bp,$i,8),$m0 # m0=bp[i]
- mov ($ap),%rax # ap[0]
+ xor $j,$j # j=0
+ mov $n0,$m1
+ mov (%rsp),$lo0
mulq $m0 # ap[0]*bp[i]
- add (%rsp),%rax # ap[0]*bp[i]+tp[0]
+ add %rax,$lo0 # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
adc \$0,%rdx
- mov %rax,$lo0
- mov %rdx,$hi0
- imulq $n0,%rax # tp[0]*n0
- mov %rax,$m1
+ imulq $lo0,$m1 # tp[0]*n0
+ mov %rdx,$hi0
- mulq ($np,$j,8) # np[0]*m1
- add $lo0,%rax # discarded
- mov 8(%rsp),$lo0 # tp[1]
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
adc \$0,%rdx
+ mov 8(%rsp),$lo0 # tp[1]
mov %rdx,$hi1
lea 1($j),$j # j++
-.align 4
+ jmp .Linner_enter
+
+.align 16
.Linner:
+ add %rax,$hi1
mov ($ap,$j,8),%rax
- mulq $m0 # ap[j]*bp[i]
- add $hi0,%rax
adc \$0,%rdx
- add %rax,$lo0 # ap[j]*bp[i]+tp[j]
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
+ mov (%rsp,$j,8),$lo0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+.Linner_enter:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$hi0
mov ($np,$j,8),%rax
adc \$0,%rdx
+ add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
mov %rdx,$hi0
+ adc \$0,$hi0
+ lea 1($j),$j # j++
mulq $m1 # np[j]*m1
- add $hi1,%rax
- lea 1($j),$j # j++
- adc \$0,%rdx
- add $lo0,%rax # np[j]*m1+ap[j]*bp[i]+tp[j]
+ cmp $num,$j
+ jne .Linner
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
adc \$0,%rdx
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
mov (%rsp,$j,8),$lo0
- cmp $num,$j
- mov %rax,-16(%rsp,$j,8) # tp[j-1]
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$hi1
- jl .Linner
xor %rdx,%rdx
add $hi0,$hi1
cmp $num,$i
jl .Louter
- lea (%rsp),$ap # borrow ap for tp
- lea -1($num),$j # j=num-1
-
- mov ($ap),%rax # tp[0]
xor $i,$i # i=0 and clear CF!
+ mov (%rsp),%rax # tp[0]
+ lea (%rsp),$ap # borrow ap for tp
+ mov $num,$j # j=num
jmp .Lsub
.align 16
.Lsub: sbb ($np,$i,8),%rax
mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
- dec $j # doesn't affect CF!
mov 8($ap,$i,8),%rax # tp[i+1]
lea 1($i),$i # i++
- jge .Lsub
+ dec $j # doesn't affect CF!
+ jnz .Lsub
sbb \$0,%rax # handle upmost overflow bit
+ xor $i,$i
and %rax,$ap
not %rax
mov $rp,$np
and %rax,$np
- lea -1($num),$j
+ mov $num,$j # j=num
or $np,$ap # ap=borrow?tp:rp
.align 16
.Lcopy: # copy or in-place refresh
+ mov ($ap,$i,8),%rax
+ mov $i,(%rsp,$i,8) # zap temporary vector
+ mov %rax,($rp,$i,8) # rp[i]=tp[i]
+ lea 1($i),$i
+ sub \$1,$j
+ jnz .Lcopy
+
+ mov 8(%rsp,$num,8),%rsi # restore %rsp
+ mov \$1,%rax
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lmul_epilogue:
+ ret
+.size bn_mul_mont,.-bn_mul_mont
+___
+{{{
+my @A=("%r10","%r11");
+my @N=("%r13","%rdi");
+$code.=<<___;
+.type bn_mul4x_mont,\@function,6
+.align 16
+bn_mul4x_mont:
+.Lmul4x_enter:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov ${num}d,${num}d
+ lea 4($num),%r10
+ mov %rsp,%r11
+ neg %r10
+ lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4))
+ and \$-1024,%rsp # minimize TLB usage
+
+ mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
+.Lmul4x_body:
+ mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
+ mov %rdx,%r12 # reassign $bp
+___
+ $bp="%r12";
+$code.=<<___;
+ mov ($n0),$n0 # pull n0[0] value
+ mov ($bp),$m0 # m0=bp[0]
+ mov ($ap),%rax
+
+ xor $i,$i # i=0
+ xor $j,$j # j=0
+
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[0]
+ mov %rax,$A[0]
+ mov ($np),%rax
+
+ imulq $A[0],$m1 # "tp[0]"*n0
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0
+ add %rax,$A[1]
+ mov 8($np),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1
+ add %rax,$N[1]
+ mov 16($ap),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ lea 4($j),$j # j++
+ adc \$0,%rdx
+ mov $N[1],(%rsp)
+ mov %rdx,$N[0]
+ jmp .L1st4x
+.align 16
+.L1st4x:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
mov ($ap,$j,8),%rax
- mov %rax,($rp,$j,8) # rp[i]=tp[i]
- mov $i,(%rsp,$j,8) # zap temporary vector
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-8(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov 8($np,$j,8),%rax
+ adc \$0,%rdx
+ lea 4($j),$j # j++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov -16($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+ cmp $num,$j
+ jl .L1st4x
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ mov $N[0],-8(%rsp,$j,8)
+ mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+.align 4
+.Louter4x:
+ mov ($bp,$i,8),$m0 # m0=bp[i]
+ xor $j,$j # j=0
+ mov (%rsp),$A[0]
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[i]
+ add %rax,$A[0] # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
+ adc \$0,%rdx
+
+ imulq $A[0],$m1 # tp[0]*n0
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # "$N[0]", discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 8($np),%rax
+ adc \$0,%rdx
+ add 8(%rsp),$A[1] # +tp[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov 16($ap),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
+ lea 4($j),$j # j+=4
+ adc \$0,%rdx
+ mov $N[1],(%rsp) # tp[j-1]
+ mov %rdx,$N[0]
+ jmp .Linner4x
+.align 16
+.Linner4x:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ add -8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[0],-8(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 8($np,$j,8),%rax
+ adc \$0,%rdx
+ add 8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ lea 4($j),$j # j++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov -16($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+ cmp $num,$j
+ jl .Linner4x
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ add -8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ lea 1($i),$i # i++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ add (%rsp,$num,8),$N[0] # pull upmost overflow bit
+ adc \$0,$N[1]
+ mov $N[0],-8(%rsp,$j,8)
+ mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+
+ cmp $num,$i
+ jl .Louter4x
+___
+{
+my @ri=("%rax","%rdx",$m0,$m1);
+$code.=<<___;
+ mov 16(%rsp,$num,8),$rp # restore $rp
+ mov 0(%rsp),@ri[0] # tp[0]
+ pxor %xmm0,%xmm0
+ mov 8(%rsp),@ri[1] # tp[1]
+ shr \$2,$num # num/=4
+ lea (%rsp),$ap # borrow ap for tp
+ xor $i,$i # i=0 and clear CF!
+
+ sub 0($np),@ri[0]
+ mov 16($ap),@ri[2] # tp[2]
+ mov 24($ap),@ri[3] # tp[3]
+ sbb 8($np),@ri[1]
+ lea -1($num),$j # j=num/4-1
+ jmp .Lsub4x
+.align 16
+.Lsub4x:
+ mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 16($np,$i,8),@ri[2]
+ mov 32($ap,$i,8),@ri[0] # tp[i+1]
+ mov 40($ap,$i,8),@ri[1]
+ sbb 24($np,$i,8),@ri[3]
+ mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 32($np,$i,8),@ri[0]
+ mov 48($ap,$i,8),@ri[2]
+ mov 56($ap,$i,8),@ri[3]
+ sbb 40($np,$i,8),@ri[1]
+ lea 4($i),$i # i++
+ dec $j # doesn't affect CF!
+ jnz .Lsub4x
+
+ mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov 32($ap,$i,8),@ri[0] # load overflow bit
+ sbb 16($np,$i,8),@ri[2]
+ mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 24($np,$i,8),@ri[3]
+ mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
+
+ sbb \$0,@ri[0] # handle upmost overflow bit
+ mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
+ xor $i,$i # i=0
+ and @ri[0],$ap
+ not @ri[0]
+ mov $rp,$np
+ and @ri[0],$np
+ lea -1($num),$j
+ or $np,$ap # ap=borrow?tp:rp
+
+ movdqu ($ap),%xmm1
+ movdqa %xmm0,(%rsp)
+ movdqu %xmm1,($rp)
+ jmp .Lcopy4x
+.align 16
+.Lcopy4x: # copy or in-place refresh
+ movdqu 16($ap,$i),%xmm2
+ movdqu 32($ap,$i),%xmm1
+ movdqa %xmm0,16(%rsp,$i)
+ movdqu %xmm2,16($rp,$i)
+ movdqa %xmm0,32(%rsp,$i)
+ movdqu %xmm1,32($rp,$i)
+ lea 32($i),$i
dec $j
- jge .Lcopy
+ jnz .Lcopy4x
+ shl \$2,$num
+ movdqu 16($ap,$i),%xmm2
+ movdqa %xmm0,16(%rsp,$i)
+ movdqu %xmm2,16($rp,$i)
+___
+}
+$code.=<<___;
mov 8(%rsp,$num,8),%rsi # restore %rsp
mov \$1,%rax
mov (%rsi),%r15
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
-.Lepilogue:
+.Lmul4x_epilogue:
ret
-.size bn_mul_mont,.-bn_mul_mont
+.size bn_mul4x_mont,.-bn_mul4x_mont
+___
+}}}
+\f{{{
+######################################################################
+# void bn_sqr4x_mont(
+my $rptr="%rdi"; # const BN_ULONG *rptr,
+my $aptr="%rsi"; # const BN_ULONG *aptr,
+my $bptr="%rdx"; # not used
+my $nptr="%rcx"; # const BN_ULONG *nptr,
+my $n0 ="%r8"; # const BN_ULONG *n0);
+my $num ="%r9"; # int num, has to be divisible by 4 and
+ # not less than 8
+
+my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
+my @A0=("%r10","%r11");
+my @A1=("%r12","%r13");
+my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
+
+$code.=<<___;
+.type bn_sqr4x_mont,\@function,6
+.align 16
+bn_sqr4x_mont:
+.Lsqr4x_enter:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ shl \$3,${num}d # convert $num to bytes
+ xor %r10,%r10
+ mov %rsp,%r11 # put aside %rsp
+ sub $num,%r10 # -$num
+ mov ($n0),$n0 # *n0
+ lea -72(%rsp,%r10,2),%rsp # alloca(frame+2*$num)
+ and \$-1024,%rsp # minimize TLB usage
+ ##############################################################
+ # Stack layout
+ #
+ # +0 saved $num, used in reduction section
+ # +8 &t[2*$num], used in reduction section
+ # +32 saved $rptr
+ # +40 saved $nptr
+ # +48 saved *n0
+ # +56 saved %rsp
+ # +64 t[2*$num]
+ #
+ mov $rptr,32(%rsp) # save $rptr
+ mov $nptr,40(%rsp)
+ mov $n0, 48(%rsp)
+ mov %r11, 56(%rsp) # save original %rsp
+.Lsqr4x_body:
+ ##############################################################
+ # Squaring part:
+ #
+ # a) multiply-n-add everything but a[i]*a[i];
+ # b) shift result of a) by 1 to the left and accumulate
+ # a[i]*a[i] products;
+ #
+ lea 32(%r10),$i # $i=-($num-32)
+ lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2]
+
+ mov $num,$j # $j=$num
+
+ # comments apply to $num==8 case
+ mov -32($aptr,$i),$a0 # a[0]
+ lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
+ mov -24($aptr,$i),%rax # a[1]
+ lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
+ mov -16($aptr,$i),$ai # a[2]
+ mov %rax,$a1
+
+ mul $a0 # a[1]*a[0]
+ mov %rax,$A0[0] # a[1]*a[0]
+ mov $ai,%rax # a[2]
+ mov %rdx,$A0[1]
+ mov $A0[0],-24($tptr,$i) # t[1]
+
+ xor $A0[0],$A0[0]
+ mul $a0 # a[2]*a[0]
+ add %rax,$A0[1]
+ mov $ai,%rax
+ adc %rdx,$A0[0]
+ mov $A0[1],-16($tptr,$i) # t[2]
+
+ lea -16($i),$j # j=-16
+
+
+ mov 8($aptr,$j),$ai # a[3]
+ mul $a1 # a[2]*a[1]
+ mov %rax,$A1[0] # a[2]*a[1]+t[3]
+ mov $ai,%rax
+ mov %rdx,$A1[1]
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ lea 16($j),$j
+ adc \$0,$A0[1]
+ mul $a0 # a[3]*a[0]
+ add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],-8($tptr,$j) # t[3]
+ jmp .Lsqr4x_1st
+
+.align 16
+.Lsqr4x_1st: # first pass: products are only stored, t[] is not read back
+ mov ($aptr,$j),$ai # a[4]
+ xor $A1[0],$A1[0]
+ mul $a1 # a[3]*a[1]
+ add %rax,$A1[1] # a[3]*a[1]+t[4]
+ mov $ai,%rax
+ adc %rdx,$A1[0]
+
+ xor $A0[0],$A0[0]
+ add $A1[1],$A0[1]
+ adc \$0,$A0[0]
+ mul $a0 # a[4]*a[0]
+ add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
+ mov $ai,%rax # a[4]
+ adc %rdx,$A0[0]
+ mov $A0[1],($tptr,$j) # t[4]
+
+
+ mov 8($aptr,$j),$ai # a[5]
+ xor $A1[1],$A1[1]
+ mul $a1 # a[4]*a[1]
+ add %rax,$A1[0] # a[4]*a[1]+t[5]
+ mov $ai,%rax
+ adc %rdx,$A1[1]
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ adc \$0,$A0[1]
+ mul $a0 # a[5]*a[0]
+ add %rax,$A0[0] # a[5]*a[0]+a[4]*a[1]+t[5]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],8($tptr,$j) # t[5]
+
+ mov 16($aptr,$j),$ai # a[6]
+ xor $A1[0],$A1[0]
+ mul $a1 # a[5]*a[1]
+ add %rax,$A1[1] # a[5]*a[1]+t[6]
+ mov $ai,%rax
+ adc %rdx,$A1[0]
+
+ xor $A0[0],$A0[0]
+ add $A1[1],$A0[1]
+ adc \$0,$A0[0]
+ mul $a0 # a[6]*a[0]
+ add %rax,$A0[1] # a[6]*a[0]+a[5]*a[1]+t[6]
+ mov $ai,%rax # a[6]
+ adc %rdx,$A0[0]
+ mov $A0[1],16($tptr,$j) # t[6]
+
+
+ mov 24($aptr,$j),$ai # a[7]
+ xor $A1[1],$A1[1]
+ mul $a1 # a[6]*a[1]
+ add %rax,$A1[0] # a[6]*a[1]+t[7]
+ mov $ai,%rax
+ adc %rdx,$A1[1]
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ lea 32($j),$j # advance by 32 bytes, i.e. four limbs
+ adc \$0,$A0[1]
+ mul $a0 # a[7]*a[0]
+ add %rax,$A0[0] # a[7]*a[0]+a[6]*a[1]+t[7]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],-8($tptr,$j) # t[7]
+
+ cmp \$0,$j
+ jne .Lsqr4x_1st
+
+ xor $A1[0],$A1[0]
+ add $A0[1],$A1[1]
+ adc \$0,$A1[0]
+ mul $a1 # a[7]*a[1]
+ add %rax,$A1[1]
+ adc %rdx,$A1[0]
+
+ mov $A1[1],($tptr) # t[8]
+ lea 16($i),$i
+ mov $A1[0],8($tptr) # t[9]
+ jmp .Lsqr4x_outer
+
+.align 16
+.Lsqr4x_outer: # comments apply to $num==6 case, indices are relative to the current window
+ mov -32($aptr,$i),$a0 # a[0]
+ lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
+ mov -24($aptr,$i),%rax # a[1]
+ lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
+ mov -16($aptr,$i),$ai # a[2]
+ mov %rax,$a1
+
+ mov -24($tptr,$i),$A0[0] # t[1]
+ xor $A0[1],$A0[1]
+ mul $a0 # a[1]*a[0]
+ add %rax,$A0[0] # a[1]*a[0]+t[1]
+ mov $ai,%rax # a[2]
+ adc %rdx,$A0[1]
+ mov $A0[0],-24($tptr,$i) # t[1]
+
+ xor $A0[0],$A0[0]
+ add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2]
+ adc \$0,$A0[0]
+ mul $a0 # a[2]*a[0]
+ add %rax,$A0[1]
+ mov $ai,%rax
+ adc %rdx,$A0[0]
+ mov $A0[1],-16($tptr,$i) # t[2]
+
+ lea -16($i),$j # j=i-16
+ xor $A1[0],$A1[0]
+
+
+ mov 8($aptr,$j),$ai # a[3]
+ xor $A1[1],$A1[1]
+ add 8($tptr,$j),$A1[0]
+ adc \$0,$A1[1]
+ mul $a1 # a[2]*a[1]
+ add %rax,$A1[0] # a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc %rdx,$A1[1]
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ adc \$0,$A0[1]
+ mul $a0 # a[3]*a[0]
+ add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],8($tptr,$j) # t[3]
+
+ lea 16($j),$j
+ jmp .Lsqr4x_inner
+
+.align 16
+.Lsqr4x_inner: # accumulates onto t[] written by earlier passes
+ mov ($aptr,$j),$ai # a[4]
+ xor $A1[0],$A1[0]
+ add ($tptr,$j),$A1[1]
+ adc \$0,$A1[0]
+ mul $a1 # a[3]*a[1]
+ add %rax,$A1[1] # a[3]*a[1]+t[4]
+ mov $ai,%rax
+ adc %rdx,$A1[0]
+
+ xor $A0[0],$A0[0]
+ add $A1[1],$A0[1]
+ adc \$0,$A0[0]
+ mul $a0 # a[4]*a[0]
+ add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
+ mov $ai,%rax # a[4]
+ adc %rdx,$A0[0]
+ mov $A0[1],($tptr,$j) # t[4]
+
+ mov 8($aptr,$j),$ai # a[5]
+ xor $A1[1],$A1[1]
+ add 8($tptr,$j),$A1[0]
+ adc \$0,$A1[1]
+ mul $a1 # a[4]*a[1]
+ add %rax,$A1[0] # a[4]*a[1]+t[5]
+ mov $ai,%rax
+ adc %rdx,$A1[1]
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ lea 16($j),$j # advance by 16 bytes, i.e. two limbs
+ adc \$0,$A0[1]
+ mul $a0 # a[5]*a[0]
+ add %rax,$A0[0] # a[5]*a[0]+a[4]*a[1]+t[5]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below
+
+ cmp \$0,$j
+ jne .Lsqr4x_inner
+
+ xor $A1[0],$A1[0]
+ add $A0[1],$A1[1]
+ adc \$0,$A1[0]
+ mul $a1 # a[5]*a[1]
+ add %rax,$A1[1]
+ adc %rdx,$A1[0]
+
+ mov $A1[1],($tptr) # t[6], "preloaded t[2]" below
+ mov $A1[0],8($tptr) # t[7], "preloaded t[3]" below
+
+ add \$16,$i
+ jnz .Lsqr4x_outer
+
+ # comments apply to $num==4 case
+ mov -32($aptr),$a0 # a[0]
+ lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
+ mov -24($aptr),%rax # a[1]
+ lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
+ mov -16($aptr),$ai # a[2]
+ mov %rax,$a1
+
+ xor $A0[1],$A0[1]
+ mul $a0 # a[1]*a[0]
+ add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1]
+ mov $ai,%rax # a[2]
+ adc %rdx,$A0[1]
+ mov $A0[0],-24($tptr) # t[1]
+
+ xor $A0[0],$A0[0]
+ add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2]
+ adc \$0,$A0[0]
+ mul $a0 # a[2]*a[0]
+ add %rax,$A0[1]
+ mov $ai,%rax
+ adc %rdx,$A0[0]
+ mov $A0[1],-16($tptr) # t[2]
+
+ mov -8($aptr),$ai # a[3]
+ mul $a1 # a[2]*a[1]
+ add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3]
+ mov $ai,%rax
+ adc \$0,%rdx
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ mov %rdx,$A1[1]
+ adc \$0,$A0[1]
+ mul $a0 # a[3]*a[0]
+ add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],-8($tptr) # t[3]
+
+ xor $A1[0],$A1[0]
+ add $A0[1],$A1[1]
+ adc \$0,$A1[0]
+ mul $a1 # a[3]*a[1]
+ add %rax,$A1[1]
+ mov -16($aptr),%rax # a[2]
+ adc %rdx,$A1[0]
+
+ mov $A1[1],($tptr) # t[4]
+ mov $A1[0],8($tptr) # t[5]
+
+ mul $ai # a[2]*a[3]
+___
+{
+my ($shift,$carry)=($a0,$a1);
+my @S=(@A1,$ai,$n0);
+$code.=<<___;
+ add \$16,$i
+ xor $shift,$shift
+ sub $num,$i # $i=16-$num
+ xor $carry,$carry
+
+ add $A1[0],%rax # t[5]
+ adc \$0,%rdx
+ mov %rax,8($tptr) # t[5]
+ mov %rdx,16($tptr) # t[6]
+ mov $carry,24($tptr) # t[7]
+
+ mov -16($aptr,$i),%rax # a[0]
+ lea 64(%rsp,$num,2),$tptr
+ xor $A0[0],$A0[0] # t[0]
+ mov -24($tptr,$i,2),$A0[1] # t[1]
+
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov -8($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[0],-32($tptr,$i,2)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
+ mov $S[1],-24($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[2]
+ mov 0($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[2],-16($tptr,$i,2)
+ adc %rdx,$S[3]
+ lea 16($i),$i
+ mov $S[3],-40($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ jmp .Lsqr4x_shift_n_add
+
+.align 16
+.Lsqr4x_shift_n_add:
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov -8($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[0],-32($tptr,$i,2)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
+ mov $S[1],-24($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[2]
+ mov 0($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[2],-16($tptr,$i,2)
+ adc %rdx,$S[3]
+
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ mov $S[3],-8($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov 16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 24($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov 8($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[0],0($tptr,$i,2)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
+ mov $S[1],8($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mov 32($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 40($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[2]
+ mov 16($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[2],16($tptr,$i,2)
+ adc %rdx,$S[3]
+ mov $S[3],24($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ add \$32,$i
+ jnz .Lsqr4x_shift_n_add
+
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov -16($tptr),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov -8($aptr),%rax # a[i+1] # prefetch
+ mov $S[0],-32($tptr)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift
+ mov $S[1],-24($tptr)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ adc %rax,$S[2]
+ adc %rdx,$S[3]
+ mov $S[2],-16($tptr)
+ mov $S[3],-8($tptr)
+___
+}\f
+##############################################################
+# Montgomery reduction part, "word-by-word" algorithm.
+#
+{
+my ($topbit,$nptr)=("%rbp",$aptr);
+my ($m0,$m1)=($a0,$a1);
+my @Ni=("%rbx","%r9");
+$code.=<<___;
+ mov 40(%rsp),$nptr # restore $nptr
+ mov 48(%rsp),$n0 # restore *n0
+ xor $j,$j
+ mov $num,0(%rsp) # save $num
+ sub $num,$j # $j=-$num
+ mov 64(%rsp),$A0[0] # t[0] # modsched #
+ mov $n0,$m0 # # modsched #
+ lea 64(%rsp,$num,2),%rax # end of t[] buffer
+ lea 64(%rsp,$num),$tptr # end of t[] window
+ mov %rax,8(%rsp) # save end of t[] buffer
+ lea ($nptr,$num),$nptr # end of n[] buffer
+ xor $topbit,$topbit # $topbit=0
+
+ mov 0($nptr,$j),%rax # n[0] # modsched #
+ mov 8($nptr,$j),$Ni[1] # n[1] # modsched #
+ imulq $A0[0],$m0 # m0=t[0]*n0 # modsched #
+ mov %rax,$Ni[0] # # modsched #
+ jmp .Lsqr4x_mont_outer
+
+.align 16
+.Lsqr4x_mont_outer:
+ xor $A0[1],$A0[1]
+ mul $m0 # n[0]*m0
+ add %rax,$A0[0] # n[0]*m0+t[0]
+ mov $Ni[1],%rax
+ adc %rdx,$A0[1]
+ mov $n0,$m1
+
+ xor $A0[0],$A0[0]
+ add 8($tptr,$j),$A0[1]
+ adc \$0,$A0[0]
+ mul $m0 # n[1]*m0
+ add %rax,$A0[1] # n[1]*m0+t[1]
+ mov $Ni[0],%rax
+ adc %rdx,$A0[0]
+
+ imulq $A0[1],$m1
+
+ mov 16($nptr,$j),$Ni[0] # n[2]
+ xor $A1[1],$A1[1]
+ add $A0[1],$A1[0]
+ adc \$0,$A1[1]
+ mul $m1 # n[0]*m1
+ add %rax,$A1[0] # n[0]*m1+"t[1]"
+ mov $Ni[0],%rax
+ adc %rdx,$A1[1]
+ mov $A1[0],8($tptr,$j) # "t[1]"
+
+ xor $A0[1],$A0[1]
+ add 16($tptr,$j),$A0[0]
+ adc \$0,$A0[1]
+ mul $m0 # n[2]*m0
+ add %rax,$A0[0] # n[2]*m0+t[2]
+ mov $Ni[1],%rax
+ adc %rdx,$A0[1]
+
+ mov 24($nptr,$j),$Ni[1] # n[3]
+ xor $A1[0],$A1[0]
+ add $A0[0],$A1[1]
+ adc \$0,$A1[0]
+ mul $m1 # n[1]*m1
+ add %rax,$A1[1] # n[1]*m1+"t[2]"
+ mov $Ni[1],%rax
+ adc %rdx,$A1[0]
+ mov $A1[1],16($tptr,$j) # "t[2]"
+
+ xor $A0[0],$A0[0]
+ add 24($tptr,$j),$A0[1]
+ lea 32($j),$j
+ adc \$0,$A0[0]
+ mul $m0 # n[3]*m0
+ add %rax,$A0[1] # n[3]*m0+t[3]
+ mov $Ni[0],%rax
+ adc %rdx,$A0[0]
+ jmp .Lsqr4x_mont_inner
+
+.align 16
+.Lsqr4x_mont_inner:
+ mov ($nptr,$j),$Ni[0] # n[4]
+ xor $A1[1],$A1[1]
+ add $A0[1],$A1[0]
+ adc \$0,$A1[1]
+ mul $m1 # n[2]*m1
+ add %rax,$A1[0] # n[2]*m1+"t[3]"
+ mov $Ni[0],%rax
+ adc %rdx,$A1[1]
+ mov $A1[0],-8($tptr,$j) # "t[3]"
+
+ xor $A0[1],$A0[1]
+ add ($tptr,$j),$A0[0]
+ adc \$0,$A0[1]
+ mul $m0 # n[4]*m0
+ add %rax,$A0[0] # n[4]*m0+t[4]
+ mov $Ni[1],%rax
+ adc %rdx,$A0[1]
+
+ mov 8($nptr,$j),$Ni[1] # n[5]
+ xor $A1[0],$A1[0]
+ add $A0[0],$A1[1]
+ adc \$0,$A1[0]
+ mul $m1 # n[3]*m1
+ add %rax,$A1[1] # n[3]*m1+"t[4]"
+ mov $Ni[1],%rax
+ adc %rdx,$A1[0]
+ mov $A1[1],($tptr,$j) # "t[4]"
+
+ xor $A0[0],$A0[0]
+ add 8($tptr,$j),$A0[1]
+ adc \$0,$A0[0]
+ mul $m0 # n[5]*m0
+ add %rax,$A0[1] # n[5]*m0+t[5]
+ mov $Ni[0],%rax
+ adc %rdx,$A0[0]
+
+
+ mov 16($nptr,$j),$Ni[0] # n[6]
+ xor $A1[1],$A1[1]
+ add $A0[1],$A1[0]
+ adc \$0,$A1[1]
+ mul $m1 # n[4]*m1
+ add %rax,$A1[0] # n[4]*m1+"t[5]"
+ mov $Ni[0],%rax
+ adc %rdx,$A1[1]
+ mov $A1[0],8($tptr,$j) # "t[5]"
+
+ xor $A0[1],$A0[1]
+ add 16($tptr,$j),$A0[0]
+ adc \$0,$A0[1]
+ mul $m0 # n[6]*m0
+ add %rax,$A0[0] # n[6]*m0+t[6]
+ mov $Ni[1],%rax
+ adc %rdx,$A0[1]
+
+ mov 24($nptr,$j),$Ni[1] # n[7]
+ xor $A1[0],$A1[0]
+ add $A0[0],$A1[1]
+ adc \$0,$A1[0]
+ mul $m1 # n[5]*m1
+ add %rax,$A1[1] # n[5]*m1+"t[6]"
+ mov $Ni[1],%rax
+ adc %rdx,$A1[0]
+ mov $A1[1],16($tptr,$j) # "t[6]"
+
+ xor $A0[0],$A0[0]
+ add 24($tptr,$j),$A0[1]
+ lea 32($j),$j
+ adc \$0,$A0[0]
+ mul $m0 # n[7]*m0
+ add %rax,$A0[1] # n[7]*m0+t[7]
+ mov $Ni[0],%rax
+ adc %rdx,$A0[0]
+ cmp \$0,$j
+ jne .Lsqr4x_mont_inner
+
+ sub 0(%rsp),$j # $j=-$num # modsched #
+ mov $n0,$m0 # # modsched #
+
+ xor $A1[1],$A1[1]
+ add $A0[1],$A1[0]
+ adc \$0,$A1[1]
+ mul $m1 # n[6]*m1
+ add %rax,$A1[0] # n[6]*m1+"t[7]"
+ mov $Ni[1],%rax
+ adc %rdx,$A1[1]
+ mov $A1[0],-8($tptr) # "t[7]"
+
+ xor $A0[1],$A0[1]
+ add ($tptr),$A0[0] # +t[8]
+ adc \$0,$A0[1]
+ mov 0($nptr,$j),$Ni[0] # n[0] # modsched #
+ add $topbit,$A0[0]
+ adc \$0,$A0[1]
+
+ imulq 16($tptr,$j),$m0 # m0=t[0]*n0 # modsched #
+ xor $A1[0],$A1[0]
+ mov 8($nptr,$j),$Ni[1] # n[1] # modsched #
+ add $A0[0],$A1[1]
+ mov 16($tptr,$j),$A0[0] # t[0] # modsched #
+ adc \$0,$A1[0]
+ mul $m1 # n[7]*m1
+ add %rax,$A1[1] # n[7]*m1+"t[8]"
+ mov $Ni[0],%rax # # modsched #
+ adc %rdx,$A1[0]
+ mov $A1[1],($tptr) # "t[8]"
+
+ xor $topbit,$topbit
+ add 8($tptr),$A1[0] # +t[9]
+ adc $topbit,$topbit
+ add $A0[1],$A1[0]
+ lea 16($tptr),$tptr # "t[$num]>>128"
+ adc \$0,$topbit
+ mov $A1[0],-8($tptr) # "t[9]"
+ cmp 8(%rsp),$tptr # are we done?
+ jb .Lsqr4x_mont_outer
+
+ mov 0(%rsp),$num # restore $num
+ mov $topbit,($tptr) # save $topbit
+___
+}\f
+##############################################################
+# Post-condition, 4x unrolled copy from bn_mul_mont
+#
+# The reduced value sits in the upper half of t[]. n[] is subtracted
+# from it into rp[], then the final borrow is turned into an all-ones or
+# all-zeros mask that selects, without branching, whether rp[] is
+# refreshed in place or overwritten with the unsubtracted value. Both
+# halves of t[] are wiped while the result is copied.
+#
+{
+my ($tptr,$nptr)=("%rbx",$aptr);
+my @ri=("%rax","%rdx","%r10","%r11");
+$code.=<<___;
+ mov 64(%rsp,$num),@ri[0] # tp[0]
+ lea 64(%rsp,$num),$tptr # upper half of t[2*$num] holds result
+ mov 40(%rsp),$nptr # restore $nptr
+ shr \$5,$num # num/4
+ mov 8($tptr),@ri[1] # t[1]
+ xor $i,$i # i=0 and clear CF!
+
+ mov 32(%rsp),$rptr # restore $rptr
+ sub 0($nptr),@ri[0]
+ mov 16($tptr),@ri[2] # t[2]
+ mov 24($tptr),@ri[3] # t[3]
+ sbb 8($nptr),@ri[1]
+ lea -1($num),$j # j=num/4-1
+ jmp .Lsqr4x_sub
+.align 16
+.Lsqr4x_sub:
+ mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 16($nptr,$i,8),@ri[2]
+ mov 32($tptr,$i,8),@ri[0] # tp[i+1]
+ mov 40($tptr,$i,8),@ri[1]
+ sbb 24($nptr,$i,8),@ri[3]
+ mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 32($nptr,$i,8),@ri[0]
+ mov 48($tptr,$i,8),@ri[2]
+ mov 56($tptr,$i,8),@ri[3]
+ sbb 40($nptr,$i,8),@ri[1]
+ lea 4($i),$i # i+=4, lea keeps CF intact
+ dec $j # doesn't affect CF!
+ jnz .Lsqr4x_sub
+
+ mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ mov 32($tptr,$i,8),@ri[0] # load overflow bit
+ sbb 16($nptr,$i,8),@ri[2]
+ mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 24($nptr,$i,8),@ri[3]
+ mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i]
+
+ sbb \$0,@ri[0] # handle upmost overflow bit
+ mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ xor $i,$i # i=0
+ and @ri[0],$tptr
+ not @ri[0]
+ mov $rptr,$nptr
+ and @ri[0],$nptr
+ lea -1($num),$j
+ or $nptr,$tptr # tp=borrow?tp:rp
+
+ pxor %xmm0,%xmm0
+ shl \$5,$num # limbs/4 back to byte count, flags are dead here
+ movdqu ($tptr),%xmm1
+ lea 64(%rsp,$num),$nptr # start of upper half of temporary vector
+ movdqa %xmm0,64(%rsp) # zap lower half of temporary vector
+ movdqa %xmm0,($nptr) # zap upper half of temporary vector
+ movdqu %xmm1,($rptr)
+ jmp .Lsqr4x_copy
+.align 16
+.Lsqr4x_copy: # copy or in-place refresh
+ movdqu 16($tptr,$i),%xmm2
+ movdqu 32($tptr,$i),%xmm1
+ movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector
+ movdqa %xmm0,96(%rsp,$i) # zap lower half of temporary vector
+ movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector
+ movdqa %xmm0,32($nptr,$i) # zap upper half of temporary vector
+ movdqu %xmm2,16($rptr,$i)
+ movdqu %xmm1,32($rptr,$i)
+ lea 32($i),$i
+ dec $j
+ jnz .Lsqr4x_copy
+
+ movdqu 16($tptr,$i),%xmm2
+ movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector
+ movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector
+ movdqu %xmm2,16($rptr,$i)
+___
+}
+$code.=<<___;
+ mov 56(%rsp),%rsi # restore %rsp
+ mov \$1,%rax
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lsqr4x_epilogue:
+ ret
+.size bn_sqr4x_mont,.-bn_sqr4x_mont
+___
+}}}
+$code.=<<___;
.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 16
___
$code.=<<___;
.extern __imp_RtlVirtualUnwind
-.type se_handler,\@abi-omnipotent
+.type mul_handler,\@abi-omnipotent
.align 16
-se_handler:
+mul_handler:
push %rsi
push %rdi
push %rbx
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
- lea .Lprologue(%rip),%r10
- cmp %r10,%rbx # context->Rip<.Lprologue
- jb .Lin_prologue
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # end of prologue label
+ cmp %r10,%rbx # context->Rip<end of prologue label
+ jb .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
- lea .Lepilogue(%rip),%r10
- cmp %r10,%rbx # context->Rip>=.Lepilogue
- jae .Lin_prologue
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
mov 192($context),%r10 # pull $num
mov 8(%rax,%r10,8),%rax # pull saved stack pointer
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
-.Lin_prologue:
+ jmp .Lcommon_seh_tail
+.size mul_handler,.-mul_handler
+
+.type sqr_handler,\@abi-omnipotent
+.align 16
+sqr_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lsqr4x_body(%rip),%r10
+ cmp %r10,%rbx # context->Rip<.Lsqr4x_body
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ lea .Lsqr4x_epilogue(%rip),%r10
+ cmp %r10,%rbx # context->Rip>=.Lsqr4x_epilogue
+ jae .Lcommon_seh_tail
+
+ mov 56(%rax),%rax # pull saved stack pointer, frame slot +56
+ lea 48(%rax),%rax # step over the six registers pushed in the prologue
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lcommon_seh_tail:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
pop %rdi
pop %rsi
ret
-.size se_handler,.-se_handler
+.size sqr_handler,.-sqr_handler
.section .pdata
.align 4
.rva .LSEH_end_bn_mul_mont
.rva .LSEH_info_bn_mul_mont
+ .rva .LSEH_begin_bn_mul4x_mont
+ .rva .LSEH_end_bn_mul4x_mont
+ .rva .LSEH_info_bn_mul4x_mont
+
+ .rva .LSEH_begin_bn_sqr4x_mont
+ .rva .LSEH_end_bn_sqr4x_mont
+ .rva .LSEH_info_bn_sqr4x_mont
+
.section .xdata
.align 8
.LSEH_info_bn_mul_mont:
.byte 9,0,0,0
- .rva se_handler
+ .rva mul_handler
+ .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
+.LSEH_info_bn_mul4x_mont:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
+.LSEH_info_bn_sqr4x_mont:
+ .byte 9,0,0,0
+ .rva sqr_handler
___
}
--- /dev/null
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# August 2011.
+#
+# Companion to x86_64-mont.pl that optimizes cache-timing attack
+# countermeasures. The subroutines are produced by replacing bp[i]
+# references in their x86_64-mont.pl counterparts with cache-neutral
+# references to powers table computed in BN_mod_exp_mont_consttime.
+# In addition subroutine that scatters elements of the powers table
+# is implemented, so that scatter-/gathering can be tuned without
+# bn_exp.c modifications.
+
+# Command line is "[flavour] output"; a lone argument containing a dot is
+# taken to be the output file name.
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+# Win64 conventions are selected by the flavour or by an .asm output name.
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+# Look for the translator next to this script, then under ../../perlasm.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+# Everything printed from here on is piped through the translator. Fail
+# loudly if the pipe cannot be set up rather than silently emitting nothing.
+open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
+
+# int bn_mul_mont_gather5(
+$rp="%rdi"; # BN_ULONG *rp,
+$ap="%rsi"; # const BN_ULONG *ap,
+$bp="%rdx"; # const BN_ULONG *bp,
+$np="%rcx"; # const BN_ULONG *np,
+$n0="%r8"; # const BN_ULONG *n0,
+$num="%r9"; # int num,
+ # int idx); # 0 to 2^5-1, "index" in $bp holding
+ # pre-computed powers of a', interlaced
+ # in such manner that b[0] is $bp[idx],
+ # b[1] is [2^5+idx], etc.
+$lo0="%r10";
+$hi0="%r11";
+$hi1="%r13";
+$i="%r14";
+$j="%r15";
+$m0="%rbx";
+$m1="%rbp";
+
+$code=<<___;
+.text
+
+.globl bn_mul_mont_gather5
+.type bn_mul_mont_gather5,\@function,6
+.align 64
+bn_mul_mont_gather5:
+ test \$3,${num}d
+ jnz .Lmul_enter
+ cmp \$8,${num}d
+ jb .Lmul_enter
+ jmp .Lmul4x_enter
+
+.align 16
+.Lmul_enter:
+ mov ${num}d,${num}d
+ mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0x28(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+.Lmul_alloca:
+___
+$code.=<<___;
+ mov %rsp,%rax
+ lea 2($num),%r11
+ neg %r11
+ lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2))
+ and \$-1024,%rsp # minimize TLB usage
+
+ mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
+.Lmul_body:
+ mov $bp,%r12 # reassign $bp
+___
+ $bp="%r12";
+ $STRIDE=2**5*8; # 5 is "window size"
+ $N=$STRIDE/4; # should match cache line size
+$code.=<<___;
+ mov %r10,%r11
+ shr \$`log($N/8)/log(2)`,%r10
+ and \$`$N/8-1`,%r11
+ not %r10
+ lea .Lmagic_masks(%rip),%rax
+ and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
+ lea 96($bp,%r11,8),$bp # pointer within 1st cache line
+ movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
+ movq 8(%rax,%r10,8),%xmm5 # cache line contains element
+ movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
+ movq 24(%rax,%r10,8),%xmm7
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ movq %xmm0,$m0 # m0=bp[0]
+
+ mov ($n0),$n0 # pull n0[0] value
+ mov ($ap),%rax
+
+ xor $i,$i # i=0
+ xor $j,$j # j=0
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[0]
+ mov %rax,$lo0
+ mov ($np),%rax
+
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq $lo0,$m1 # "tp[0]"*n0
+ mov %rdx,$hi0
+
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$hi1
+
+ lea 1($j),$j # j++
+ jmp .L1st_enter
+
+.align 16
+.L1st:
+ add %rax,$hi1
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
+ mov $lo0,$hi0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+.L1st_enter:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$hi0
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ lea 1($j),$j # j++
+ mov %rdx,$lo0
+
+ mulq $m1 # np[j]*m1
+ cmp $num,$j
+ jne .L1st
+
+ movq %xmm0,$m0 # bp[1]
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+ mov $lo0,$hi0
+
+ xor %rdx,%rdx
+ add $hi0,$hi1
+ adc \$0,%rdx
+ mov $hi1,-8(%rsp,$num,8)
+ mov %rdx,(%rsp,$num,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+ jmp .Louter
+.align 16
+.Louter:
+ xor $j,$j # j=0
+ mov $n0,$m1
+ mov (%rsp),$lo0
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+
+ mulq $m0 # ap[0]*bp[i]
+ add %rax,$lo0 # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
+ adc \$0,%rdx
+
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq $lo0,$m1 # tp[0]*n0
+ mov %rdx,$hi0
+
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov 8(%rsp),$lo0 # tp[1]
+ mov %rdx,$hi1
+
+ lea 1($j),$j # j++
+ jmp .Linner_enter
+
+.align 16
+.Linner:
+ add %rax,$hi1
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
+ mov (%rsp,$j,8),$lo0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+.Linner_enter:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$hi0
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
+ mov %rdx,$hi0
+ adc \$0,$hi0
+ lea 1($j),$j # j++
+
+ mulq $m1 # np[j]*m1
+ cmp $num,$j
+ jne .Linner
+
+ movq %xmm0,$m0 # bp[i+1]
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
+ mov (%rsp,$j,8),$lo0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+ xor %rdx,%rdx
+ add $hi0,$hi1
+ adc \$0,%rdx
+ add $lo0,$hi1 # pull upmost overflow bit
+ adc \$0,%rdx
+ mov $hi1,-8(%rsp,$num,8)
+ mov %rdx,(%rsp,$num,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+ cmp $num,$i
+ jl .Louter
+
+ xor $i,$i # i=0 and clear CF!
+ mov (%rsp),%rax # tp[0]
+ lea (%rsp),$ap # borrow ap for tp
+ mov $num,$j # j=num
+ jmp .Lsub
+.align 16
+.Lsub: sbb ($np,$i,8),%rax
+ mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov 8($ap,$i,8),%rax # tp[i+1]
+ lea 1($i),$i # i++, lea keeps CF intact
+ dec $j # doesn't affect CF!
+ jnz .Lsub
+
+ sbb \$0,%rax # handle upmost overflow bit
+ xor $i,$i
+ and %rax,$ap
+ not %rax
+ mov $rp,$np
+ and %rax,$np
+ mov $num,$j # j=num
+ or $np,$ap # ap=borrow?tp:rp
+.align 16
+.Lcopy: # copy or in-place refresh
+ mov ($ap,$i,8),%rax
+ mov $i,(%rsp,$i,8) # zap temporary vector, tp[i] is overwritten with the index i
+ mov %rax,($rp,$i,8) # rp[i]=tp[i]
+ lea 1($i),$i
+ sub \$1,$j
+ jnz .Lcopy
+
+ mov 8(%rsp,$num,8),%rsi # restore %rsp
+ mov \$1,%rax
+___
+$code.=<<___ if ($win64);
+ movaps (%rsi),%xmm6
+ movaps 0x10(%rsi),%xmm7
+ lea 0x28(%rsi),%rsi
+___
+$code.=<<___;
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lmul_epilogue:
+ ret
+.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
+___
+{{{
+my @A=("%r10","%r11");
+my @N=("%r13","%rdi");
+$code.=<<___;
+.type bn_mul4x_mont_gather5,\@function,6
+.align 16
+bn_mul4x_mont_gather5:
+.Lmul4x_enter:
+ mov ${num}d,${num}d
+ mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0x28(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+.Lmul4x_alloca:
+___
+$code.=<<___;
+ mov %rsp,%rax
+ lea 4($num),%r11
+ neg %r11
+ lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4))
+ and \$-1024,%rsp # minimize TLB usage
+
+ mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
+.Lmul4x_body:
+ mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
+ mov %rdx,%r12 # reassign $bp
+___
+ $bp="%r12";
+ $STRIDE=2**5*8; # 5 is "window size"
+ $N=$STRIDE/4; # should match cache line size
+$code.=<<___;
+ mov %r10,%r11
+ shr \$`log($N/8)/log(2)`,%r10
+ and \$`$N/8-1`,%r11
+ not %r10
+ lea .Lmagic_masks(%rip),%rax
+ and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
+ lea 96($bp,%r11,8),$bp # pointer within 1st cache line
+ movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
+ movq 8(%rax,%r10,8),%xmm5 # cache line contains element
+ movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
+ movq 24(%rax,%r10,8),%xmm7
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ movq %xmm0,$m0 # m0=bp[0]
+ mov ($n0),$n0 # pull n0[0] value
+ mov ($ap),%rax
+
+ xor $i,$i # i=0
+ xor $j,$j # j=0
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[0]
+ mov %rax,$A[0]
+ mov ($np),%rax
+
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq $A[0],$m1 # "tp[0]"*n0
+ mov %rdx,$A[1]
+
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0
+ add %rax,$A[1]
+ mov 8($np),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1
+ add %rax,$N[1]
+ mov 16($ap),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ lea 4($j),$j # j++
+ adc \$0,%rdx
+ mov $N[1],(%rsp)
+ mov %rdx,$N[0]
+ jmp .L1st4x
+.align 16
+.L1st4x:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-8(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov 8($np,$j,8),%rax
+ adc \$0,%rdx
+ lea 4($j),$j # j++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov -16($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+ cmp $num,$j
+ jl .L1st4x
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ movq %xmm0,$m0 # bp[1]
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ mov $N[0],-8(%rsp,$j,8)
+ mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+.align 4
+.Louter4x:
+ xor $j,$j # j=0
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+
+ mov (%rsp),$A[0]
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[i]
+ add %rax,$A[0] # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
+ adc \$0,%rdx
+
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq $A[0],$m1 # tp[0]*n0
+ mov %rdx,$A[1]
+
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # "$N[0]", discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 8($np),%rax
+ adc \$0,%rdx
+ add 8(%rsp),$A[1] # +tp[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov 16($ap),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
+ lea 4($j),$j # j+=2
+ adc \$0,%rdx
+ mov %rdx,$N[0]
+ jmp .Linner4x
+.align 16
+.Linner4x:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ add -8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 8($np,$j,8),%rax
+ adc \$0,%rdx
+ add 8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ lea 4($j),$j # j++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov -16($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[0],-40(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+ cmp $num,$j
+ jl .Linner4x
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ add -8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ lea 1($i),$i # i++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ movq %xmm0,$m0 # bp[i+1]
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ add (%rsp,$num,8),$N[0] # pull upmost overflow bit
+ adc \$0,$N[1]
+ mov $N[0],-8(%rsp,$j,8)
+ mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+
+ cmp $num,$i
+ jl .Louter4x
+___
+{
+my @ri=("%rax","%rdx",$m0,$m1);
+$code.=<<___;
+ mov 16(%rsp,$num,8),$rp # restore $rp
+ mov 0(%rsp),@ri[0] # tp[0]
+ pxor %xmm0,%xmm0
+ mov 8(%rsp),@ri[1] # tp[1]
+ shr \$2,$num # num/=4
+ lea (%rsp),$ap # borrow ap for tp
+ xor $i,$i # i=0 and clear CF!
+
+ sub 0($np),@ri[0]
+ mov 16($ap),@ri[2] # tp[2]
+ mov 24($ap),@ri[3] # tp[3]
+ sbb 8($np),@ri[1]
+ lea -1($num),$j # j=num/4-1
+ jmp .Lsub4x
+.align 16
+.Lsub4x:
+ mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 16($np,$i,8),@ri[2]
+ mov 32($ap,$i,8),@ri[0] # tp[i+1]
+ mov 40($ap,$i,8),@ri[1]
+ sbb 24($np,$i,8),@ri[3]
+ mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 32($np,$i,8),@ri[0]
+ mov 48($ap,$i,8),@ri[2]
+ mov 56($ap,$i,8),@ri[3]
+ sbb 40($np,$i,8),@ri[1]
+ lea 4($i),$i # i++
+ dec $j # doesnn't affect CF!
+ jnz .Lsub4x
+
+ mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov 32($ap,$i,8),@ri[0] # load overflow bit
+ sbb 16($np,$i,8),@ri[2]
+ mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 24($np,$i,8),@ri[3]
+ mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
+
+ sbb \$0,@ri[0] # handle upmost overflow bit
+ mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
+ xor $i,$i # i=0
+ and @ri[0],$ap
+ not @ri[0]
+ mov $rp,$np
+ and @ri[0],$np
+ lea -1($num),$j
+ or $np,$ap # ap=borrow?tp:rp
+
+ movdqu ($ap),%xmm1
+ movdqa %xmm0,(%rsp)
+ movdqu %xmm1,($rp)
+ jmp .Lcopy4x
+.align 16
+.Lcopy4x: # copy or in-place refresh
+ movdqu 16($ap,$i),%xmm2
+ movdqu 32($ap,$i),%xmm1
+ movdqa %xmm0,16(%rsp,$i)
+ movdqu %xmm2,16($rp,$i)
+ movdqa %xmm0,32(%rsp,$i)
+ movdqu %xmm1,32($rp,$i)
+ lea 32($i),$i
+ dec $j
+ jnz .Lcopy4x
+
+ shl \$2,$num
+ movdqu 16($ap,$i),%xmm2
+ movdqa %xmm0,16(%rsp,$i)
+ movdqu %xmm2,16($rp,$i)
+___
+}
+$code.=<<___;
+ mov 8(%rsp,$num,8),%rsi # restore %rsp
+ mov \$1,%rax
+___
+$code.=<<___ if ($win64);
+ movaps (%rsi),%xmm6
+ movaps 0x10(%rsi),%xmm7
+ lea 0x28(%rsi),%rsi
+___
+$code.=<<___;
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lmul4x_epilogue:
+ ret
+.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+___
+}}}
+
+{
+# bn_scatter5(inp,num,tbl,idx) and bn_gather5(out,num,tbl,idx).
+#
+# bn_scatter5 copies num 64-bit words from inp into the table so that
+# word i lands at tbl[idx+32*i], i.e. consecutive words of one value
+# are $STRIDE bytes apart.
+#
+# bn_gather5 reads such a value back into out. For every word it loads
+# four candidates spaced $N bytes apart, ANDs each with one of the masks
+# taken from .Lmagic_masks and ORs the results, so the same four
+# locations are touched regardless of which one is actually selected.
+#
+# On Win64 bn_gather5 uses %xmm6/%xmm7 for masks and therefore saves
+# them in a hand-encoded prologue (matched by .LSEH_info_bn_gather5)
+# and must reload them from the stack before returning.
+my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+my $out=$inp; # gather writes through the first argument
+my $STRIDE=2**5*8; # bytes between consecutive words of one value
+my $N=$STRIDE/4; # spacing of the four candidate loads
+
+$code.=<<___;
+.globl bn_scatter5
+.type bn_scatter5,\@abi-omnipotent
+.align 16
+bn_scatter5:
+ cmp \$0, $num
+ jz .Lscatter_epilogue
+ lea ($tbl,$idx,8),$tbl
+.Lscatter:
+ mov ($inp),%rax
+ lea 8($inp),$inp
+ mov %rax,($tbl)
+ lea 32*8($tbl),$tbl
+ sub \$1,$num
+ jnz .Lscatter
+.Lscatter_epilogue:
+ ret
+.size bn_scatter5,.-bn_scatter5
+
+.globl bn_gather5
+.type bn_gather5,\@abi-omnipotent
+.align 16
+bn_gather5:
+___
+# Win64 prologue: reserve 0x28 bytes and save %xmm6/%xmm7. The exact
+# byte encodings matter because the unwind codes emitted for
+# .LSEH_info_bn_gather5 refer to fixed prologue offsets.
+$code.=<<___ if ($win64);
+.LSEH_begin_bn_gather5:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp
+ .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
+ .byte 0x0f,0x29,0x7c,0x24,0x10 #movaps %xmm7,0x10(%rsp)
+___
+$code.=<<___;
+ mov $idx,%r11
+ shr \$`log($N/8)/log(2)`,$idx
+ and \$`$N/8-1`,%r11
+ not $idx
+ lea .Lmagic_masks(%rip),%rax
+ and \$`2**5/($N/8)-1`,$idx # 5 is "window size"
+ lea 96($tbl,%r11,8),$tbl # pointer within 1st cache line
+ movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which
+ movq 8(%rax,$idx,8),%xmm5 # cache line contains element
+ movq 16(%rax,$idx,8),%xmm6 # denoted by the idx argument
+ movq 24(%rax,$idx,8),%xmm7
+ jmp .Lgather
+.align 16
+.Lgather:
+ movq `0*$STRIDE/4-96`($tbl),%xmm0
+ movq `1*$STRIDE/4-96`($tbl),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($tbl),%xmm2
+ pand %xmm5,%xmm1
+ movq `3*$STRIDE/4-96`($tbl),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ lea $STRIDE($tbl),$tbl
+ por %xmm3,%xmm0
+
+ movq %xmm0,($out) # store the selected word
+ lea 8($out),$out
+ sub \$1,$num
+ jnz .Lgather
+___
+# Win64 epilogue: reload the saved %xmm6/%xmm7 from the stack (they were
+# overwritten with masks above) and release the 0x28-byte frame.
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ lea 0x28(%rsp),%rsp
+___
+$code.=<<___;
+ ret
+.LSEH_end_bn_gather5:
+.size bn_gather5,.-bn_gather5
+___
+}
+# Mask table used by the gather sequences. Each pair of .long values
+# forms one 64-bit entry, giving eight entries of which only the fourth
+# is all-ones. The gather code loads four consecutive entries, starting
+# at an index in the range 0..3, into %xmm4-%xmm7; exactly one of those
+# four masks is therefore all-ones, and it decides which of the four
+# candidate table words survives the pand/por combination.
+# The trailing .asciz is an identification string embedded in the output.
+$code.=<<___;
+.align 64
+.Lmagic_masks:
+ .long 0,0, 0,0, 0,0, -1,-1
+ .long 0,0, 0,0, 0,0, 0,0
+.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# Win64 structured-exception (unwind) support, emitted only when
+# $win64 is set. It consists of:
+#   - mul_handler, a handler shared by bn_mul_mont_gather5 and
+#     bn_mul4x_mont_gather5;
+#   - .pdata entries giving begin/end/info RVAs for those two
+#     functions and for bn_gather5;
+#   - .xdata records: the two multiplication routines name mul_handler
+#     and pass three label RVAs as HandlerData[], while bn_gather5 is
+#     described by raw unwind codes mirroring its hand-encoded prologue.
+#
+# mul_handler compares context->Rip against HandlerData[0..2] to decide
+# which frame state applies. When Rip lies between the body and
+# epilogue labels it fetches the stack pointer that the function stored
+# at 8(%rsp,num,8), reloads the saved %xmm6/%xmm7 and the six pushed
+# general registers from there and writes them into the CONTEXT record.
+# In every case it then copies the CONTEXT to disp->ContextRecord, calls
+# RtlVirtualUnwind and returns 1 (ExceptionContinueSearch).
+#
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+# Registers carrying the four handler arguments listed above.
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+# `40+48` below is 0x28 bytes of %xmm6/%xmm7 save area plus six 8-byte
+# pushes, matching the prologue of the multiplication routines.
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type mul_handler,\@abi-omnipotent
+.align 16
+mul_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # end of prologue label
+ cmp %r10,%rbx # context->Rip<end of prologue label
+ jb .Lcommon_seh_tail
+
+ lea `40+48`(%rax),%rax
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # end of alloca label
+ cmp %r10,%rbx # context->Rip<end of alloca label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 8(%r11),%r10d # HandlerData[2]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 192($context),%r10 # pull $num
+ mov 8(%rax,%r10,8),%rax # pull saved stack pointer
+
+ movaps (%rax),%xmm0
+ movaps 16(%rax),%xmm1
+ lea `40+48`(%rax),%rax
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+ movups %xmm0,512($context) # restore context->Xmm6
+ movups %xmm1,528($context) # restore context->Xmm7
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size mul_handler,.-mul_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_bn_mul_mont_gather5
+ .rva .LSEH_end_bn_mul_mont_gather5
+ .rva .LSEH_info_bn_mul_mont_gather5
+
+ .rva .LSEH_begin_bn_mul4x_mont_gather5
+ .rva .LSEH_end_bn_mul4x_mont_gather5
+ .rva .LSEH_info_bn_mul4x_mont_gather5
+
+ .rva .LSEH_begin_bn_gather5
+ .rva .LSEH_end_bn_gather5
+ .rva .LSEH_info_bn_gather5
+
+.section .xdata
+.align 8
+.LSEH_info_bn_mul_mont_gather5:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lmul_alloca,.Lmul_body,.Lmul_epilogue # HandlerData[]
+.align 8
+.LSEH_info_bn_mul4x_mont_gather5:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
+.align 8
+.LSEH_info_bn_gather5:
+ .byte 0x01,0x0d,0x05,0x00
+ .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
+ .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
+ .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28
+.align 8
+___
+}
+
+# Replace every `...` span in the generated text with the value of the
+# Perl expression it contains (constant folding of offsets, shift
+# counts and so on).
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+# Emit the result. STDOUT is buffered, so a write failure (full disk,
+# closed pipe) may only surface at close time; fail loudly instead of
+# leaving truncated assembly behind.
+print $code;
+close STDOUT or die "error closing STDOUT: $!";
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
- mov $e,$a0
- mov $e,$a1
+ ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
mov $f,$a2
+ mov $T1,`$SZ*($i&0xf)`(%rsp)
- ror \$$Sigma1[0],$a0
- ror \$$Sigma1[1],$a1
+ ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
+ xor $e,$a0
xor $g,$a2 # f^g
- xor $a1,$a0
- ror \$`$Sigma1[2]-$Sigma1[1]`,$a1
+ ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
+ add $h,$T1 # T1+=h
+ xor $a,$a1
+
+ add ($Tbl,$round,$SZ),$T1 # T1+=K[round]
and $e,$a2 # (f^g)&e
- mov $T1,`$SZ*($i&0xf)`(%rsp)
+ mov $b,$h
- xor $a1,$a0 # Sigma1(e)
+ ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
+ xor $e,$a0
xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
- add $h,$T1 # T1+=h
-
- mov $a,$h
- add $a0,$T1 # T1+=Sigma1(e)
+ xor $c,$h # b^c
+ xor $a,$a1
add $a2,$T1 # T1+=Ch(e,f,g)
- mov $a,$a0
- mov $a,$a1
+ mov $b,$a2
- ror \$$Sigma0[0],$h
- ror \$$Sigma0[1],$a0
- mov $a,$a2
- add ($Tbl,$round,$SZ),$T1 # T1+=K[round]
+ ror \$$Sigma1[0],$a0 # Sigma1(e)
+ and $a,$h # h=(b^c)&a
+ and $c,$a2 # b&c
- xor $a0,$h
- ror \$`$Sigma0[2]-$Sigma0[1]`,$a0
- or $c,$a1 # a|c
+ ror \$$Sigma0[0],$a1 # Sigma0(a)
+ add $a0,$T1 # T1+=Sigma1(e)
+ add $a2,$h # h+=b&c (completes +=Maj(a,b,c)
- xor $a0,$h # h=Sigma0(a)
- and $c,$a2 # a&c
add $T1,$d # d+=T1
-
- and $b,$a1 # (a|c)&b
add $T1,$h # h+=T1
-
- or $a2,$a1 # Maj(a,b,c)=((a|c)&b)|(a&c)
lea 1($round),$round # round++
+ add $a1,$h # h+=Sigma0(a)
- add $a1,$h # h+=Maj(a,b,c)
___
}
$code.=<<___;
mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
- mov `$SZ*(($i+14)&0xf)`(%rsp),$T1
-
- mov $a0,$a2
+ mov `$SZ*(($i+14)&0xf)`(%rsp),$a1
+ mov $a0,$T1
+ mov $a1,$a2
+ ror \$`$sigma0[1]-$sigma0[0]`,$T1
+ xor $a0,$T1
shr \$$sigma0[2],$a0
- ror \$$sigma0[0],$a2
-
- xor $a2,$a0
- ror \$`$sigma0[1]-$sigma0[0]`,$a2
- xor $a2,$a0 # sigma0(X[(i+1)&0xf])
- mov $T1,$a1
+ ror \$$sigma0[0],$T1
+ xor $T1,$a0 # sigma0(X[(i+1)&0xf])
+ mov `$SZ*(($i+9)&0xf)`(%rsp),$T1
- shr \$$sigma1[2],$T1
- ror \$$sigma1[0],$a1
-
- xor $a1,$T1
- ror \$`$sigma1[1]-$sigma1[0]`,$a1
-
- xor $a1,$T1 # sigma1(X[(i+14)&0xf])
+ ror \$`$sigma1[1]-$sigma1[0]`,$a2
+ xor $a1,$a2
+ shr \$$sigma1[2],$a1
+ ror \$$sigma1[0],$a2
add $a0,$T1
-
- add `$SZ*(($i+9)&0xf)`(%rsp),$T1
+ xor $a2,$a1 # sigma1(X[(i+14)&0xf])
add `$SZ*($i&0xf)`(%rsp),$T1
+ mov $e,$a0
+ add $a1,$T1
+ mov $a,$a1
___
&ROUND_00_15(@_);
}
___
for($i=0;$i<16;$i++) {
$code.=" mov $SZ*$i($inp),$T1\n";
+ $code.=" mov @ROT[4],$a0\n";
+ $code.=" mov @ROT[0],$a1\n";
$code.=" bswap $T1\n";
&ROUND_00_15($i,@ROT);
unshift(@ROT,pop(@ROT));
call OPENSSL_cpuid_setup
.hidden OPENSSL_ia32cap_P
-.comm OPENSSL_ia32cap_P,8
+.comm OPENSSL_ia32cap_P,8,4
.text