From e1612ea59da067c76fa360cf976a6b38216cddf2 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Fri, 13 Jul 2007 17:39:40 +0000 Subject: [PATCH] Add _x86_64_AES_[en|de]crypt_compact. --- crypto/aes/asm/aes-x86_64.pl | 1071 ++++++++++++++++++++++++++++++---- 1 file changed, 952 insertions(+), 119 deletions(-) diff --git a/crypto/aes/asm/aes-x86_64.pl b/crypto/aes/asm/aes-x86_64.pl index d02cd5bd2b..75e1749b64 100755 --- a/crypto/aes/asm/aes-x86_64.pl +++ b/crypto/aes/asm/aes-x86_64.pl @@ -2,11 +2,12 @@ # # ==================================================================== # Written by Andy Polyakov for the OpenSSL -# project. Rights for redistribution and usage in source and binary -# forms are granted according to the OpenSSL license. +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # -# Version 1.2. +# Version 2.0. # # aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on # Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version @@ -17,17 +18,21 @@ # # Performance in number of cycles per processed byte for 128-bit key: # -# ECB CBC encrypt -# AMD64 13.7 13.0(*) -# EM64T 20.2 18.6(*) -# -# (*) CBC benchmarks are better than ECB thanks to custom ABI used -# by the private block encryption function. +# ECB encrypt ECB decrypt CBC large chunk +# AMD64 33 41 13.0 +# EM64T 38 59 18.6 +# Core 2 30 43 14.5 $verticalspin=1; # unlike 32-bit version $verticalspin performs # ~15% better on both AMD and Intel cores $output=shift; -open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output"; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| $^X $xlate $output"; $code=".text\n"; @@ -35,9 +40,9 @@ $s0="%eax"; $s1="%ebx"; $s2="%ecx"; $s3="%edx"; -$acc0="%esi"; -$acc1="%edi"; -$acc2="%ebp"; +$acc0="%esi"; $mask80="%rsi"; +$acc1="%edi"; $maskfe="%rdi"; +$acc2="%ebp"; $mask1b="%rbp"; $inp="%r8"; $out="%r9"; $t0="%r10d"; @@ -51,6 +56,8 @@ sub hi() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1h/; $r; } sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/; $r =~ s/%[er]([sd]i)/%\1l/; $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; } +sub LO() { my $r=shift; $r =~ s/%r([a-z]+)/%e\1/; + $r =~ s/%r([0-9]+)/%r\1d/; $r; } sub _data_word() { my $i; while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; } @@ -345,6 +352,235 @@ $code.=<<___; .size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt ___ +# it's possible to implement this by shifting tN by 8, filling least +# significant byte with byte load and finally bswap-ing at the end, +# but partial byte load kills Core 2... 
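
The compact routines that follow drop the 4KB Te/Td lookup tables: SubBytes is done with plain per-byte movzb loads from a 256-byte Te4/Td4 copy, and MixColumns is computed arithmetically. The heart of that arithmetic is doubling all four state bytes of a 32-bit word in GF(2^8) at once using the 0x80808080/0xfefefefe/0x1b1b1b1b masks. As an illustrative aside (not part of the patch itself), a minimal scalar Perl model of that packed doubling is sketched below; the helper name xtime_packed is invented here, but the mask/shift/subtract steps mirror the lea/and/shr/sub/xor sequence that enctransform() emits.

sub xtime_packed {
    my $s   = shift;                    # four state bytes packed in one 32-bit word
    my $hi  = $s & 0x80808080;          # collect the top bit of every byte
    my $dbl = ($s << 1) & 0xfefefefe;   # double each byte, dropping the carries
    # (hi - (hi>>7)) & 0x1b1b1b1b turns every set top bit into the 0x1b
    # reduction polynomial and leaves the remaining bytes at zero
    my $red = ($hi - ($hi >> 7)) & 0x1b1b1b1b;
    return $dbl ^ $red;                 # xtime() applied to each byte independently
}

# e.g. xtime_packed(0x01010101) == 0x02020202, xtime_packed(0x80808080) == 0x1b1b1b1b

enctransform() then folds this doubled word back into the state with 24-, 16- and 8-bit rotations of the original word, which yields the column mix without any table lookups; the decrypt path iterates the same doubling to obtain the 2x/4x/8x multiples needed for InvMixColumns.
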
+sub enccompactvert() +{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d"); + +$code.=<<___; + movzb `&lo("$s0")`,$t0 + movzb `&lo("$s1")`,$t1 + movzb `&lo("$s2")`,$t2 + movzb ($sbox,$t0,1),$t0 + movzb ($sbox,$t1,1),$t1 + movzb ($sbox,$t2,1),$t2 + + movzb `&lo("$s3")`,$t3 + movzb `&hi("$s1")`,$acc0 + movzb `&hi("$s2")`,$acc1 + movzb ($sbox,$t3,1),$t3 + movzb ($sbox,$acc0,1),$t4 #$t0 + movzb ($sbox,$acc1,1),$t5 #$t1 + + movzb `&hi("$s3")`,$acc2 + movzb `&hi("$s0")`,$acc0 + shr \$16,$s2 + movzb ($sbox,$acc2,1),$acc2 #$t2 + movzb ($sbox,$acc0,1),$acc0 #$t3 + shr \$16,$s3 + + movzb `&lo("$s2")`,$acc1 + shl \$8,$t4 + shl \$8,$t5 + movzb ($sbox,$acc1,1),$acc1 #$t0 + xor $t4,$t0 + xor $t5,$t1 + + movzb `&lo("$s3")`,$t4 + shr \$16,$s0 + shr \$16,$s1 + movzb `&lo("$s0")`,$t5 + shl \$8,$acc2 + shl \$8,$acc0 + movzb ($sbox,$t4,1),$t4 #$t1 + movzb ($sbox,$t5,1),$t5 #$t2 + xor $acc2,$t2 + xor $acc0,$t3 + + movzb `&lo("$s1")`,$acc2 + movzb `&hi("$s3")`,$acc0 + shl \$16,$acc1 + movzb ($sbox,$acc2,1),$acc2 #$t3 + movzb ($sbox,$acc0,1),$acc0 #$t0 + xor $acc1,$t0 + + movzb `&hi("$s0")`,$acc1 + shr \$8,$s2 + shr \$8,$s1 + movzb ($sbox,$acc1,1),$acc1 #$t1 + movzb ($sbox,$s2,1),$s3 #$t3 + movzb ($sbox,$s1,1),$s2 #$t2 + shl \$16,$t4 + shl \$16,$t5 + shl \$16,$acc2 + xor $t4,$t1 + xor $t5,$t2 + xor $acc2,$t3 + + shl \$24,$acc0 + shl \$24,$acc1 + shl \$24,$s3 + xor $acc0,$t0 + shl \$24,$s2 + xor $acc1,$t1 + mov $t0,$s0 + mov $t1,$s1 + xor $t2,$s2 + xor $t3,$s3 +___ +} + +sub enctransform_ref() +{ my $sn = shift; + my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d"); + +$code.=<<___; + mov $sn,$acc + and \$0x80808080,$acc + mov $acc,$tmp + shr \$7,$tmp + lea ($sn,$sn),$r2 + sub $tmp,$acc + and \$0xfefefefe,$r2 + and \$0x1b1b1b1b,$acc + mov $sn,$tmp + xor $acc,$r2 + + xor $r2,$sn + rol \$24,$sn + xor $r2,$sn + ror \$16,$tmp + xor $tmp,$sn + ror \$8,$tmp + xor $tmp,$sn +___ +} + +# unlike decrypt case it does not pay off to parallelize enctransform +sub enctransform() +{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d"); + +$code.=<<___; + mov $s0,$acc0 + mov $s1,$acc1 + and \$0x80808080,$acc0 + and \$0x80808080,$acc1 + mov $acc0,$t0 + mov $acc1,$t1 + shr \$7,$t0 + lea ($s0,$s0),$r20 + shr \$7,$t1 + lea ($s1,$s1),$r21 + sub $t0,$acc0 + sub $t1,$acc1 + and \$0xfefefefe,$r20 + and \$0xfefefefe,$r21 + and \$0x1b1b1b1b,$acc0 + and \$0x1b1b1b1b,$acc1 + mov $s0,$t0 + mov $s1,$t1 + xor $acc0,$r20 + xor $acc1,$r21 + + xor $r20,$s0 + xor $r21,$s1 + mov $s2,$acc0 + mov $s3,$acc1 + rol \$24,$s0 + rol \$24,$s1 + and \$0x80808080,$acc0 + and \$0x80808080,$acc1 + xor $r20,$s0 + xor $r21,$s1 + mov $acc0,$t2 + mov $acc1,$t3 + ror \$16,$t0 + ror \$16,$t1 + shr \$7,$t2 + lea ($s2,$s2),$r20 + xor $t0,$s0 + xor $t1,$s1 + shr \$7,$t3 + lea ($s3,$s3),$r21 + ror \$8,$t0 + ror \$8,$t1 + sub $t2,$acc0 + sub $t3,$acc1 + xor $t0,$s0 + xor $t1,$s1 + + and \$0xfefefefe,$r20 + and \$0xfefefefe,$r21 + and \$0x1b1b1b1b,$acc0 + and \$0x1b1b1b1b,$acc1 + mov $s2,$t2 + mov $s3,$t3 + xor $acc0,$r20 + xor $acc1,$r21 + + xor $r20,$s2 + xor $r21,$s3 + rol \$24,$s2 + rol \$24,$s3 + xor $r20,$s2 + xor $r21,$s3 + ror \$16,$t2 + ror \$16,$t3 + xor $t2,$s2 + xor $t3,$s3 + ror \$8,$t2 + ror \$8,$t3 + xor $t2,$s2 + xor $t3,$s3 + +___ +} + +$code.=<<___; +.type _x86_64_AES_encrypt_compact,\@abi-omnipotent +.align 16 +_x86_64_AES_encrypt_compact: + lea 128($sbox),$acc0 # size optimization + mov 0-128($acc0),$acc1 # prefetch Te4 + mov 32-128($acc0),$acc2 + mov 64-128($acc0),$t0 + mov 96-128($acc0),$t1 + mov 128-128($acc0),$acc1 + mov 160-128($acc0),$acc2 + mov 192-128($acc0),$t0 + mov 
224-128($acc0),$t1 + + xor 0($key),$s0 # xor with key + xor 4($key),$s1 + xor 8($key),$s2 + xor 12($key),$s3 + lea 16($key),$key + jmp .Lenc_loop_compact +.align 16 +.Lenc_loop_compact: +___ + &enccompactvert(); + &enctransform(); +$code.=<<___; + xor 0($key),$s0 + xor 4($key),$s1 + xor 8($key),$s2 + xor 12($key),$s3 + lea 16($key),$key + cmp 16(%rsp),$key + jne .Lenc_loop_compact +___ + &enccompactvert(); +$code.=<<___; + xor 0($key),$s0 + xor 4($key),$s1 + xor 8($key),$s2 + xor 12($key),$s3 + .byte 0xf3,0xc3 # rep ret +.size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact +___ + # void AES_encrypt (const void *inp,void *out,const AES_KEY *key); $code.=<<___; .globl AES_encrypt @@ -358,25 +594,49 @@ AES_encrypt: push %r14 push %r15 + # allocate frame "above" key schedule + mov %rsp,%rax mov %rdx,$key - mov %rdi,$inp - mov %rsi,$out + lea -63(%rdx),%rcx + and \$-64,%rsp + sub %rsp,%rcx + neg %rcx + and \$0x3c0,%rcx + sub %rcx,%rsp - .picmeup $sbox - lea AES_Te-.($sbox),$sbox + push %rax # save real stack pointer + push %rsi # save out - mov 0($inp),$s0 - mov 4($inp),$s1 - mov 8($inp),$s2 - mov 12($inp),$s3 + mov 240($key),$rnds # load rounds - call _x86_64_AES_encrypt + mov 0(%rdi),$s0 # load input vector + mov 4(%rdi),$s1 + mov 8(%rdi),$s2 + mov 12(%rdi),$s3 - mov $s0,0($out) + shl \$4,$rnds + lea ($key,$rnds),%rbp + push %rbp + push $key + + # pick Te4 copy which can't "overlap" with stack frame or key schedule + .picmeup $sbox + lea AES_Te+2048-.($sbox),$sbox + lea 768-32(%rsp),%rbp + sub $sbox,%rbp + and \$0x300,%rbp + lea ($sbox,%rbp),$sbox + + call _x86_64_AES_encrypt_compact + + lea 16(%rsp),%rsp + pop $out # restore out + mov $s0,0($out) # write output vector mov $s1,4($out) mov $s2,8($out) mov $s3,12($out) + mov (%rsp),%rsp pop %r15 pop %r14 pop %r13 @@ -453,19 +713,20 @@ sub declastvert() { my $t3="%r8d"; # zaps $inp! 
$code.=<<___; + lea 2048($sbox),$sbox # size optimization movzb `&lo("$s0")`,$acc0 movzb `&lo("$s1")`,$acc1 movzb `&lo("$s2")`,$acc2 - movzb 2048($sbox,$acc0,1),$t0 - movzb 2048($sbox,$acc1,1),$t1 - movzb 2048($sbox,$acc2,1),$t2 + movzb ($sbox,$acc0,1),$t0 + movzb ($sbox,$acc1,1),$t1 + movzb ($sbox,$acc2,1),$t2 movzb `&lo("$s3")`,$acc0 movzb `&hi("$s3")`,$acc1 movzb `&hi("$s0")`,$acc2 - movzb 2048($sbox,$acc0,1),$t3 - movzb 2048($sbox,$acc1,1),$acc1 #$t0 - movzb 2048($sbox,$acc2,1),$acc2 #$t1 + movzb ($sbox,$acc0,1),$t3 + movzb ($sbox,$acc1,1),$acc1 #$t0 + movzb ($sbox,$acc2,1),$acc2 #$t1 shl \$8,$acc1 shl \$8,$acc2 @@ -477,8 +738,8 @@ $code.=<<___; movzb `&hi("$s1")`,$acc0 movzb `&hi("$s2")`,$acc1 shr \$16,$s0 - movzb 2048($sbox,$acc0,1),$acc0 #$t2 - movzb 2048($sbox,$acc1,1),$acc1 #$t3 + movzb ($sbox,$acc0,1),$acc0 #$t2 + movzb ($sbox,$acc1,1),$acc1 #$t3 shl \$8,$acc0 shl \$8,$acc1 @@ -490,9 +751,9 @@ $code.=<<___; movzb `&lo("$s2")`,$acc0 movzb `&lo("$s3")`,$acc1 movzb `&lo("$s0")`,$acc2 - movzb 2048($sbox,$acc0,1),$acc0 #$t0 - movzb 2048($sbox,$acc1,1),$acc1 #$t1 - movzb 2048($sbox,$acc2,1),$acc2 #$t2 + movzb ($sbox,$acc0,1),$acc0 #$t0 + movzb ($sbox,$acc1,1),$acc1 #$t1 + movzb ($sbox,$acc2,1),$acc2 #$t2 shl \$16,$acc0 shl \$16,$acc1 @@ -505,9 +766,9 @@ $code.=<<___; movzb `&lo("$s1")`,$acc0 movzb `&hi("$s1")`,$acc1 movzb `&hi("$s2")`,$acc2 - movzb 2048($sbox,$acc0,1),$acc0 #$t3 - movzb 2048($sbox,$acc1,1),$acc1 #$t0 - movzb 2048($sbox,$acc2,1),$acc2 #$t1 + movzb ($sbox,$acc0,1),$acc0 #$t3 + movzb ($sbox,$acc1,1),$acc1 #$t0 + movzb ($sbox,$acc2,1),$acc2 #$t1 shl \$16,$acc0 shl \$24,$acc1 @@ -520,8 +781,8 @@ $code.=<<___; movzb `&hi("$s3")`,$acc0 movzb `&hi("$s0")`,$acc1 mov 16+12($key),$s3 - movzb 2048($sbox,$acc0,1),$acc0 #$t2 - movzb 2048($sbox,$acc1,1),$acc1 #$t3 + movzb ($sbox,$acc0,1),$acc0 #$t2 + movzb ($sbox,$acc1,1),$acc1 #$t3 mov 16+0($key),$s0 shl \$24,$acc0 @@ -532,6 +793,7 @@ $code.=<<___; mov 16+4($key),$s1 mov 16+8($key),$s2 + lea -2048($sbox),$sbox xor $t0,$s0 xor $t1,$s1 xor $t2,$s2 @@ -659,6 +921,260 @@ $code.=<<___; .size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt ___ +sub deccompactvert() +{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d"); + +$code.=<<___; + movzb `&lo("$s0")`,$t0 + movzb `&lo("$s1")`,$t1 + movzb `&lo("$s2")`,$t2 + movzb ($sbox,$t0,1),$t0 + movzb ($sbox,$t1,1),$t1 + movzb ($sbox,$t2,1),$t2 + + movzb `&lo("$s3")`,$t3 + movzb `&hi("$s3")`,$acc0 + movzb `&hi("$s0")`,$acc1 + movzb ($sbox,$t3,1),$t3 + movzb ($sbox,$acc0,1),$t4 #$t0 + movzb ($sbox,$acc1,1),$t5 #$t1 + + movzb `&hi("$s1")`,$acc2 + movzb `&hi("$s2")`,$acc0 + shr \$16,$s2 + movzb ($sbox,$acc2,1),$acc2 #$t2 + movzb ($sbox,$acc0,1),$acc0 #$t3 + shr \$16,$s3 + + movzb `&lo("$s2")`,$acc1 + shl \$8,$t4 + shl \$8,$t5 + movzb ($sbox,$acc1,1),$acc1 #$t0 + xor $t4,$t0 + xor $t5,$t1 + + movzb `&lo("$s3")`,$t4 + shr \$16,$s0 + shr \$16,$s1 + movzb `&lo("$s0")`,$t5 + shl \$8,$acc2 + shl \$8,$acc0 + movzb ($sbox,$t4,1),$t4 #$t1 + movzb ($sbox,$t5,1),$t5 #$t2 + xor $acc2,$t2 + xor $acc0,$t3 + + movzb `&lo("$s1")`,$acc2 + movzb `&hi("$s1")`,$acc0 + shl \$16,$acc1 + movzb ($sbox,$acc2,1),$acc2 #$t3 + movzb ($sbox,$acc0,1),$acc0 #$t0 + xor $acc1,$t0 + + movzb `&hi("$s2")`,$acc1 + shl \$16,$t4 + shl \$16,$t5 + movzb ($sbox,$acc1,1),$s1 #$t1 + xor $t4,$t1 + xor $t5,$t2 + + movzb `&hi("$s3")`,$acc1 + shr \$8,$s0 + shl \$16,$acc2 + movzb ($sbox,$acc1,1),$s2 #$t2 + movzb ($sbox,$s0,1),$s3 #$t3 + xor $acc2,$t3 + + shl \$24,$acc0 + shl \$24,$s1 + shl \$24,$s2 + xor $acc0,$t0 + shl \$24,$s3 + xor $t1,$s1 + mov $t0,$s0 + xor 
$t2,$s2 + xor $t3,$s3 +___ +} + +# parallelized version! input is pair of 64-bit values: %rax=s1.s0 +# and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1, +# %ecx=s2 and %edx=s3. +sub dectransform() +{ my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx"); + my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx"); + +$code.=<<___; + mov $tp10,$acc0 + mov $tp18,$acc8 + and $mask80,$acc0 + and $mask80,$acc8 + mov $acc0,$tp40 + mov $acc8,$tp48 + shr \$7,$tp40 + lea ($tp10,$tp10),$tp20 + shr \$7,$tp48 + lea ($tp18,$tp18),$tp28 + sub $tp40,$acc0 + sub $tp48,$acc8 + and $maskfe,$tp20 + and $maskfe,$tp28 + and $mask1b,$acc0 + and $mask1b,$acc8 + xor $tp20,$acc0 + xor $tp28,$acc8 + mov $acc0,$tp20 + mov $acc8,$tp28 + + and $mask80,$acc0 + and $mask80,$acc8 + mov $acc0,$tp80 + mov $acc8,$tp88 + shr \$7,$tp80 + lea ($tp20,$tp20),$tp40 + shr \$7,$tp88 + lea ($tp28,$tp28),$tp48 + sub $tp80,$acc0 + sub $tp88,$acc8 + and $maskfe,$tp40 + and $maskfe,$tp48 + and $mask1b,$acc0 + and $mask1b,$acc8 + xor $tp40,$acc0 + xor $tp48,$acc8 + mov $acc0,$tp40 + mov $acc8,$tp48 + + and $mask80,$acc0 + and $mask80,$acc8 + mov $acc0,$tp80 + mov $acc8,$tp88 + shr \$7,$tp80 + xor $tp10,$tp20 # tp2^=tp1 + shr \$7,$tp88 + xor $tp18,$tp28 # tp2^=tp1 + sub $tp80,$acc0 + sub $tp88,$acc8 + lea ($tp40,$tp40),$tp80 + lea ($tp48,$tp48),$tp88 + xor $tp10,$tp40 # tp4^=tp1 + xor $tp18,$tp48 # tp4^=tp1 + and $maskfe,$tp80 + and $maskfe,$tp88 + and $mask1b,$acc0 + and $mask1b,$acc8 + xor $acc0,$tp80 + xor $acc8,$tp88 + + xor $tp80,$tp10 # tp1^=tp8 + xor $tp88,$tp18 # tp1^=tp8 + xor $tp80,$tp20 # tp2^tp1^=tp8 + xor $tp88,$tp28 # tp2^tp1^=tp8 + mov $tp10,$acc0 + mov $tp18,$acc8 + xor $tp80,$tp40 # tp4^tp1^=tp8 + xor $tp88,$tp48 # tp4^tp1^=tp8 + shr \$32,$acc0 + shr \$32,$acc8 + xor $tp20,$tp80 # tp8^=tp8^tp2^tp1=tp2^tp1 + xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1 + rol \$8,`&LO("$tp10")` # ROTATE(tp1^tp8,8) + rol \$8,`&LO("$tp18")` # ROTATE(tp1^tp8,8) + xor $tp40,$tp80 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2 + xor $tp48,$tp88 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2 + + rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8) + rol \$8,`&LO("$acc8")` # ROTATE(tp1^tp8,8) + xor `&LO("$tp80")`,`&LO("$tp10")` + xor `&LO("$tp88")`,`&LO("$tp18")` + shr \$32,$tp80 + shr \$32,$tp88 + xor `&LO("$tp80")`,`&LO("$acc0")` + xor `&LO("$tp88")`,`&LO("$acc8")` + + mov $tp20,$tp80 + mov $tp28,$tp88 + shr \$32,$tp80 + shr \$32,$tp88 + rol \$24,`&LO("$tp20")` # ROTATE(tp2^tp1^tp8,24) + rol \$24,`&LO("$tp28")` # ROTATE(tp2^tp1^tp8,24) + rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24) + rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24) + xor `&LO("$tp20")`,`&LO("$tp10")` + xor `&LO("$tp28")`,`&LO("$tp18")` + mov $tp40,$tp20 + mov $tp48,$tp28 + xor `&LO("$tp80")`,`&LO("$acc0")` + xor `&LO("$tp88")`,`&LO("$acc8")` + + shr \$32,$tp20 + shr \$32,$tp28 + rol \$16,`&LO("$tp40")` # ROTATE(tp4^tp1^tp8,16) + rol \$16,`&LO("$tp48")` # ROTATE(tp4^tp1^tp8,16) + rol \$16,`&LO("$tp20")` # ROTATE(tp4^tp1^tp8,16) + rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16) + xor `&LO("$tp40")`,`&LO("$tp10")` + xor `&LO("$tp48")`,`&LO("$tp18")` + xor `&LO("$tp20")`,`&LO("$acc0")` + xor `&LO("$tp28")`,`&LO("$acc8")` + +___ +} + +$code.=<<___; +.type _x86_64_AES_decrypt_compact,\@abi-omnipotent +.align 16 +_x86_64_AES_decrypt_compact: + lea 128($sbox),$acc0 # size optimization + mov 0-128($acc0),$acc1 # prefetch Td4 + mov 32-128($acc0),$acc2 + mov 64-128($acc0),$t0 + mov 96-128($acc0),$t1 + mov 128-128($acc0),$acc1 + mov 160-128($acc0),$acc2 + mov 
192-128($acc0),$t0 + mov 224-128($acc0),$t1 + + xor 0($key),$s0 # xor with key + xor 4($key),$s1 + xor 8($key),$s2 + xor 12($key),$s3 + lea 16($key),$key + + jmp .Ldec_compact_loop +.align 16 +.Ldec_compact_loop: +___ + &deccompactvert(); +$code.=<<___; + mov 256+0($sbox),$mask80 + shl \$32,%rbx + shl \$32,%rdx + mov 256+8($sbox),$maskfe + or %rbx,%rax + or %rdx,%rcx + mov 256+16($sbox),$mask1b +___ + &dectransform(); +$code.=<<___; + xor 0($key),$s0 + xor 4($key),$s1 + xor 8($key),$s2 + xor 12($key),$s3 + lea 16($key),$key + cmp 16(%rsp),$key + jne .Ldec_compact_loop +___ + &deccompactvert(); +$code.=<<___; + xor 0($key),$s0 + xor 4($key),$s1 + xor 8($key),$s2 + xor 12($key),$s3 + .byte 0xf3,0xc3 # rep ret +.size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact +___ + # void AES_decrypt (const void *inp,void *out,const AES_KEY *key); $code.=<<___; .globl AES_decrypt @@ -672,37 +1188,51 @@ AES_decrypt: push %r14 push %r15 + # allocate frame "above" key schedule + mov %rsp,%rax mov %rdx,$key - mov %rdi,$inp - mov %rsi,$out + lea -63(%rdx),%rcx + and \$-64,%rsp + sub %rsp,%rcx + neg %rcx + and \$0x3c0,%rcx + sub %rcx,%rsp - .picmeup $sbox - lea AES_Td-.($sbox),$sbox - - # prefetch Td4 - lea 2048+128($sbox),$sbox; - mov 0-128($sbox),$s0 - mov 32-128($sbox),$s1 - mov 64-128($sbox),$s2 - mov 96-128($sbox),$s3 - mov 128-128($sbox),$s0 - mov 160-128($sbox),$s1 - mov 192-128($sbox),$s2 - mov 224-128($sbox),$s3 - lea -2048-128($sbox),$sbox; - - mov 0($inp),$s0 - mov 4($inp),$s1 - mov 8($inp),$s2 - mov 12($inp),$s3 - - call _x86_64_AES_decrypt + push %rax # save real stack pointer + push %rsi # save out + + mov 240($key),$rnds # load rounds + + mov 0(%rdi),$s0 # load input vector + mov 4(%rdi),$s1 + mov 8(%rdi),$s2 + mov 12(%rdi),$s3 + + shl \$4,$rnds + lea ($key,$rnds),%rbp + push %rbp + push $key + # pick Td4 copy which can't "overlap" with stack frame or key schedule + .picmeup $sbox + lea AES_Td+2048-.($sbox),$sbox + lea 768-32(%rsp),%rbp + sub $sbox,%rbp + and \$0x300,%rbp + lea ($sbox,%rbp),$sbox + shr \$3,%rbp # recall "magic" constants! 
+ add %rbp,$sbox + + call _x86_64_AES_decrypt_compact + + lea 16(%rsp),%rsp + pop $out # restore out mov $s0,0($out) mov $s1,4($out) mov $s2,8($out) mov $s3,12($out) + mov (%rsp),%rsp pop %r15 pop %r14 pop %r13 @@ -718,27 +1248,26 @@ sub enckey() { $code.=<<___; movz %dl,%esi # rk[i]>>0 - mov 2(%rbp,%rsi,8),%ebx + movzb -128(%rbp,%rsi),%ebx movz %dh,%esi # rk[i]>>8 - and \$0xFF000000,%ebx + shl \$24,%ebx xor %ebx,%eax - mov 2(%rbp,%rsi,8),%ebx + movzb -128(%rbp,%rsi),%ebx shr \$16,%edx - and \$0x000000FF,%ebx movz %dl,%esi # rk[i]>>16 xor %ebx,%eax - mov 0(%rbp,%rsi,8),%ebx + movzb -128(%rbp,%rsi),%ebx movz %dh,%esi # rk[i]>>24 - and \$0x0000FF00,%ebx + shl \$8,%ebx xor %ebx,%eax - mov 0(%rbp,%rsi,8),%ebx - and \$0x00FF0000,%ebx + movzb -128(%rbp,%rsi),%ebx + shl \$16,%ebx xor %ebx,%eax - xor 2048(%rbp,%rcx,4),%eax # rcon + xor 1024-128(%rbp,%rcx,4),%eax # rcon ___ } @@ -763,6 +1292,17 @@ AES_set_encrypt_key: .picmeup %rbp lea AES_Te-.(%rbp),%rbp + lea 2048+128(%rbp),%rbp + + # prefetch Te4 + mov 0-128(%rbp),%eax + mov 32-128(%rbp),%ebx + mov 64-128(%rbp),%r8d + mov 96-128(%rbp),%edx + mov 128-128(%rbp),%eax + mov 160-128(%rbp),%ebx + mov 192-128(%rbp),%r8d + mov 224-128(%rbp),%edx cmp \$128,%ecx je .L10rounds @@ -900,24 +1440,23 @@ $code.=<<___; mov %eax,%edx mov 16(%rdi),%eax # rk[4] movz %dl,%esi # rk[11]>>0 - mov 2(%rbp,%rsi,8),%ebx + movzb -128(%rbp,%rsi),%ebx movz %dh,%esi # rk[11]>>8 - and \$0x000000FF,%ebx xor %ebx,%eax - mov 0(%rbp,%rsi,8),%ebx + movzb -128(%rbp,%rsi),%ebx shr \$16,%edx - and \$0x0000FF00,%ebx + shl \$8,%ebx movz %dl,%esi # rk[11]>>16 xor %ebx,%eax - mov 0(%rbp,%rsi,8),%ebx + movzb -128(%rbp,%rsi),%ebx movz %dh,%esi # rk[11]>>24 - and \$0x00FF0000,%ebx + shl \$16,%ebx xor %ebx,%eax - mov 2(%rbp,%rsi,8),%ebx - and \$0xFF000000,%ebx + movzb -128(%rbp,%rsi),%ebx + shl \$24,%ebx xor %ebx,%eax mov %eax,48(%rdi) # rk[12] @@ -944,25 +1483,57 @@ $code.=<<___; .size AES_set_encrypt_key,.-AES_set_encrypt_key ___ -sub deckey() +sub deckey_ref() { my ($i,$ptr,$te,$td) = @_; + my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d"); $code.=<<___; - mov $i($ptr),%eax - mov %eax,%edx - movz %ah,%ebx - shr \$16,%edx - and \$0xFF,%eax - movzb 2($te,%rax,8),%rax - movzb 2($te,%rbx,8),%rbx - mov 0($td,%rax,8),%eax - xor 3($td,%rbx,8),%eax - movzb %dh,%ebx - and \$0xFF,%edx - movzb 2($te,%rdx,8),%rdx - movzb 2($te,%rbx,8),%rbx - xor 2($td,%rdx,8),%eax - xor 1($td,%rbx,8),%eax - mov %eax,$i($ptr) + mov $i($ptr),$tp1 + mov $tp1,$acc + and \$0x80808080,$acc + mov $acc,$tp4 + shr \$7,$tp4 + lea 0($tp1,$tp1),$tp2 + sub $tp4,$acc + and \$0xfefefefe,$tp2 + and \$0x1b1b1b1b,$acc + xor $tp2,$acc + mov $acc,$tp2 + + and \$0x80808080,$acc + mov $acc,$tp8 + shr \$7,$tp8 + lea 0($tp2,$tp2),$tp4 + sub $tp8,$acc + and \$0xfefefefe,$tp4 + and \$0x1b1b1b1b,$acc + xor $tp1,$tp2 # tp2^tp1 + xor $tp4,$acc + mov $acc,$tp4 + + and \$0x80808080,$acc + mov $acc,$tp8 + shr \$7,$tp8 + sub $tp8,$acc + lea 0($tp4,$tp4),$tp8 + xor $tp1,$tp4 # tp4^tp1 + and \$0xfefefefe,$tp8 + and \$0x1b1b1b1b,$acc + xor $acc,$tp8 + + xor $tp8,$tp1 # tp1^tp8 + rol \$8,$tp1 # ROTATE(tp1^tp8,8) + xor $tp8,$tp2 # tp2^tp1^tp8 + xor $tp8,$tp4 # tp4^tp1^tp8 + xor $tp2,$tp8 + xor $tp4,$tp8 # tp8^(tp8^tp4^tp1)^(tp8^tp2^tp1)=tp8^tp4^tp2 + + xor $tp8,$tp1 + rol \$24,$tp2 # ROTATE(tp2^tp1^tp8,24) + xor $tp2,$tp1 + rol \$16,$tp4 # ROTATE(tp4^tp1^tp8,16) + xor $tp4,$tp1 + + mov $tp1,$i($ptr) ___ } @@ -973,19 +1544,22 @@ $code.=<<___; .type AES_set_decrypt_key,\@function,3 .align 16 AES_set_decrypt_key: - push %rdx + push %rdx # 
save key schedule call AES_set_encrypt_key cmp \$0,%eax - je .Lproceed - lea 24(%rsp),%rsp - ret -.Lproceed: - mov (%rsp),%r8 # restore key schedule - mov %rbx,(%rsp) + pop %r8 # restore key schedule + jne .Labort + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 - mov 240(%r8),%ecx # pull number of rounds + mov 240(%r8),%r14d # pull number of rounds xor %rdi,%rdi - lea (%rdi,%rcx,4),%rcx + lea (%rdi,%r14d,4),%rcx mov %r8,%rsi lea (%r8,%rcx,4),%rdi # pointer to last chunk .align 4 @@ -1003,27 +1577,38 @@ AES_set_decrypt_key: cmp %rsi,%rdi jne .Linvert - .picmeup %r9 - lea AES_Td-.(%r9),%rdi - lea AES_Te-AES_Td(%rdi),%r9 + .picmeup %rax + lea AES_Te+2048+1024-.(%rax),%rax # rcon - mov %r8,%rsi - mov 240(%r8),%ecx # pull number of rounds - sub \$1,%ecx + mov 40(%rax),$mask80 + mov 48(%rax),$maskfe + mov 56(%rax),$mask1b + + mov %r8,$key + sub \$1,%r14d .align 4 .Lpermute: - lea 16(%rsi),%rsi + lea 16($key),$key + mov 0($key),%rax + mov 8($key),%rcx ___ - &deckey (0,"%rsi","%r9","%rdi"); - &deckey (4,"%rsi","%r9","%rdi"); - &deckey (8,"%rsi","%r9","%rdi"); - &deckey (12,"%rsi","%r9","%rdi"); + &dectransform (); $code.=<<___; - sub \$1,%ecx + mov %eax,0($key) + mov %ebx,4($key) + mov %ecx,8($key) + mov %edx,12($key) + sub \$1,%r14d jnz .Lpermute xor %rax,%rax + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp pop %rbx +.Labort: ret .size AES_set_decrypt_key,.-AES_set_decrypt_key ___ @@ -1461,11 +2046,145 @@ ___ &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0); &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e); &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c); + +#Te4 # four copies of Te4 to choose from to avoid L1 aliasing + &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); + &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); + &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); + &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); + &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); + &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); + &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); + &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); + &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); + &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); + &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); + &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); + &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); + &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); + &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); + &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); + &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); + &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); + &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); + &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); + &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); + &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); + &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); + &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); + &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); + &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); + &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); + &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); + &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); + &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 
0x28, 0xdf); + &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); + &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); + + &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); + &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); + &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); + &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); + &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); + &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); + &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); + &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); + &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); + &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); + &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); + &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); + &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); + &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); + &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); + &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); + &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); + &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); + &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); + &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); + &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); + &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); + &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); + &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); + &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); + &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); + &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); + &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); + &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); + &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); + &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); + &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); + + &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); + &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); + &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); + &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); + &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); + &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); + &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); + &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); + &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); + &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); + &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); + &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); + &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); + &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); + &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); + &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); + &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); + &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); + &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); + &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); + &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); + &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); + &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); + 
&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); + &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); + &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); + &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); + &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); + &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); + &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); + &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); + &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); + + &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); + &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); + &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); + &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); + &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); + &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); + &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); + &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); + &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); + &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); + &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); + &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); + &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); + &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); + &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); + &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); + &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); + &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); + &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); + &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); + &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); + &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); + &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); + &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); + &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); + &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); + &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); + &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); + &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); + &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); + &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); + &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); #rcon: $code.=<<___; .long 0x00000001, 0x00000002, 0x00000004, 0x00000008 .long 0x00000010, 0x00000020, 0x00000040, 0x00000080 - .long 0x0000001b, 0x00000036, 0, 0, 0, 0, 0, 0 + .long 0x0000001b, 0x00000036, 0x80808080, 0x80808080 + .long 0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b ___ $code.=<<___; .globl AES_Td @@ -1536,7 +2255,80 @@ ___ &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff); &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664); &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0); -#Td4: + +#Td4: # four copies of Td4 to choose from to avoid L1 aliasing + &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); + &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); + &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); + &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); + &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); + &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); + &data_byte(0x08, 0x2e, 0xa1, 0x66, 
0x28, 0xd9, 0x24, 0xb2); + &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); + &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); + &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); + &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); + &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); + &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); + &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); + &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); + &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); + &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); + &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); + &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); + &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); + &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); + &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); + &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); + &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); + &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); + &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); + &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); + &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); + &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); + &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); + &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); + &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); +$code.=<<___; + .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe + .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 +___ + &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); + &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); + &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); + &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); + &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); + &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); + &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); + &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); + &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); + &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); + &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); + &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); + &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); + &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); + &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); + &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); + &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); + &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); + &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); + &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); + &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); + &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); + &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); + &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); + &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); + &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); + &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); + &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); + &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); + &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); + 
&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); + &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); +$code.=<<___; + .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe + .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 +___ &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); @@ -1569,6 +2361,47 @@ ___ &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); +$code.=<<___; + .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe + .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 +___ + &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); + &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); + &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); + &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); + &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); + &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); + &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); + &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); + &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); + &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); + &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); + &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); + &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); + &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); + &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); + &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); + &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); + &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); + &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); + &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); + &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); + &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); + &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); + &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); + &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); + &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); + &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); + &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); + &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); + &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); + &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); + &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); +$code.=<<___; + .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe + .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 +.asciz "AES for x86_64, CRYPTOGAMS by " +___ $code =~ s/\`([^\`]*)\`/eval($1)/gem; -- 2.40.0
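
For reference, the decrypt-side transform introduced above (deckey_ref/dectransform) applies InvMixColumns to a word by computing the 2x, 4x and 8x byte multiples with the same packed doubling and then combining them using the rotation amounts noted in the patch comments: ROTATE(tp1^tp8,8), ROTATE(tp2^tp1^tp8,24) and ROTATE(tp4^tp1^tp8,16). A self-contained scalar Perl sketch of that recipe follows (assuming a perl with 64-bit integers); the function names rol32/xtime32/inv_mix_column are illustrative only and do not appear in the module.

sub rol32 {                             # rotate a 32-bit value left by $n bits
    my ($x, $n) = @_;
    (($x << $n) | ($x >> (32 - $n))) & 0xffffffff;
}

sub xtime32 {                           # double every byte of a 32-bit word in GF(2^8)
    my $x  = shift;
    my $hi = $x & 0x80808080;
    (($x << 1) & 0xfefefefe) ^ (($hi - ($hi >> 7)) & 0x1b1b1b1b);
}

sub inv_mix_column {
    my $tp1 = shift;                    # e.g. one round-key word
    my $tp2 = xtime32($tp1);            # 2*x per byte
    my $tp4 = xtime32($tp2);            # 4*x per byte
    my $tp8 = xtime32($tp4);            # 8*x per byte
    my $out = $tp8 ^ $tp4 ^ $tp2;       # the tp8^tp4^tp2 term from deckey_ref
    $out ^= rol32($tp1 ^ $tp8,        8);   # ROTATE(tp1^tp8,8)
    $out ^= rol32($tp2 ^ $tp1 ^ $tp8, 24);  # ROTATE(tp2^tp1^tp8,24)
    $out ^= rol32($tp4 ^ $tp1 ^ $tp8, 16);  # ROTATE(tp4^tp1^tp8,16)
    return $out;
}

In the patch the three masks live in $mask80/$maskfe/$mask1b: AES_set_decrypt_key loads them from the words appended after rcon and uses dectransform() to convert an encryption schedule in place (retiring the old table-driven deckey()), while _x86_64_AES_decrypt_compact picks them up from the constant words interleaved with the Td4 copies.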