From 9282c335963e7de87ff20460d0e661e603e8ff40 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Sat, 1 Dec 2012 18:20:39 +0000 Subject: [PATCH] aesni-x86_64.pl: CTR face lift, +25% on Bulldozer. --- crypto/aes/asm/aesni-x86_64.pl | 456 ++++++++++++++++++--------------- 1 file changed, 247 insertions(+), 209 deletions(-) diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl index 56199eb240..318e52fe9d 100644 --- a/crypto/aes/asm/aesni-x86_64.pl +++ b/crypto/aes/asm/aesni-x86_64.pl @@ -130,7 +130,7 @@ # Further data for other parallelizable modes: # # CBC decrypt 1.16 0.93 0.93 -# CTR 1.14 0.91 n/a +# CTR 1.14 0.91 0.90 # # Well, given 3x column it's probably inappropriate to call the limit # asymptotic, if it can be surpassed, isn't it? What happens there? @@ -160,7 +160,7 @@ ###################################################################### # For reference, AMD Bulldozer spends 5.77 cycles per byte processed # with 128-bit key in CBC encrypt and 0.76 cycles in CBC decrypt, 0.70 -# in ECB, 0.94 in CTR, 0.95 in XTS... This means that aes[enc|dec] +# in ECB, 0.76 in CTR, 0.95 in XTS... This means that aes[enc|dec] # instruction latency is 9 cycles and that they can be issued every # cycle. @@ -1013,286 +1013,321 @@ ___ # does not update *ivec! (see engine/eng_aesni.c for details) # { -my $frame_size = 0x20+($win64?160:0); -my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11)); -my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14"); -my $bswap_mask="%xmm15"; +my ($in0,$in1,$in2,$in3,$one,$ivec)=map("%xmm$_",(10..15)); +my $len_="%r9"; $code.=<<___; .globl aesni_ctr32_encrypt_blocks .type aesni_ctr32_encrypt_blocks,\@function,5 .align 16 aesni_ctr32_encrypt_blocks: - lea (%rsp),%rax - push %rbp - sub \$$frame_size,%rsp - and \$-16,%rsp # Linux kernel stack can be incorrectly seeded ___ $code.=<<___ if ($win64); - movaps %xmm6,0x20(%rsp) - movaps %xmm7,0x30(%rsp) - movaps %xmm8,0x40(%rsp) - movaps %xmm9,0x50(%rsp) - movaps %xmm10,0x60(%rsp) - movaps %xmm11,0x70(%rsp) - movaps %xmm12,0x80(%rsp) - movaps %xmm13,0x90(%rsp) - movaps %xmm14,0xa0(%rsp) - movaps %xmm15,0xb0(%rsp) + lea -0xa8(%rsp),%rsp + movaps %xmm6,0x00(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + movaps %xmm12,0x60(%rsp) + movaps %xmm13,0x70(%rsp) + movaps %xmm14,0x80(%rsp) + movaps %xmm15,0x90(%rsp) .Lctr32_body: ___ $code.=<<___; - lea -8(%rax),%rbp cmp \$1,$len je .Lctr32_one_shortcut + movzb 15($ivp),%rax # counter LSB + mov $len,$len_ # backup $len + mov 240($key),$rnds_ # key->rounds + mov $key,$key_ # backup $key movdqu ($ivp),$ivec - movdqa .Lbswap_mask(%rip),$bswap_mask - xor $rounds,$rounds - pextrd \$3,$ivec,$rnds_ # pull 32-bit counter - pinsrd \$3,$rounds,$ivec # wipe 32-bit counter + neg %rax + movdqa .Lincrement1(%rip),$one + add \$256,%rax # steps to closest overflow - mov 240($key),$rounds # key->rounds - bswap $rnds_ - pxor $iv0,$iv0 # vector of 3 32-bit counters - pxor $iv1,$iv1 # vector of 3 32-bit counters - pinsrd \$0,$rnds_,$iv0 - lea 3($rnds_),$key_ - pinsrd \$0,$key_,$iv1 - inc $rnds_ - pinsrd \$1,$rnds_,$iv0 - inc $key_ - pinsrd \$1,$key_,$iv1 - inc $rnds_ - pinsrd \$2,$rnds_,$iv0 - inc $key_ - pinsrd \$2,$key_,$iv1 - movdqa $iv0,0x00(%rsp) - pshufb $bswap_mask,$iv0 - movdqa $iv1,0x10(%rsp) - pshufb $bswap_mask,$iv1 - - pshufd \$`3<<6`,$iv0,$inout0 # place counter to upper dword - pshufd \$`2<<6`,$iv0,$inout1 - pshufd \$`1<<6`,$iv0,$inout2 - cmp \$6,$len +.Lctr32_grandloop: + cmp %rax,$len + cmova %rax,$len + mov $rnds_,$rounds # restore $rounds + sub $len,$len_ + + cmp \$8,$len jb .Lctr32_tail + + $movkey ($key_),$rndkey0 shr \$1,$rounds - mov $key,$key_ # backup $key - mov $rounds,$rnds_ # backup $rounds - sub \$6,$len - jmp .Lctr32_loop6 + shr \$1,$rnds_ + sub \$8,$len + jmp .Lctr32_loop8 .align 16 -.Lctr32_loop6: - pshufd \$`3<<6`,$iv1,$inout3 - por $ivec,$inout0 # merge counter-less ivec - $movkey ($key_),$rndkey0 - pshufd \$`2<<6`,$iv1,$inout4 - por $ivec,$inout1 +.Lctr32_loop8: $movkey 16($key_),$rndkey1 - pshufd \$`1<<6`,$iv1,$inout5 - por $ivec,$inout2 - por $ivec,$inout3 - xorps $rndkey0,$inout0 - por $ivec,$inout4 - por $ivec,$inout5 - - # inline _aesni_encrypt6 and interleave last rounds - # with own code... - - pxor $rndkey0,$inout1 - aesenc $rndkey1,$inout0 - lea 32($key_),$key - pxor $rndkey0,$inout2 - aesenc $rndkey1,$inout1 - movdqa .Lincrement32(%rip),$iv1 - pxor $rndkey0,$inout3 - aesenc $rndkey1,$inout2 - movdqa (%rsp),$iv0 - pxor $rndkey0,$inout4 - aesenc $rndkey1,$inout3 - pxor $rndkey0,$inout5 - $movkey ($key),$rndkey0 - dec $rounds - aesenc $rndkey1,$inout4 - aesenc $rndkey1,$inout5 - jmp .Lctr32_enc_loop6_enter -.align 16 -.Lctr32_enc_loop6: - aesenc $rndkey1,$inout0 - aesenc $rndkey1,$inout1 - dec $rounds - aesenc $rndkey1,$inout2 - aesenc $rndkey1,$inout3 - aesenc $rndkey1,$inout4 - aesenc $rndkey1,$inout5 -.Lctr32_enc_loop6_enter: - $movkey 16($key),$rndkey1 - aesenc $rndkey0,$inout0 - aesenc $rndkey0,$inout1 - lea 32($key),$key - aesenc $rndkey0,$inout2 - aesenc $rndkey0,$inout3 - aesenc $rndkey0,$inout4 - aesenc $rndkey0,$inout5 - $movkey ($key),$rndkey0 - jnz .Lctr32_enc_loop6 - - aesenc $rndkey1,$inout0 - paddd $iv1,$iv0 # increment counter vector - aesenc $rndkey1,$inout1 - paddd 0x10(%rsp),$iv1 - aesenc $rndkey1,$inout2 - movdqa $iv0,0x00(%rsp) # save counter vector - aesenc $rndkey1,$inout3 - movdqa $iv1,0x10(%rsp) - aesenc $rndkey1,$inout4 - pshufb $bswap_mask,$iv0 # byte swap - aesenc $rndkey1,$inout5 - pshufb $bswap_mask,$iv1 - - aesenclast $rndkey0,$inout0 - movups ($inp),$in0 # load input - aesenclast $rndkey0,$inout1 - movups 0x10($inp),$in1 - aesenclast $rndkey0,$inout2 - movups 0x20($inp),$in2 - aesenclast $rndkey0,$inout3 - movups 0x30($inp),$in3 - aesenclast $rndkey0,$inout4 - movups 0x40($inp),$rndkey1 - aesenclast $rndkey0,$inout5 - movups 0x50($inp),$rndkey0 - lea 0x60($inp),$inp - - xorps $inout0,$in0 # xor - pshufd \$`3<<6`,$iv0,$inout0 - xorps $inout1,$in1 - pshufd \$`2<<6`,$iv0,$inout1 - movups $in0,($out) # store output - xorps $inout2,$in2 - pshufd \$`1<<6`,$iv0,$inout2 - movups $in1,0x10($out) - xorps $inout3,$in3 - movups $in2,0x20($out) - xorps $inout4,$rndkey1 - movups $in3,0x30($out) - xorps $inout5,$rndkey0 - movups $rndkey1,0x40($out) - movups $rndkey0,0x50($out) - lea 0x60($out),$out + movdqa $rndkey0,$inout0 + movdqa $rndkey0,$inout1 + pxor $ivec,$inout0 + paddb $one,$ivec + movdqa $rndkey0,$inout2 + aesenc $rndkey1,$inout0 + pxor $ivec,$inout1 + paddb $one,$ivec + lea 32($key_),$key + movdqa $rndkey0,$inout3 + aesenc $rndkey1,$inout1 + pxor $ivec,$inout2 + paddb $one,$ivec + movdqa $rndkey0,$inout4 + aesenc $rndkey1,$inout2 + pxor $ivec,$inout3 + paddb $one,$ivec + movdqa $rndkey0,$inout5 + aesenc $rndkey1,$inout3 + pxor $ivec,$inout4 + paddb $one,$ivec + movdqa $rndkey0,$inout6 + aesenc $rndkey1,$inout4 + pxor $ivec,$inout5 + paddb $one,$ivec + movdqa $rndkey0,$inout7 + aesenc $rndkey1,$inout5 + pxor $ivec,$inout6 + paddb $one,$ivec + $movkey ($key),$rndkey0 + aesenc $rndkey1,$inout6 + pxor $ivec,$inout7 + paddb $one,$ivec + dec $rounds + aesenc $rndkey1,$inout7 + $movkey 16($key),$rndkey1 + movups ($inp),$in0 # load input + movups 0x10($inp),$in1 + movups 0x20($inp),$in2 + movups 0x30($inp),$in3 + + call .Lenc_loop8_enter + + xorps $in0,$inout0 # xor + movups 0x40($inp),$in0 + xorps $in1,$inout1 + movups 0x50($inp),$in1 + xorps $in2,$inout2 + movups 0x60($inp),$in2 + xorps $in3,$inout3 + movups 0x70($inp),$in3 + lea 0x80($inp),$inp + xorps $in0,$inout4 + movups $inout0,($out) # store output + xorps $in1,$inout5 + movups $inout1,0x10($out) + xorps $in2,$inout6 + movups $inout2,0x20($out) + xorps $in3,$inout7 + movups $inout3,0x30($out) + movups $inout4,0x40($out) + movups $inout5,0x50($out) + movups $inout6,0x60($out) + movups $inout7,0x70($out) + lea 0x80($out),$out + + $movkey ($key_),$rndkey0 mov $rnds_,$rounds - sub \$6,$len - jnc .Lctr32_loop6 + sub \$8,$len + jnc .Lctr32_loop8 - add \$6,$len - jz .Lctr32_done - mov $key_,$key # restore $key lea 1($rounds,$rounds),$rounds # restore original value + lea 1($rnds_,$rnds_),$rnds_ # restore original value + add \$8,$len + jz .Lctr32_done .Lctr32_tail: - por $ivec,$inout0 + mov $key_,$key # restore $key + movdqa $ivec,$inout0 + paddb $one,$ivec movups ($inp),$in0 cmp \$2,$len jb .Lctr32_one - por $ivec,$inout1 + movdqa $ivec,$inout1 + paddb $one,$ivec movups 0x10($inp),$in1 je .Lctr32_two - pshufd \$`3<<6`,$iv1,$inout3 - por $ivec,$inout2 + movdqa $ivec,$inout2 + paddb $one,$ivec movups 0x20($inp),$in2 cmp \$4,$len jb .Lctr32_three - pshufd \$`2<<6`,$iv1,$inout4 - por $ivec,$inout3 + movdqa $ivec,$inout3 + paddb $one,$ivec movups 0x30($inp),$in3 je .Lctr32_four - por $ivec,$inout4 - xorps $inout5,$inout5 + movdqa $ivec,$inout4 + paddb $one,$ivec + cmp \$6,$len + jb .Lctr32_five - call _aesni_encrypt6 + movdqa $ivec,$inout5 + paddb $one,$ivec + je .Lctr32_six - movups 0x40($inp),$rndkey1 - xorps $inout0,$in0 - xorps $inout1,$in1 - movups $in0,($out) - xorps $inout2,$in2 - movups $in1,0x10($out) - xorps $inout3,$in3 - movups $in2,0x20($out) - xorps $inout4,$rndkey1 - movups $in3,0x30($out) - movups $rndkey1,0x40($out) + movdqa $ivec,$inout6 + paddb $one,$ivec + xorps $inout7,$inout7 + + call _aesni_encrypt8 + + xorps $in0,$inout0 # xor + movups 0x40($inp),$in0 + xorps $in1,$inout1 + movups 0x50($inp),$in1 + xorps $in2,$inout2 + movups 0x60($inp),$in2 + lea 0x70($inp),$inp + xorps $in3,$inout3 + movups $inout0,($out) # store output + xorps $in0,$inout4 + movups $inout1,0x10($out) + xorps $in1,$inout5 + movups $inout2,0x20($out) + xorps $in2,$inout6 + movups $inout3,0x30($out) + movups $inout4,0x40($out) + movups $inout5,0x50($out) + movups $inout6,0x60($out) + lea 0x70($out),$out jmp .Lctr32_done .align 16 .Lctr32_one_shortcut: movups ($ivp),$inout0 + xor $len_,$len_ movups ($inp),$in0 mov 240($key),$rounds # key->rounds .Lctr32_one: ___ &aesni_generate1("enc",$key,$rounds); $code.=<<___; - xorps $inout0,$in0 - movups $in0,($out) + xorps $in0,$inout0 + lea 0x10($inp),$inp + movups $inout0,($out) + lea 0x10($out),$out jmp .Lctr32_done .align 16 .Lctr32_two: xorps $inout2,$inout2 call _aesni_encrypt3 - xorps $inout0,$in0 - xorps $inout1,$in1 - movups $in0,($out) - movups $in1,0x10($out) + xorps $in0,$inout0 # xor + lea 0x20($inp),$inp + xorps $in1,$inout1 + movups $inout0,($out) # store output + movups $inout1,0x10($out) + lea 0x20($out),$out jmp .Lctr32_done .align 16 .Lctr32_three: call _aesni_encrypt3 - xorps $inout0,$in0 - xorps $inout1,$in1 - movups $in0,($out) - xorps $inout2,$in2 - movups $in1,0x10($out) - movups $in2,0x20($out) + xorps $in0,$inout0 # xor + lea 0x30($inp),$inp + xorps $in1,$inout1 + movups $inout0,($out) # store output + xorps $in2,$inout2 + movups $inout1,0x10($out) + movups $inout2,0x20($out) + lea 0x30($out),$out jmp .Lctr32_done .align 16 .Lctr32_four: call _aesni_encrypt4 - xorps $inout0,$in0 - xorps $inout1,$in1 - movups $in0,($out) - xorps $inout2,$in2 - movups $in1,0x10($out) - xorps $inout3,$in3 - movups $in2,0x20($out) - movups $in3,0x30($out) + xorps $in0,$inout0 # xor + lea 0x40($inp),$inp + xorps $in1,$inout1 + movups $inout0,($out) # store output + xorps $in2,$inout2 + movups $inout1,0x10($out) + xorps $in3,$inout3 + movups $inout2,0x20($out) + movups $inout3,0x30($out) + lea 0x40($out),$out + jmp .Lctr32_done + +.align 16 +.Lctr32_five: + xorps $inout5,$inout5 + call _aesni_encrypt6 + xorps $in0,$inout0 # xor + movups 0x40($inp),$in0 + lea 0x50($inp),$inp + xorps $in1,$inout1 + movups $inout0,($out) # store output + xorps $in2,$inout2 + movups $inout1,0x10($out) + xorps $in3,$inout3 + movups $inout2,0x20($out) + xorps $in0,$inout4 + movups $inout3,0x30($out) + movups $inout4,0x40($out) + lea 0x50($out),$out + jmp .Lctr32_done + +.align 16 +.Lctr32_six: + call _aesni_encrypt6 + xorps $in0,$inout0 # xor + movups 0x40($inp),$in0 + xorps $in1,$inout1 + movups 0x50($inp),$in1 + lea 0x60($inp),$inp + xorps $in2,$inout2 + movups $inout0,($out) # store output + xorps $in3,$inout3 + movups $inout1,0x10($out) + xorps $in0,$inout4 + movups $inout2,0x20($out) + xorps $in1,$inout5 + movups $inout3,0x30($out) + movups $inout4,0x40($out) + movups $inout5,0x50($out) + lea 0x60($out),$out .Lctr32_done: + test $len_,$len_ + jz .Lctr32_really_done + + movdqa .Lbswap_mask(%rip),$rndkey1 + pshufb $rndkey1,$ivec + psrldq \$14,$one # 256 + paddd $one,$ivec + pslldq \$14,$one + pshufb $rndkey1,$ivec + mov $len_,$len + mov \$256,%rax + jmp .Lctr32_grandloop + +.Lctr32_really_done: ___ $code.=<<___ if ($win64); - movaps 0x20(%rsp),%xmm6 - movaps 0x30(%rsp),%xmm7 - movaps 0x40(%rsp),%xmm8 - movaps 0x50(%rsp),%xmm9 - movaps 0x60(%rsp),%xmm10 - movaps 0x70(%rsp),%xmm11 - movaps 0x80(%rsp),%xmm12 - movaps 0x90(%rsp),%xmm13 - movaps 0xa0(%rsp),%xmm14 - movaps 0xb0(%rsp),%xmm15 + movaps 0x00(%rsp),%xmm6 + movaps 0x10(%rsp),%xmm7 + movaps 0x20(%rsp),%xmm8 + movaps 0x30(%rsp),%xmm9 + movaps 0x40(%rsp),%xmm10 + movaps 0x50(%rsp),%xmm11 + movaps 0x60(%rsp),%xmm12 + movaps 0x70(%rsp),%xmm13 + movaps 0x80(%rsp),%xmm14 + movaps 0x90(%rsp),%xmm15 + lea 0xa8(%rsp),%rsp ___ $code.=<<___; - lea (%rbp),%rsp - pop %rbp .Lctr32_ret: ret .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks @@ -2739,6 +2774,8 @@ $code.=<<___; .long 1,0,0,0 .Lxts_magic: .long 0x87,0,1,0 +.Lincrement1: + .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 .asciz "AES for Intel AES-NI, CRYPTOGAMS by " .align 64 @@ -2843,12 +2880,13 @@ ctr32_se_handler: cmp %r10,%rbx jae .Lcommon_seh_tail - lea 0x20(%rax),%rsi # %xmm save area + lea (%rax),%rsi # %xmm save area lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq + lea 0xa8(%rax),%rax # adjust stack pointer - jmp .Lcommon_rbp_tail + jmp .Lcommon_seh_tail .size ctr32_se_handler,.-ctr32_se_handler .type xts_se_handler,\@abi-omnipotent -- 2.40.0