# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
-# http://rt.openssl.org/Ticket/Display.html?id=3031&user=guest&pass=guest.
+# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
# Keywords are full unroll and modulo-schedule counter calculations
# with zero-round key xor.
{
mov 12($key),$key0 # 0-round key LSB
movdqa $inout0,0x00(%rsp) # populate counter block
bswap $ctr
- movdqa $inout0,0x10(%rsp)
- movdqa $inout0,0x20(%rsp)
- movdqa $inout0,0x30(%rsp)
+ movdqa $inout0,$inout1
+ movdqa $inout0,$inout2
+ movdqa $inout0,$inout3
movdqa $inout0,0x40(%rsp)
movdqa $inout0,0x50(%rsp)
movdqa $inout0,0x60(%rsp)
bswap %r10d
xor $key0,%r9d
xor $key0,%r10d
- mov %r9d,0x10+12(%rsp)
+ pinsrd \$3,%r9d,$inout1
lea 3($ctr),%r9
- mov %r10d,0x20+12(%rsp)
+ movdqa $inout1,0x10(%rsp)
+ pinsrd \$3,%r10d,$inout2
bswap %r9d
lea 4($ctr),%r10
+ movdqa $inout2,0x20(%rsp)
xor $key0,%r9d
bswap %r10d
- mov %r9d,0x30+12(%rsp)
+ pinsrd \$3,%r9d,$inout3
xor $key0,%r10d
+ movdqa $inout3,0x30(%rsp)
lea 5($ctr),%r9
mov %r10d,0x40+12(%rsp)
bswap %r9d
$movkey 0x10($key),$rndkey1
- movdqa 0x10(%rsp),$inout1
- movdqa 0x20(%rsp),$inout2
- movdqa 0x30(%rsp),$inout3
movdqa 0x40(%rsp),$inout4
movdqa 0x50(%rsp),$inout5
.Lctr32_tail:
lea 16($key),$key
cmp \$4,$len
- jbe .Lctr32_loop4
+ jb .Lctr32_loop3
+ je .Lctr32_loop4
movdqa 0x60(%rsp),$inout6
+ pxor $inout7,$inout7
$movkey 16($key),$rndkey0
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout2
dec $rounds
aesenc $rndkey1,$inout3
+ movups ($inp),$in0
aesenc $rndkey1,$inout4
+ movups 0x10($inp),$in1
aesenc $rndkey1,$inout5
+ movups 0x20($inp),$in2
aesenc $rndkey1,$inout6
- pxor $inout7,$inout7
$movkey 16($key),$rndkey1
call .Lenc_loop8_enter
- movups ($inp),$in0
- movups 0x10($inp),$in1
- movups 0x20($inp),$in2
xorps $in0,$inout0
movups 0x30($inp),$in3
xorps $in1,$inout1
dec $rounds
jnz .Lctr32_loop4
aesenclast $rndkey1,$inout0
+ movups ($inp),$in0
aesenclast $rndkey1,$inout1
+ movups 0x10($inp),$in1
aesenclast $rndkey1,$inout2
+ movups 0x20($inp),$in2
aesenclast $rndkey1,$inout3
+ movups 0x30($inp),$in3
+
+ xorps $in0,$inout0
+ movups $inout0,($out)
+ xorps $in1,$inout1
+ movups $inout1,0x10($out)
+ xorps $in2,$inout2
+ movups $inout2,0x20($out)
+ xorps $in3,$inout3
+ movups $inout3,0x30($out)
+ jmp .Lctr32_done
+
+.align 32
+.Lctr32_loop3:
+ aesenc $rndkey1,$inout0
+ lea 16($key),$key
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ $movkey ($key),$rndkey1
+ dec $rounds
+ jnz .Lctr32_loop3
+ aesenclast $rndkey1,$inout0
+ aesenclast $rndkey1,$inout1
+ aesenclast $rndkey1,$inout2
movups ($inp),$in0
xorps $in0,$inout0
movups 0x20($inp),$in2
xorps $in2,$inout2
movups $inout2,0x20($out)
- cmp \$4,$len
- jb .Lctr32_done
-
- movups 0x30($inp),$in3
- xorps $in3,$inout3
- movups $inout3,0x30($out)
jmp .Lctr32_done
.align 16