From 143ee099e9cdffb256adc27cba583ec52454a29f Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Sun, 21 Feb 2016 21:04:26 +0100 Subject: [PATCH] ec/asm/ecp_nistz256-*.pl: get corner case logic right. RT#4284 Reviewed-by: Rich Salz --- crypto/ec/asm/ecp_nistz256-armv4.pl | 17 ++++++++++++----- crypto/ec/asm/ecp_nistz256-armv8.pl | 16 +++++++++++++--- crypto/ec/asm/ecp_nistz256-x86.pl | 10 +++++++++- 3 files changed, 34 insertions(+), 9 deletions(-) diff --git a/crypto/ec/asm/ecp_nistz256-armv4.pl b/crypto/ec/asm/ecp_nistz256-armv4.pl index 3a636eae6c..ab11a8782e 100755 --- a/crypto/ec/asm/ecp_nistz256-armv4.pl +++ b/crypto/ec/asm/ecp_nistz256-armv4.pl @@ -1252,6 +1252,7 @@ ecp_nistz256_point_double: stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional sub sp,sp,#32*5 +.Lpoint_double_shortcut: add r3,sp,#$in_x ldmia $a_ptr!,{r4-r11} @ copy in_x stmia r3,{r4-r11} @@ -1371,7 +1372,7 @@ $code.=<<___; .align 5 ecp_nistz256_point_add: stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional - sub sp,sp,#32*18 + sub sp,sp,#32*18+16 ldmia $b_ptr!,{r4-r11} @ copy in2 add r3,sp,#$in2_x @@ -1504,9 +1505,9 @@ ecp_nistz256_point_add: tst $t0,$t1 beq .Ladd_proceed @ (in1infty || in2infty)? tst $t2,$t2 - beq .Ladd_proceed @ is_equal(S1,S2)? + beq .Ladd_double @ is_equal(S1,S2)? - ldr $r_ptr,[sp,#32*18] + ldr $r_ptr,[sp,#32*18+16] eor r4,r4,r4 eor r5,r5,r5 eor r6,r6,r6 @@ -1520,6 +1521,12 @@ ecp_nistz256_point_add: stmia $r_ptr!,{r4-r11} b .Ladd_done +.align 4 +.Ladd_double: + ldr $a_ptr,[sp,#32*18+20] + add sp,sp,#32*(18-5)+16 @ difference in frame sizes + b .Lpoint_double_shortcut + .align 4 .Ladd_proceed: add $a_ptr,sp,#$R @@ -1588,7 +1595,7 @@ ecp_nistz256_point_add: add r3,sp,#$in1_x and r11,r11,r12 mvn r12,r12 - ldr $r_ptr,[sp,#32*18] + ldr $r_ptr,[sp,#32*18+16] ___ for($i=0;$i<96;$i+=8) { # conditional moves $code.=<<___; @@ -1610,7 +1617,7 @@ ___ } $code.=<<___; .Ladd_done: - add sp,sp,#32*18+16 @ +16 means "skip even over saved r0-r3" + add sp,sp,#32*18+16+16 @ +16 means "skip even over saved r0-r3" #if __ARM_ARCH__>=5 || defined(__thumb__) ldmia sp!,{r4-r12,pc} #else diff --git a/crypto/ec/asm/ecp_nistz256-armv8.pl b/crypto/ec/asm/ecp_nistz256-armv8.pl index ce6b69e8da..4b2e925434 100644 --- a/crypto/ec/asm/ecp_nistz256-armv8.pl +++ b/crypto/ec/asm/ecp_nistz256-armv8.pl @@ -691,12 +691,13 @@ $code.=<<___; .type ecp_nistz256_point_double,%function .align 5 ecp_nistz256_point_double: - stp x29,x30,[sp,#-48]! + stp x29,x30,[sp,#-80]! add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] sub sp,sp,#32*4 +.Ldouble_shortcut: ldp $acc0,$acc1,[$ap,#32] mov $rp_real,$rp ldp $acc2,$acc3,[$ap,#48] @@ -823,7 +824,7 @@ ecp_nistz256_point_double: add sp,x29,#0 // destroy frame ldp x19,x20,[x29,#16] ldp x21,x22,[x29,#32] - ldp x29,x30,[sp],#48 + ldp x29,x30,[sp],#80 ret .size ecp_nistz256_point_double,.-ecp_nistz256_point_double ___ @@ -963,7 +964,7 @@ ecp_nistz256_point_add: b.eq .Ladd_proceed // (in1infty || in2infty)? tst $temp,$temp - b.eq .Ladd_proceed // is_equal(S1,S2)? + b.eq .Ladd_double // is_equal(S1,S2)? eor $a0,$a0,$a0 eor $a1,$a1,$a1 @@ -975,6 +976,15 @@ ecp_nistz256_point_add: stp $a0,$a1,[$rp_real,#80] b .Ladd_done +.align 4 +.Ladd_double: + mov $ap,$ap_real + mov $rp,$rp_real + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + add sp,sp,#32*(12-4) // difference in stack frames + b .Ldouble_shortcut + .align 4 .Ladd_proceed: add $rp,sp,#$Rsqr diff --git a/crypto/ec/asm/ecp_nistz256-x86.pl b/crypto/ec/asm/ecp_nistz256-x86.pl index 421ac0b34d..4d55f82ef8 100755 --- a/crypto/ec/asm/ecp_nistz256-x86.pl +++ b/crypto/ec/asm/ecp_nistz256-x86.pl @@ -1197,6 +1197,7 @@ for ($i=0;$i<7;$i++) { ######################################################################## # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); # +&static_label("point_double_shortcut"); &function_begin("ecp_nistz256_point_double"); { my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); @@ -1212,6 +1213,7 @@ for ($i=0;$i<7;$i++) { &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic")); &mov ("ebp",&DWP(0,"edx")); } +&set_label("point_double_shortcut"); &mov ("eax",&DWP(0,"esi")); # copy in_x &mov ("ebx",&DWP(4,"esi")); &mov ("ecx",&DWP(8,"esi")); @@ -1491,7 +1493,7 @@ for ($i=0;$i<7;$i++) { &mov ("ebx",&DWP(32*18+8,"esp")); &jz (&label("add_proceed")); # (in1infty || in2infty)? &test ("ebx","ebx"); - &jz (&label("add_proceed")); # is_equal(S1,S2)? + &jz (&label("add_double")); # is_equal(S1,S2)? &mov ("edi",&wparam(0)); &xor ("eax","eax"); @@ -1499,6 +1501,12 @@ for ($i=0;$i<7;$i++) { &data_byte(0xfc,0xf3,0xab); # cld; stosd &jmp (&label("add_done")); +&set_label("add_double",16); + &mov ("esi",&wparam(1)); + &mov ("ebp",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy + &add ("esp",4*((8*18+5)-(8*5+1))); # difference in frame sizes + &jmp (&label("point_double_shortcut")); + &set_label("add_proceed",16); &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy &lea ("esi",&DWP($R,"esp")); -- 2.40.0