From 33d9c8348a91bcc7d58f481cef0d48f01d6708bf Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Tue, 3 Aug 2010 15:34:57 +0000 Subject: [PATCH] sha1-armv4-large.pl: reschedule instructions for dual-issue pipeline. --- crypto/sha/asm/sha1-armv4-large.pl | 78 ++++++++++++------------------ 1 file changed, 30 insertions(+), 48 deletions(-) diff --git a/crypto/sha/asm/sha1-armv4-large.pl b/crypto/sha/asm/sha1-armv4-large.pl index feeec9372d..6e65fe3e01 100644 --- a/crypto/sha/asm/sha1-armv4-large.pl +++ b/crypto/sha/asm/sha1-armv4-large.pl @@ -41,6 +41,13 @@ # issue Cortex A8 core was measured to process input block in # ~990 cycles. +# August 2010. +# +# Rescheduling for dual-issue pipeline resulted in 13% improvement on +# Cortex A8 core and in absolute terms ~870 cycles per input block +# [or 13.6 cycles per byte]. + + while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; @@ -60,43 +67,22 @@ $t3="r12"; $Xi="r14"; @V=($a,$b,$c,$d,$e); -# One can optimize this for aligned access on big-endian architecture, -# but code's endian neutrality makes it too pretty:-) -sub Xload { -my ($a,$b,$c,$d,$e)=@_; -$code.=<<___; - ldrb $t0,[$inp],#4 - ldrb $t1,[$inp,#-3] - ldrb $t2,[$inp,#-2] - ldrb $t3,[$inp,#-1] - add $e,$K,$e,ror#2 @ E+=K_00_19 - orr $t0,$t1,$t0,lsl#8 - add $e,$e,$a,ror#27 @ E+=ROR(A,27) - orr $t0,$t2,$t0,lsl#8 - eor $t1,$c,$d @ F_xx_xx - orr $t0,$t3,$t0,lsl#8 - add $e,$e,$t0 @ E+=X[i] - str $t0,[$Xi,#-4]! -___ -} sub Xupdate { -my ($a,$b,$c,$d,$e,$flag)=@_; +my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_; $code.=<<___; ldr $t0,[$Xi,#15*4] ldr $t1,[$Xi,#13*4] ldr $t2,[$Xi,#7*4] - ldr $t3,[$Xi,#2*4] add $e,$K,$e,ror#2 @ E+=K_xx_xx + ldr $t3,[$Xi,#2*4] eor $t0,$t0,$t1 eor $t2,$t2,$t3 - eor $t0,$t0,$t2 - add $e,$e,$a,ror#27 @ E+=ROR(A,27) -___ -$code.=<<___ if (!defined($flag)); - eor $t1,$c,$d @ F_xx_xx, but not in 40_59 -___ -$code.=<<___; + eor $t1,$c,$d @ F_xx_xx mov $t0,$t0,ror#31 + add $e,$e,$a,ror#27 @ E+=ROR(A,27) + eor $t0,$t0,$t2,ror#31 + $opt1 @ F_xx_xx + $opt2 @ F_xx_xx add $e,$e,$t0 @ E+=X[i] str $t0,[$Xi,#-4]! ___ @@ -104,19 +90,29 @@ ___ sub BODY_00_15 { my ($a,$b,$c,$d,$e)=@_; - &Xload(@_); $code.=<<___; + ldrb $t0,[$inp],#4 + ldrb $t1,[$inp,#-1] + ldrb $t2,[$inp,#-2] + add $e,$K,$e,ror#2 @ E+=K_00_19 + ldrb $t3,[$inp,#-3] + add $e,$e,$a,ror#27 @ E+=ROR(A,27) + orr $t0,$t1,$t0,lsl#24 + eor $t1,$c,$d @ F_xx_xx + orr $t0,$t0,$t2,lsl#8 + orr $t0,$t0,$t3,lsl#16 and $t1,$b,$t1,ror#2 + add $e,$e,$t0 @ E+=X[i] eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) + str $t0,[$Xi,#-4]! add $e,$e,$t1 @ E+=F_00_19(B,C,D) ___ } sub BODY_16_19 { my ($a,$b,$c,$d,$e)=@_; - &Xupdate(@_); + &Xupdate(@_,"and $t1,$b,$t1,ror#2"); $code.=<<___; - and $t1,$b,$t1,ror#2 eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) add $e,$e,$t1 @ E+=F_00_19(B,C,D) ___ @@ -124,34 +120,20 @@ ___ sub BODY_20_39 { my ($a,$b,$c,$d,$e)=@_; - &Xupdate(@_); + &Xupdate(@_,"eor $t1,$b,$t1,ror#2"); $code.=<<___; - eor $t1,$b,$t1,ror#2 @ F_20_39(B,C,D) add $e,$e,$t1 @ E+=F_20_39(B,C,D) ___ } sub BODY_40_59 { my ($a,$b,$c,$d,$e)=@_; -if (1) { - &Xupdate(@_); + &Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d"); $code.=<<___; - and $t2,$c,$d - and $t1,$b,$t1,ror#2 - add $e,$e,$t2,ror#2 - add $e,$e,$t1 @ E+=F_40_59(B,C,D) -___ -} else { - &Xupdate(@_,1); -$code.=<<___; - and $t1,$b,$c,ror#2 - orr $t2,$b,$c,ror#2 - and $t2,$t2,$d,ror#2 - orr $t1,$t1,$t2 @ F_40_59(B,C,D) add $e,$e,$t1 @ E+=F_40_59(B,C,D) + add $e,$e,$t2,ror#2 ___ } -} $code=<<___; .text -- 2.40.0