From 947d78275bbd53c43f4edf7523910e1e73fb08f5 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Sat, 20 Oct 2012 09:13:21 +0000 Subject: [PATCH] Add VIS3 Montgomery multiplication. --- Configure | 2 +- TABLE | 18 +- crypto/bn/Makefile | 2 + crypto/bn/asm/vis3-mont.pl | 373 +++++++++++++++++++++++++++++++++++++ crypto/sparcv9cap.c | 23 ++- 5 files changed, 399 insertions(+), 19 deletions(-) create mode 100644 crypto/bn/asm/vis3-mont.pl diff --git a/Configure b/Configure index 1c0df350d1..337ef4d224 100755 --- a/Configure +++ b/Configure @@ -130,7 +130,7 @@ my $x86_elf_asm="$x86_asm:elf"; my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o:e_padlock-x86_64.o"; my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void"; -my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o::::::camellia.o cmll_misc.o cmll_cbc.o cmllt4-sparcv9.o:ghash-sparcv9.o::void"; +my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o::::::camellia.o cmll_misc.o cmll_cbc.o cmllt4-sparcv9.o:ghash-sparcv9.o::void"; my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void"; my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o::void"; my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::"; diff --git a/TABLE b/TABLE index dd9b553b60..f870801bc3 100644 --- a/TABLE +++ b/TABLE @@ -174,7 +174,7 @@ $sys_id = $lflags = $bn_ops = BN_LLONG RC2_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC2 BF_PTR $cpuid_obj = sparcv9cap.o sparccpuid.o -$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o +$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o $des_obj = des_enc-sparc.o fcrypt_b.o $aes_obj = aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o $bf_obj = @@ -2616,7 +2616,7 @@ $sys_id = ULTRASPARC $lflags = -lsocket -lnsl -ldl $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK_LL DES_PTR DES_RISC1 DES_UNROLL BF_PTR $cpuid_obj = sparcv9cap.o sparccpuid.o -$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o +$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o $des_obj = des_enc-sparc.o fcrypt_b.o $aes_obj = aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o $bf_obj = @@ -2649,7 +2649,7 @@ $sys_id = ULTRASPARC $lflags = -lsocket -lnsl -ldl $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR $cpuid_obj = sparcv9cap.o sparccpuid.o -$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o +$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o $des_obj = des_enc-sparc.o fcrypt_b.o $aes_obj = aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o $bf_obj = @@ -4398,7 +4398,7 @@ $sys_id = ULTRASPARC $lflags = -ldl $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR $cpuid_obj = sparcv9cap.o sparccpuid.o -$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o +$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o $des_obj = des_enc-sparc.o fcrypt_b.o $aes_obj = aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o $bf_obj = @@ -4596,7 +4596,7 @@ $sys_id = ULTRASPARC $lflags = -ldl $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR $cpuid_obj = sparcv9cap.o sparccpuid.o -$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o +$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o $des_obj = des_enc-sparc.o fcrypt_b.o $aes_obj = aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o $bf_obj = @@ -5454,7 +5454,7 @@ $sys_id = ULTRASPARC $lflags = -lsocket -lnsl -ldl $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK_LL DES_PTR DES_RISC1 DES_UNROLL BF_PTR $cpuid_obj = sparcv9cap.o sparccpuid.o -$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o +$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o $des_obj = des_enc-sparc.o fcrypt_b.o $aes_obj = aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o $bf_obj = @@ -5487,7 +5487,7 @@ $sys_id = ULTRASPARC $lflags = -lsocket -lnsl -ldl $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR $cpuid_obj = sparcv9cap.o sparccpuid.o -$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o +$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o $des_obj = des_enc-sparc.o fcrypt_b.o $aes_obj = aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o $bf_obj = @@ -5586,7 +5586,7 @@ $sys_id = ULTRASPARC $lflags = -lsocket -lnsl -ldl $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR $cpuid_obj = sparcv9cap.o sparccpuid.o -$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o +$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o $des_obj = des_enc-sparc.o fcrypt_b.o $aes_obj = aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o $bf_obj = @@ -5619,7 +5619,7 @@ $sys_id = ULTRASPARC $lflags = -lsocket -lnsl -ldl $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR $cpuid_obj = sparcv9cap.o sparccpuid.o -$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o +$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o $des_obj = des_enc-sparc.o fcrypt_b.o $aes_obj = aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o $bf_obj = diff --git a/crypto/bn/Makefile b/crypto/bn/Makefile index c823a6b70a..ff5560dec5 100644 --- a/crypto/bn/Makefile +++ b/crypto/bn/Makefile @@ -77,6 +77,8 @@ sparcv9a-mont.s: asm/sparcv9a-mont.pl $(PERL) asm/sparcv9a-mont.pl $(CFLAGS) > $@ sparcv9-mont.s: asm/sparcv9-mont.pl $(PERL) asm/sparcv9-mont.pl $(CFLAGS) > $@ +vis3-mont.s: asm/vis3-mont.pl + $(PERL) asm/vis3-mont.pl $(CFLAGS) > $@ bn-mips3.o: asm/mips3.s @if [ "$(CC)" = "gcc" ]; then \ diff --git a/crypto/bn/asm/vis3-mont.pl b/crypto/bn/asm/vis3-mont.pl new file mode 100644 index 0000000000..a1357de0e9 --- /dev/null +++ b/crypto/bn/asm/vis3-mont.pl @@ -0,0 +1,373 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# October 2012. +# +# SPARCv9 VIS3 Montgomery multiplicaion procedure suitable for T3 and +# onward. There are three new instructions used here: umulxhi, +# addxc[cc] and initializing store. On T3 RSA private key operations +# are 1.54/1.87/2.11/2.26 times faster for 512/1024/2048/4096-bit key +# lengths. This is without dedicated squaring procedure. On T4 +# corresponding coefficients are 1.47/2.10/2.80/2.90x, which is mostly +# for reference purposes, because T4 has dedicated Montgomery +# multiplication and squaring *instructions* that deliver even more. + +$bits=32; +for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } +if ($bits==64) { $bias=2047; $frame=192; } +else { $bias=0; $frame=112; } + +$code.=<<___ if ($bits==64); +.register %g2,#scratch +.register %g3,#scratch +___ +$code.=<<___; +.section ".text",#alloc,#execinstr +___ + +($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)= + (map("%g$_",(1..5)),map("%o$_",(0..5,7))); + +# int bn_mul_mont( +$rp="%o0"; # BN_ULONG *rp, +$ap="%o1"; # const BN_ULONG *ap, +$bp="%o2"; # const BN_ULONG *bp, +$np="%o3"; # const BN_ULONG *np, +$n0p="%o4"; # const BN_ULONG *n0, +$num="%o5"; # int num); # caller ensures that num is even + # and >=6 +$code.=<<___; +.globl bn_mul_mont_vis3 +.align 32 +bn_mul_mont_vis3: + add %sp, $bias, %g4 ! real top of stack + sll $num, 2, $num ! size in bytes + add $num, 63, %g5 + andn %g5, 63, %g5 ! buffer size rounded up to 64 bytes + add %g5, %g5, %g1 + add %g5, %g1, %g1 ! 3*buffer size + sub %g4, %g1, %g1 + andn %g1, 63, %g1 ! align at 64 byte + sub %g1, $frame, %g1 ! new top of stack + sub %g1, %g4, %g1 + + save %sp, %g1, %sp +___ + +# +-------------------------------+<----- %sp +# . . +# +-------------------------------+<----- aligned at 64 bytes +# | __int64 tmp[0] | +# +-------------------------------+ +# . . +# . . +# +-------------------------------+<----- aligned at 64 bytes +# | __int64 ap[1..0] | converted ap[] +# +-------------------------------+ +# | __int64 np[1..0] | converted np[] +# +-------------------------------+ +# | __int64 ap[3..2] | +# . . +# . . +# +-------------------------------+ +($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5)); +($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$anp)=map("%l$_",(0..7)); +($ovf,$i)=($t0,$t1); +$code.=<<___; + ld [$n0p+0], $t0 ! pull n0[0..1] value + add %sp, $bias+$frame, $tp + ld [$n0p+4], $t1 + add $tp, %g5, $anp + ld [$bp+0], $t2 ! m0=bp[0] + sllx $t1, 32, $n0 + ld [$bp+4], $t3 + or $t0, $n0, $n0 + add $bp, 8, $bp + + ld [$ap+0], $t0 ! ap[0] + sllx $t3, 32, $m0 + ld [$ap+4], $t1 + or $t2, $m0, $m0 + + ld [$ap+8], $t2 ! ap[1] + sllx $t1, 32, $aj + ld [$ap+12], $t3 + or $t0, $aj, $aj + add $ap, 16, $ap + stxa $aj, [$anp]0xe2 ! converted ap[0] + + mulx $aj, $m0, $lo0 ! ap[0]*bp[0] + umulxhi $aj, $m0, $hi0 + + ld [$np+0], $t0 ! np[0] + sllx $t3, 32, $aj + ld [$np+4], $t1 + or $t2, $aj, $aj + + ld [$np+8], $t2 ! np[1] + sllx $t1, 32, $nj + ld [$np+12], $t3 + or $t0, $nj, $nj + add $np, 16, $np + stx $nj, [$anp+8] ! converted np[0] + + mulx $lo0, $n0, $m1 ! "tp[0]"*n0 + stx $aj, [$anp+16] ! converted ap[1] + + mulx $aj, $m0, $alo ! ap[1]*bp[0] + umulxhi $aj, $m0, $aj ! ahi=aj + + mulx $nj, $m1, $lo1 ! np[0]*m1 + umulxhi $nj, $m1, $hi1 + + sllx $t3, 32, $nj + or $t2, $nj, $nj + stx $nj, [$anp+24] ! converted np[1] + add $anp, 32, $anp + + addcc $lo0, $lo1, $lo1 + addxc %g0, $hi1, $hi1 + + mulx $nj, $m1, $nlo ! np[1]*m1 + umulxhi $nj, $m1, $nj ! nhi=nj + + ba .L1st + sub $num, 24, $cnt ! cnt=num-3 + +.align 16 +.L1st: + ld [$ap+0], $t0 ! ap[j] + addcc $alo, $hi0, $lo0 + ld [$ap+4], $t1 + addxc $aj, %g0, $hi0 + + sllx $t1, 32, $aj + add $ap, 8, $ap + or $t0, $aj, $aj + stxa $aj, [$anp]0xe2 ! converted ap[j] + + ld [$np+0], $t2 ! np[j] + addcc $nlo, $hi1, $lo1 + ld [$np+4], $t3 + addxc $nj, %g0, $hi1 ! nhi=nj + + sllx $t3, 32, $nj + add $np, 8, $np + mulx $aj, $m0, $alo ! ap[j]*bp[0] + or $t2, $nj, $nj + umulxhi $aj, $m0, $aj ! ahi=aj + stx $nj, [$anp+8] ! converted np[j] + add $anp, 16, $anp ! anp++ + + mulx $nj, $m1, $nlo ! np[j]*m1 + addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0] + umulxhi $nj, $m1, $nj ! nhi=nj + addxc %g0, $hi1, $hi1 + stxa $lo1, [$tp]0xe2 ! tp[j-1] + add $tp, 8, $tp ! tp++ + + brnz,pt $cnt, .L1st + sub $cnt, 8, $cnt ! j-- +!.L1st + addcc $alo, $hi0, $lo0 + addxc $aj, %g0, $hi0 ! ahi=aj + + addcc $nlo, $hi1, $lo1 + addxc $nj, %g0, $hi1 + addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0] + addxc %g0, $hi1, $hi1 + stxa $lo1, [$tp]0xe2 ! tp[j-1] + add $tp, 8, $tp + + addcc $hi0, $hi1, $hi1 + addxc %g0, %g0, $ovf ! upmost overflow bit + stxa $hi1, [$tp]0xe2 + add $tp, 8, $tp + + ba .Louter + sub $num, 16, $i ! i=num-2 + +.align 16 +.Louter: + ld [$bp+0], $t2 ! m0=bp[i] + ld [$bp+4], $t3 + + sub $anp, $num, $anp ! rewind + sub $tp, $num, $tp + sub $anp, $num, $anp + + add $bp, 8, $bp + sllx $t3, 32, $m0 + ldx [$anp+0], $aj ! ap[0] + or $t2, $m0, $m0 + ldx [$anp+8], $nj ! np[0] + + mulx $aj, $m0, $lo0 ! ap[0]*bp[i] + ldx [$tp], $tj ! tp[0] + umulxhi $aj, $m0, $hi0 + ldx [$anp+16], $aj ! ap[1] + addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0] + mulx $aj, $m0, $alo ! ap[1]*bp[i] + addxc %g0, $hi0, $hi0 + mulx $lo0, $n0, $m1 ! tp[0]*n0 + umulxhi $aj, $m0, $aj ! ahi=aj + mulx $nj, $m1, $lo1 ! np[0]*m1 + umulxhi $nj, $m1, $hi1 + ldx [$anp+24], $nj ! np[1] + add $anp, 32, $anp + addcc $lo1, $lo0, $lo1 + mulx $nj, $m1, $nlo ! np[1]*m1 + addxc %g0, $hi1, $hi1 + umulxhi $nj, $m1, $nj ! nhi=nj + + ba .Linner + sub $num, 24, $cnt ! cnt=num-3 +.align 16 +.Linner: + addcc $alo, $hi0, $lo0 + ldx [$tp+8], $tj ! tp[j] + addxc $aj, %g0, $hi0 ! ahi=aj + ldx [$anp+0], $aj ! ap[j] + addcc $nlo, $hi1, $lo1 + mulx $aj, $m0, $alo ! ap[j]*bp[i] + addxc $nj, %g0, $hi1 ! nhi=nj + ldx [$anp+8], $nj ! np[j] + add $anp, 16, $anp + umulxhi $aj, $m0, $aj ! ahi=aj + addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j] + mulx $nj, $m1, $nlo ! np[j]*m1 + addxc %g0, $hi0, $hi0 + umulxhi $nj, $m1, $nj ! nhi=nj + addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j] + addxc %g0, $hi1, $hi1 + stx $lo1, [$tp] ! tp[j-1] + add $tp, 8, $tp + brnz,pt $cnt, .Linner + sub $cnt, 8, $cnt +!.Linner + ldx [$tp+8], $tj ! tp[j] + addcc $alo, $hi0, $lo0 + addxc $aj, %g0, $hi0 ! ahi=aj + addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j] + addxc %g0, $hi0, $hi0 + + addcc $nlo, $hi1, $lo1 + addxc $nj, %g0, $hi1 ! nhi=nj + addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j] + addxc %g0, $hi1, $hi1 + stx $lo1, [$tp] ! tp[j-1] + + subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc + addxccc $hi1, $hi0, $hi1 + addxc %g0, %g0, $ovf + stx $hi1, [$tp+8] + add $tp, 16, $tp + + brnz,pt $i, .Louter + sub $i, 8, $i + + sub $anp, $num, $anp ! rewind + sub $tp, $num, $tp + sub $anp, $num, $anp + ba .Lsub + subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc + +.align 16 +.Lsub: + ldx [$tp], $tj + add $tp, 8, $tp + ldx [$anp+8], $nj + add $anp, 16, $anp + subccc $tj, $nj, $t2 ! tp[j]-np[j] + srlx $tj, 32, $tj + srlx $nj, 32, $nj + subccc $tj, $nj, $t3 + add $rp, 8, $rp + st $t2, [$rp-4] ! reverse order + st $t3, [$rp-8] + brnz,pt $cnt, .Lsub + sub $cnt, 8, $cnt + + sub $anp, $num, $anp ! rewind + sub $tp, $num, $tp + sub $anp, $num, $anp + sub $rp, $num, $rp + + subc $ovf, %g0, $ovf ! handle upmost overflow bit + and $tp, $ovf, $ap + andn $rp, $ovf, $np + or $np, $ap, $ap ! ap=borrow?tp:rp + ba .Lcopy + sub $num, 8, $cnt + +.align 16 +.Lcopy: ! copy or in-place refresh + ld [$ap+0], $t2 + ld [$ap+4], $t3 + add $ap, 8, $ap + stx %g0, [$tp] ! zap + add $tp, 8, $tp + stx %g0, [$anp] ! zap + stx %g0, [$anp+8] + add $anp, 16, $anp + st $t3, [$rp+0] ! flip order + st $t2, [$rp+4] + add $rp, 8, $rp + brnz $cnt, .Lcopy + sub $cnt, 8, $cnt + + mov 1, %o0 + ret + restore +.type bn_mul_mont_vis3, #function +.size bn_mul_mont_vis3, .-bn_mul_mont_vis3 +.asciz "Montgomery Multiplication for SPARCv9 VIS3, CRYPTOGAMS by " +.align 4 +___ + +# Purpose of these subroutines is to explicitly encode VIS instructions, +# so that one can compile the module without having to specify VIS +# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. +# Idea is to reserve for option to produce "universal" binary and let +# programmer detect if current CPU is VIS capable at run-time. +sub unvis3 { +my ($mnemonic,$rs1,$rs2,$rd)=@_; +my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 ); +my ($ref,$opf); +my %visopf = ( "addxc" => 0x011, + "addxccc" => 0x013, + "umulxhi" => 0x016 ); + + $ref = "$mnemonic\t$rs1,$rs2,$rd"; + + if ($opf=$visopf{$mnemonic}) { + foreach ($rs1,$rs2,$rd) { + return $ref if (!/%([goli])([0-9])/); + $_=$bias{$1}+$2; + } + + return sprintf ".word\t0x%08x !%s", + 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2, + $ref; + } else { + return $ref; + } +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/ + &unvis3($1,$2,$3,$4) + /ge; + + print $_,"\n"; +} + +close STDOUT; diff --git a/crypto/sparcv9cap.c b/crypto/sparcv9cap.c index 149eb5232d..b6bdb8704b 100644 --- a/crypto/sparcv9cap.c +++ b/crypto/sparcv9cap.c @@ -15,15 +15,20 @@ unsigned int OPENSSL_sparcv9cap_P[2]={SPARCV9_TICK_PRIVILEGED,0}; int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num) { + int bn_mul_mont_vis3(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); - if (num>=8 && !(num&1) && - (OPENSSL_sparcv9cap_P[0]&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) == - (SPARCV9_PREFER_FPU|SPARCV9_VIS1)) - return bn_mul_mont_fpu(rp,ap,bp,np,n0,num); - else - return bn_mul_mont_int(rp,ap,bp,np,n0,num); + if (!(num&1) && num>=6) + { + if ((OPENSSL_sparcv9cap_P[0]&SPARCV9_VIS3)) + return bn_mul_mont_vis3(rp,ap,bp,np,n0,num); + else if (num>=8 && + (OPENSSL_sparcv9cap_P[0]&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) == + (SPARCV9_PREFER_FPU|SPARCV9_VIS1)) + return bn_mul_mont_fpu(rp,ap,bp,np,n0,num); + } + return bn_mul_mont_int(rp,ap,bp,np,n0,num); } unsigned long _sparcv9_rdtick(void); @@ -35,7 +40,7 @@ unsigned long _sparcv9_rdcfr(void); void _sparcv9_vis3_probe(void); unsigned long _sparcv9_random(void); size_t _sparcv9_vis1_instrument_bus(unsigned int *,size_t); -size_t _sparcv8_vis1_instrument_bus2(unsigned int *,size_t,size_t); +size_t _sparcv9_vis1_instrument_bus2(unsigned int *,size_t,size_t); unsigned long OPENSSL_rdtsc(void) { @@ -51,7 +56,7 @@ unsigned long OPENSSL_rdtsc(void) size_t OPENSSL_instrument_bus(unsigned int *out,size_t cnt) { - if (OPENSSL_sparcv9cap_P[0]&(SPARCV9_TICK_PRIVILEGED|SPARCV9_BLK) == + if ((OPENSSL_sparcv9cap_P[0]&(SPARCV9_TICK_PRIVILEGED|SPARCV9_BLK)) == SPARCV9_BLK) return _sparcv9_vis1_instrument_bus(out,cnt); else @@ -60,7 +65,7 @@ size_t OPENSSL_instrument_bus(unsigned int *out,size_t cnt) size_t OPENSSL_instrument_bus2(unsigned int *out,size_t cnt,size_t max) { - if (OPENSSL_sparcv9cap_P[0]&(SPARCV9_TICK_PRIVILEGED|SPARCV9_BLK) == + if ((OPENSSL_sparcv9cap_P[0]&(SPARCV9_TICK_PRIVILEGED|SPARCV9_BLK)) == SPARCV9_BLK) return _sparcv9_vis1_instrument_bus2(out,cnt,max); else -- 2.40.0