From e822c756b66024d49ab936bf77b745206660fcd2 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Mon, 29 Nov 2010 20:52:43 +0000 Subject: [PATCH] s390x assembler pack: adapt for -m31 build, see commentary in Configure for more details. --- Configure | 18 +++- config | 13 ++- crypto/aes/asm/aes-s390x.pl | 167 ++++++++++++++++++-------------- crypto/bn/asm/s390x-mont.pl | 99 ++++++++++++++----- crypto/modes/asm/ghash-s390x.pl | 29 +++++- crypto/rc4/asm/rc4-s390x.pl | 35 ++++++- crypto/s390xcpuid.S | 3 + crypto/sha/asm/sha1-s390x.pl | 46 ++++++--- crypto/sha/asm/sha512-s390x.pl | 59 +++++++---- 9 files changed, 329 insertions(+), 140 deletions(-) diff --git a/Configure b/Configure index cb4cec6197..e2efb5b864 100755 --- a/Configure +++ b/Configure @@ -134,7 +134,7 @@ my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o::::::::::::void"; my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o:void"; my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o:::::::"; my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o:::::::"; -my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:void"; +my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o"; my $armv4_asm=":bn_asm.o armv4-mont.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o:void"; my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:32"; my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:64"; @@ -356,7 +356,21 @@ my %table=( 
"linux-ia64-ecc","ecc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-ia64-icc","icc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-x86_64", "gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64", -"linux-s390x", "gcc:-m64 -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:${s390x_asm}:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64", +"linux64-s390x", "gcc:-m64 -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:${s390x_asm}:64:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64", +#### So called "highgprs" target for z/Architecture CPUs +# "Highgprs" is kernel feature first implemented in Linux 2.6.32, see +# /proc/cpuinfo. The idea is to preserve most significant bits of +# general purpose registers not only upon 32-bit process context +# switch, but even on asynchronous signal delivery to such process. +# This makes it possible to deploy 64-bit instructions even in legacy +# application context and achieve better [or should we say adequate] +# performance. The build is binary compatible with linux-generic32, +# and the idea is to be able to install the resulting libcrypto.so +# alongside generic one, e.g. as /lib/highgprs/libcrypto.so.x.y, for +# ldconfig and run-time linker to autodiscover. Unfortunately it +# doesn't work just yet, because of couple of bugs in glibc +# sysdep/s390/dl-procinfo.c affecting ldconfig and ld.so.1... 
+"linux32-s390x", "gcc:-m31 -Wa,-mzarch -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:s390xcap.o s390xcpuid.o:bn_asm.o s390x-mont.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:31:dlfcn:linux-shared:-fPIC:-m31:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::/highgprs", #### SPARC Linux setups # Ray Miller has patiently # assisted with debugging of following two configs. diff --git a/config b/config index 965884a627..bcc725eb18 100755 --- a/config +++ b/config @@ -629,7 +629,18 @@ case "$GUESSOS" in sh*-*-linux2) OUT="linux-generic32"; options="$options -DL_ENDIAN" ;; m68k*-*-linux2) OUT="linux-generic32"; options="$options -DB_ENDIAN" ;; s390-*-linux2) OUT="linux-generic32"; options="$options -DB_ENDIAN" ;; - s390x-*-linux2) OUT="linux-s390x" ;; + s390x-*-linux2) + # To be uncommented when glibc bug is fixed, see Configure... + #if egrep -e '^features.* highgprs' /proc/cpuinfo >/dev/null ; then + # echo "WARNING! If you wish to build \"highgprs\" 32-bit library, then you" + # echo " have to invoke './Configure linux32-s390x' *manually*." + # if [ "$TEST" = "false" -a -t -1 ]; then + # echo " You have about 5 seconds to press Ctrl-C to abort." + # (trap "stty `stty -g`" 2 0; stty -icanon min 0 time 50; read waste) <&1 + # fi + #fi + OUT="linux64-s390x" + ;; x86_64-*-linux?) OUT="linux-x86_64" ;; *86-*-linux2) OUT="linux-elf" if [ "$GCCVER" -gt 28 ]; then diff --git a/crypto/aes/asm/aes-s390x.pl b/crypto/aes/asm/aes-s390x.pl index 4be64e3e51..db963c9df0 100644 --- a/crypto/aes/asm/aes-s390x.pl +++ b/crypto/aes/asm/aes-s390x.pl @@ -60,6 +60,26 @@ # maximum, but *on average* it would be as much as ~98%. Meaning that # worst case is unlike, it's like hitting ravine on plateau. +# November 2010. +# +# Adapt for -m31 build. 
If kernel supports what's called "highgprs" +# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit +# instructions and achieve "64-bit" performance even in 31-bit legacy +# application context. The feature is not specific to any particular +# processor, as long as it's "z-CPU". Latter implies that the code +# remains z/Architecture specific. On z990 it was measured to perform +# 2x better than code generated by gcc 4.3. + +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; @@ -82,6 +102,8 @@ $rounds="%r13"; $ra="%r14"; $sp="%r15"; +$stdframe=16*$SIZE_T+4*8; + sub _data_word() { my $i; while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; } @@ -223,7 +245,7 @@ $code.=<<___ if (!$softonly); .Lesoft: ___ $code.=<<___; - stmg %r3,$ra,24($sp) + stm${g} %r3,$ra,3*$SIZE_T($sp) llgf $s0,0($inp) llgf $s1,4($inp) @@ -233,20 +255,20 @@ $code.=<<___; larl $tbl,AES_Te bras $ra,_s390x_AES_encrypt - lg $out,24($sp) + l${g} $out,3*$SIZE_T($sp) st $s0,0($out) st $s1,4($out) st $s2,8($out) st $s3,12($out) - lmg %r6,$ra,48($sp) + lm${g} %r6,$ra,6*$SIZE_T($sp) br $ra .size AES_encrypt,.-AES_encrypt .type _s390x_AES_encrypt,\@function .align 16 _s390x_AES_encrypt: - stg $ra,152($sp) + st${g} $ra,`$stdframe-$SIZE_T`($sp) x $s0,0($key) x $s1,4($key) x $s2,8($key) @@ -410,7 +432,7 @@ _s390x_AES_encrypt: or $s2,$i3 or $s3,$t3 - lg $ra,152($sp) + l${g} $ra,`$stdframe-$SIZE_T`($sp) xr $s0,$t0 xr $s1,$t2 x $s2,24($key) @@ -549,7 +571,7 @@ $code.=<<___ if (!$softonly); .Ldsoft: ___ $code.=<<___; - stmg %r3,$ra,24($sp) + stm${g} %r3,$ra,3*$SIZE_T($sp) llgf $s0,0($inp) llgf $s1,4($inp) @@ -559,20 +581,20 @@ $code.=<<___; larl $tbl,AES_Td bras $ra,_s390x_AES_decrypt - lg $out,24($sp) + l${g} $out,3*$SIZE_T($sp) st $s0,0($out) st $s1,4($out) st $s2,8($out) st $s3,12($out) - lmg %r6,$ra,48($sp) + lm${g} %r6,$ra,6*$SIZE_T($sp) 
br $ra .size AES_decrypt,.-AES_decrypt .type _s390x_AES_decrypt,\@function .align 16 _s390x_AES_decrypt: - stg $ra,152($sp) + st${g} $ra,`$stdframe-$SIZE_T`($sp) x $s0,0($key) x $s1,4($key) x $s2,8($key) @@ -716,7 +738,7 @@ _s390x_AES_decrypt: nr $i1,$mask nr $i2,$mask - lg $ra,152($sp) + l${g} $ra,`$stdframe-$SIZE_T`($sp) or $s1,$t1 l $t0,16($key) l $t1,20($key) @@ -750,9 +772,9 @@ $code.=<<___; .align 16 AES_set_encrypt_key: lghi $t0,0 - clgr $inp,$t0 + cl${g}r $inp,$t0 je .Lminus1 - clgr $key,$t0 + cl${g}r $key,$t0 je .Lminus1 lghi $t0,128 @@ -810,7 +832,7 @@ ___ $code.=<<___; .align 16 .Lekey_internal: - stmg %r6,%r13,48($sp) # all non-volatile regs + stm${g} %r6,%r13,6*$SIZE_T($sp) # all non-volatile regs larl $tbl,AES_Te+2048 @@ -871,7 +893,7 @@ $code.=<<___; la $t3,4($t3) # i++ brct $rounds,.L128_loop lghi %r2,0 - lmg %r6,%r13,48($sp) + lm${g} %r6,%r13,6*$SIZE_T($sp) br $ra .align 16 @@ -919,7 +941,7 @@ $code.=<<___; st $s3,36($key) brct $rounds,.L192_continue lghi %r2,0 - lmg %r6,%r13,48($sp) + lm${g} %r6,%r13,6*$SIZE_T($sp) br $ra .align 16 @@ -981,7 +1003,7 @@ $code.=<<___; st $s3,44($key) brct $rounds,.L256_continue lghi %r2,0 - lmg %r6,%r13,48($sp) + lm${g} %r6,%r13,6*$SIZE_T($sp) br $ra .align 16 @@ -1032,11 +1054,11 @@ $code.=<<___; .type AES_set_decrypt_key,\@function .align 16 AES_set_decrypt_key: - stg $key,32($sp) # I rely on AES_set_encrypt_key to - stg $ra,112($sp) # save non-volatile registers! + st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to + st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers! 
bras $ra,AES_set_encrypt_key - lg $key,32($sp) - lg $ra,112($sp) + l${g} $key,4*$SIZE_T($sp) + l${g} $ra,14*$SIZE_T($sp) ltgr %r2,%r2 bnzr $ra ___ @@ -1051,11 +1073,11 @@ $code.=<<___ if (!$softonly); .align 16 .Ldkey_internal: - stg $key,32($sp) - stg $ra,40($sp) + st${g} $key,4*$SIZE_T($sp) + st${g} $ra,14*$SIZE_T($sp) bras $ra,.Lekey_internal - lg $key,32($sp) - lg $ra,40($sp) + l${g} $key,4*$SIZE_T($sp) + l${g} $ra,14*$SIZE_T($sp) ___ $code.=<<___; @@ -1136,7 +1158,7 @@ $code.=<<___; la $key,4($key) brct $rounds,.Lmix - lmg %r6,%r13,48($sp)# as was saved by AES_set_encrypt_key! + lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key! lghi %r2,0 br $ra .size AES_set_decrypt_key,.-AES_set_decrypt_key @@ -1176,7 +1198,7 @@ $code.=<<___ if (!$softonly); l %r0,240($key) # load kmc code lghi $key,15 # res=len%16, len-=res; ngr $key,$len - slgr $len,$key + sl${g}r $len,$key la %r1,16($sp) # parameter block - ivec || key jz .Lkmc_truncated .long 0xb92f0042 # kmc %r4,%r2 @@ -1194,34 +1216,34 @@ $code.=<<___ if (!$softonly); tmll %r0,0x80 jnz .Lkmc_truncated_dec lghi %r1,0 - stg %r1,128($sp) - stg %r1,136($sp) + stg %r1,16*$SIZE_T($sp) + stg %r1,16*$SIZE_T+8($sp) bras %r1,1f - mvc 128(1,$sp),0($inp) + mvc 16*$SIZE_T(1,$sp),0($inp) 1: ex $key,0(%r1) la %r1,16($sp) # restore parameter block - la $inp,128($sp) + la $inp,16*$SIZE_T($sp) lghi $len,16 .long 0xb92f0042 # kmc %r4,%r2 j .Lkmc_done .align 16 .Lkmc_truncated_dec: - stg $out,64($sp) - la $out,128($sp) + st${g} $out,4*$SIZE_T($sp) + la $out,16*$SIZE_T($sp) lghi $len,16 .long 0xb92f0042 # kmc %r4,%r2 - lg $out,64($sp) + l${g} $out,4*$SIZE_T($sp) bras %r1,2f - mvc 0(1,$out),128($sp) + mvc 0(1,$out),16*$SIZE_T($sp) 2: ex $key,0(%r1) j .Lkmc_done .align 16 .Lcbc_software: ___ $code.=<<___; - stmg $key,$ra,40($sp) + stm${g} $key,$ra,5*$SIZE_T($sp) lhi %r0,0 - cl %r0,164($sp) + cl %r0,`$stdframe+$SIZE_T-4`($sp) je .Lcbc_decrypt larl $tbl,AES_Te @@ -1232,10 +1254,10 @@ $code.=<<___; llgf $s3,12($ivp) lghi 
$t0,16 - slgr $len,$t0 + sl${g}r $len,$t0 brc 4,.Lcbc_enc_tail # if borrow .Lcbc_enc_loop: - stmg $inp,$out,16($sp) + stm${g} $inp,$out,2*$SIZE_T($sp) x $s0,0($inp) x $s1,4($inp) x $s2,8($inp) @@ -1244,7 +1266,7 @@ $code.=<<___; bras $ra,_s390x_AES_encrypt - lmg $inp,$key,16($sp) + lm${g} $inp,$key,2*$SIZE_T($sp) st $s0,0($out) st $s1,4($out) st $s2,8($out) @@ -1253,33 +1275,33 @@ $code.=<<___; la $inp,16($inp) la $out,16($out) lghi $t0,16 - ltgr $len,$len + lt${g}r $len,$len jz .Lcbc_enc_done - slgr $len,$t0 + sl${g}r $len,$t0 brc 4,.Lcbc_enc_tail # if borrow j .Lcbc_enc_loop .align 16 .Lcbc_enc_done: - lg $ivp,48($sp) + l${g} $ivp,6*$SIZE_T($sp) st $s0,0($ivp) st $s1,4($ivp) st $s2,8($ivp) st $s3,12($ivp) - lmg %r7,$ra,56($sp) + lm${g} %r7,$ra,7*$SIZE_T($sp) br $ra .align 16 .Lcbc_enc_tail: aghi $len,15 lghi $t0,0 - stg $t0,128($sp) - stg $t0,136($sp) + stg $t0,16*$SIZE_T($sp) + stg $t0,16*$SIZE_T+8($sp) bras $t1,3f - mvc 128(1,$sp),0($inp) + mvc 16*$SIZE_T(1,$sp),0($inp) 3: ex $len,0($t1) lghi $len,0 - la $inp,128($sp) + la $inp,16*$SIZE_T($sp) j .Lcbc_enc_loop .align 16 @@ -1288,10 +1310,10 @@ $code.=<<___; lg $t0,0($ivp) lg $t1,8($ivp) - stmg $t0,$t1,128($sp) + stmg $t0,$t1,16*$SIZE_T($sp) .Lcbc_dec_loop: - stmg $inp,$out,16($sp) + stm${g} $inp,$out,2*$SIZE_T($sp) llgf $s0,0($inp) llgf $s1,4($inp) llgf $s2,8($inp) @@ -1300,7 +1322,7 @@ $code.=<<___; bras $ra,_s390x_AES_decrypt - lmg $inp,$key,16($sp) + lm${g} $inp,$key,2*$SIZE_T($sp) sllg $s0,$s0,32 sllg $s2,$s2,32 lr $s0,$s1 @@ -1308,15 +1330,15 @@ $code.=<<___; lg $t0,0($inp) lg $t1,8($inp) - xg $s0,128($sp) - xg $s2,136($sp) + xg $s0,16*$SIZE_T($sp) + xg $s2,16*$SIZE_T+8($sp) lghi $s1,16 - slgr $len,$s1 + sl${g}r $len,$s1 brc 4,.Lcbc_dec_tail # if borrow brc 2,.Lcbc_dec_done # if zero stg $s0,0($out) stg $s2,8($out) - stmg $t0,$t1,128($sp) + stmg $t0,$t1,16*$SIZE_T($sp) la $inp,16($inp) la $out,16($out) @@ -1326,7 +1348,7 @@ $code.=<<___; stg $s0,0($out) stg $s2,8($out) .Lcbc_dec_exit: - lmg 
$ivp,$ra,48($sp) + lm${g} %r6,$ra,6*$SIZE_T($sp) stmg $t0,$t1,0($ivp) br $ra @@ -1334,10 +1356,10 @@ $code.=<<___; .align 16 .Lcbc_dec_tail: aghi $len,15 - stg $s0,128($sp) - stg $s2,136($sp) + stg $s0,16*$SIZE_T($sp) + stg $s2,16*$SIZE_T+8($sp) bras $s1,4f - mvc 0(1,$out),128($sp) + mvc 0(1,$out),16*$SIZE_T($sp) 4: ex $len,0($s1) j .Lcbc_dec_exit .size AES_cbc_encrypt,.-AES_cbc_encrypt @@ -1359,6 +1381,7 @@ $code.=<<___; .type AES_ctr32_encrypt,\@function .align 16 AES_ctr32_encrypt: + llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case ___ $code.=<<___ if (!$softonly); l %r0,240($key) @@ -1366,7 +1389,7 @@ $code.=<<___ if (!$softonly); clr %r0,%r1 jl .Lctr32_software - stmg %r6,$s3,48($sp) + stm${g} %r6,$s3,6*$SIZE_T($sp) slgr $out,$inp la %r1,0($key) # %r1 is permanent copy of $key @@ -1388,14 +1411,14 @@ $code.=<<___ if (!$softonly); la $sp,1024($s0) # alloca srlg $fp,$fp,4 # convert bytes to blocks, minimum 16 - stg $s2,0($sp) # back-chain - stg $fp,8($sp) + st${g} $s2,0($sp) # back-chain + st${g} $fp,$SIZE_T($sp) slgr $len,$fp brc 1,.Lctr32_hw_loop # not zero, no borrow algr $fp,$len # input is shorter than allocated buffer lghi $len,0 - stg $fp,8($sp) + st${g} $fp,$SIZE_T($sp) .Lctr32_hw_loop: la $s2,16($sp) @@ -1432,8 +1455,8 @@ $code.=<<___ if (!$softonly); lghi $len,0 brc 4+1,.Lctr32_hw_loop # not zero - lg $s0,0($sp) - lg $s1,8($sp) + l${g} $s0,0($sp) + l${g} $s1,$SIZE_T($sp) la $s2,16($sp) .Lctr32_hw_zap: stg $s0,0($s2) @@ -1442,30 +1465,30 @@ $code.=<<___ if (!$softonly); brct $s1,.Lctr32_hw_zap la $sp,0($s0) - lmg %r6,$s3,48($sp) + lm${g} %r6,$s3,6*$SIZE_T($sp) br $ra .align 16 .Lctr32_software: ___ $code.=<<___; - stmg $key,$ra,40($sp) - slgr $out,$inp + stm${g} $key,$ra,5*$SIZE_T($sp) + sl${g}r $out,$inp larl $tbl,AES_Te llgf $t1,12($ivp) .Lctr32_loop: - stmg $inp,$len,16($sp) + stm${g} $inp,$len,2*$SIZE_T($sp) llgf $s0,0($ivp) llgf $s1,4($ivp) llgf $s2,8($ivp) lgr $s3,$t1 - st $t1,128($sp) + st $t1,16*$SIZE_T($sp) lgr %r4,$key bras 
$ra,_s390x_AES_encrypt - lmg $inp,$ivp,16($sp) - llgf $t1,128($sp) + lm${g} $inp,$ivp,2*$SIZE_T($sp) + llgf $t1,16*$SIZE_T($sp) x $s0,0($inp) x $s1,4($inp) x $s2,8($inp) @@ -1479,7 +1502,7 @@ $code.=<<___; ahi $t1,1 # 32-bit increment brct $len,.Lctr32_loop - lmg %r6,$ra,48($sp) + lm${g} %r6,$ra,6*$SIZE_T($sp) br $ra .size AES_ctr32_encrypt,.-AES_ctr32_encrypt ___ diff --git a/crypto/bn/asm/s390x-mont.pl b/crypto/bn/asm/s390x-mont.pl index 502fa2e01f..0c5f0638e1 100644 --- a/crypto/bn/asm/s390x-mont.pl +++ b/crypto/bn/asm/s390x-mont.pl @@ -32,9 +32,33 @@ # Reschedule to minimize/avoid Address Generation Interlock hazard, # make inner loops counter-based. +# November 2010. +# +# Adapt for -m31 build. If kernel supports what's called "highgprs" +# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit +# instructions and achieve "64-bit" performance even in 31-bit legacy +# application context. The feature is not specific to any particular +# processor, as long as it's "z-CPU". Latter implies that the code +# remains z/Architecture specific. Compatibility with 32-bit BN_ULONG +# is achieved by swapping words after 64-bit loads, follow _dswap-s. +# On z990 it was measured to perform 2.6-2.2 times better, less for +# longer keys... 
+ +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; +$stdframe=16*$SIZE_T+4*8; + $mn0="%r0"; $num="%r1"; @@ -63,34 +87,44 @@ $code.=<<___; .globl bn_mul_mont .type bn_mul_mont,\@function bn_mul_mont: - lgf $num,164($sp) # pull $num - sla $num,3 # $num to enumerate bytes + lgf $num,`$stdframe+$SIZE_T-4`($sp) # pull $num + sla $num,`log($SIZE_T)/log(2)` # $num to enumerate bytes la $bp,0($num,$bp) - stg %r2,16($sp) + st${g} %r2,2*$SIZE_T($sp) cghi $num,16 # lghi %r2,0 # blr %r14 # if($num<16) return 0; +___ +$code.=<<___ if ($flavour =~ /3[12]/); + tmll $num,4 + bnzr %r14 # if ($num&1) return 0; +___ +$code.=<<___ if ($flavour !~ /3[12]/); cghi $num,128 # bhr %r14 # if($num>128) return 0; +___ +$code.=<<___; + stm${g} %r3,%r15,3*$SIZE_T($sp) - stmg %r3,%r15,24($sp) - - lghi $rp,-160-8 # leave room for carry bit + lghi $rp,-$stdframe-8 # leave room for carry bit lcgr $j,$num # -$num lgr %r0,$sp la $rp,0($rp,$sp) la $sp,0($j,$rp) # alloca - stg %r0,0($sp) # back chain + st${g} %r0,0($sp) # back chain sra $num,3 # restore $num la $bp,0($j,$bp) # restore $bp ahi $num,-1 # adjust $num for inner loop lg $n0,0($n0) # pull n0 + _dswap $n0 lg $bi,0($bp) + _dswap $bi lg $alo,0($ap) + _dswap $alo mlgr $ahi,$bi # ap[0]*bp[0] lgr $AHI,$ahi @@ -98,6 +132,7 @@ bn_mul_mont: msgr $mn0,$n0 lg $nlo,0($np) # + _dswap $nlo mlgr $nhi,$mn0 # np[0]*m1 algr $nlo,$alo # +="tp[0]" lghi $NHI,0 @@ -109,12 +144,14 @@ bn_mul_mont: .align 16 .L1st: lg $alo,0($j,$ap) + _dswap $alo mlgr $ahi,$bi # ap[j]*bp[0] algr $alo,$AHI lghi $AHI,0 alcgr $AHI,$ahi lg $nlo,0($j,$np) + _dswap $nlo mlgr $nhi,$mn0 # np[j]*m1 algr $nlo,$NHI lghi $NHI,0 @@ -122,22 +159,24 @@ bn_mul_mont: algr $nlo,$alo alcgr $NHI,$nhi - stg $nlo,160-8($j,$sp) # tp[j-1]= + stg $nlo,$stdframe-8($j,$sp) # tp[j-1]= la $j,8($j) # j++ brct $count,.L1st algr $NHI,$AHI lghi $AHI,0 alcgr $AHI,$AHI # 
upmost overflow bit - stg $NHI,160-8($j,$sp) - stg $AHI,160($j,$sp) + stg $NHI,$stdframe-8($j,$sp) + stg $AHI,$stdframe($j,$sp) la $bp,8($bp) # bp++ .Louter: lg $bi,0($bp) # bp[i] + _dswap $bi lg $alo,0($ap) + _dswap $alo mlgr $ahi,$bi # ap[0]*bp[i] - alg $alo,160($sp) # +=tp[0] + alg $alo,$stdframe($sp) # +=tp[0] lghi $AHI,0 alcgr $AHI,$ahi @@ -145,6 +184,7 @@ bn_mul_mont: msgr $mn0,$n0 # tp[0]*n0 lg $nlo,0($np) # np[0] + _dswap $nlo mlgr $nhi,$mn0 # np[0]*m1 algr $nlo,$alo # +="tp[0]" lghi $NHI,0 @@ -156,14 +196,16 @@ bn_mul_mont: .align 16 .Linner: lg $alo,0($j,$ap) + _dswap $alo mlgr $ahi,$bi # ap[j]*bp[i] algr $alo,$AHI lghi $AHI,0 alcgr $ahi,$AHI - alg $alo,160($j,$sp)# +=tp[j] + alg $alo,$stdframe($j,$sp)# +=tp[j] alcgr $AHI,$ahi lg $nlo,0($j,$np) + _dswap $nlo mlgr $nhi,$mn0 # np[j]*m1 algr $nlo,$NHI lghi $NHI,0 @@ -171,31 +213,33 @@ bn_mul_mont: algr $nlo,$alo # +="tp[j]" alcgr $NHI,$nhi - stg $nlo,160-8($j,$sp) # tp[j-1]= + stg $nlo,$stdframe-8($j,$sp) # tp[j-1]= la $j,8($j) # j++ brct $count,.Linner algr $NHI,$AHI lghi $AHI,0 alcgr $AHI,$AHI - alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit + alg $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit lghi $ahi,0 alcgr $AHI,$ahi # new upmost overflow bit - stg $NHI,160-8($j,$sp) - stg $AHI,160($j,$sp) + stg $NHI,$stdframe-8($j,$sp) + stg $AHI,$stdframe($j,$sp) la $bp,8($bp) # bp++ - clg $bp,160+8+32($j,$sp) # compare to &bp[num] + cl${g} $bp,`$stdframe+8+4*$SIZE_T`($j,$sp) # compare to &bp[num] jne .Louter - lg $rp,160+8+16($j,$sp) # reincarnate rp - la $ap,160($sp) + l${g} $rp,`$stdframe+8+2*$SIZE_T`($j,$sp) # reincarnate rp + la $ap,$stdframe($sp) ahi $num,1 # restore $num, incidentally clears "borrow" la $j,0(%r0) lr $count,$num .Lsub: lg $alo,0($j,$ap) - slbg $alo,0($j,$np) + lg $nlo,0($j,$np) + _dswap $nlo + slbgr $alo,$nlo stg $alo,0($j,$rp) la $j,8($j) brct $count,.Lsub @@ -210,19 +254,24 @@ bn_mul_mont: la $j,0(%r0) lgr $count,$num -.Lcopy: lg $alo,0($j,$ap) # copy or 
in-place refresh - stg $j,160($j,$sp) # zap tp +.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh + _dswap $alo + stg $j,$stdframe($j,$sp) # zap tp stg $alo,0($j,$rp) la $j,8($j) brct $count,.Lcopy - la %r1,160+8+48($j,$sp) - lmg %r6,%r15,0(%r1) + la %r1,`$stdframe+8+6*$SIZE_T`($j,$sp) + lm${g} %r6,%r15,0(%r1) lghi %r2,1 # signal "processed" br %r14 .size bn_mul_mont,.-bn_mul_mont .string "Montgomery Multiplication for s390x, CRYPTOGAMS by " ___ -print $code; +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e; + print $_,"\n"; +} close STDOUT; diff --git a/crypto/modes/asm/ghash-s390x.pl b/crypto/modes/asm/ghash-s390x.pl index d7689de541..16ad034fc1 100644 --- a/crypto/modes/asm/ghash-s390x.pl +++ b/crypto/modes/asm/ghash-s390x.pl @@ -18,6 +18,26 @@ # and the result should be close to 12. In the lack of instruction- # level profiling data it's impossible to tell why... +# November 2010. +# +# Adapt for -m31 build. If kernel supports what's called "highgprs" +# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit +# instructions and achieve "64-bit" performance even in 31-bit legacy +# application context. The feature is not specific to any particular +# processor, as long as it's "z-CPU". Latter implies that the code +# remains z/Architecture specific. On z990 it was measured to perform +# 2.8x better than 32-bit code generated by gcc 4.3. 
+ +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; @@ -74,7 +94,7 @@ $code.=<<___ if(!$softonly); .Lsoft_gmult: ___ $code.=<<___; - stmg %r6,%r14,48($sp) + stm${g} %r6,%r14,6*$SIZE_T($sp) aghi $Xi,-1 lghi $len,1 @@ -109,8 +129,11 @@ $code.=<<___ if(!$softonly); .align 32 .Lsoft_ghash: ___ +$code.=<<___ if ($flavour =~ /3[12]/); + llgfr $len,$len +___ $code.=<<___; - stmg %r6,%r14,48($sp) + stm${g} %r6,%r14,6*$SIZE_T($sp) aghi $Xi,-1 srlg $len,$len,4 @@ -209,7 +232,7 @@ $code.=<<___; xgr $Zhi,$tmp stg $Zlo,8+1($Xi) stg $Zhi,0+1($Xi) - lmg %r6,%r14,48($sp) + lm${g} %r6,%r14,6*$SIZE_T($sp) br %r14 .type gcm_ghash_4bit,\@function .size gcm_ghash_4bit,(.-gcm_ghash_4bit) diff --git a/crypto/rc4/asm/rc4-s390x.pl b/crypto/rc4/asm/rc4-s390x.pl index f26c515e78..1aa754820c 100644 --- a/crypto/rc4/asm/rc4-s390x.pl +++ b/crypto/rc4/asm/rc4-s390x.pl @@ -13,6 +13,26 @@ # "cluster" Address Generation Interlocks, so that one pipeline stall # resolves several dependencies. +# November 2010. +# +# Adapt for -m31 build. If kernel supports what's called "highgprs" +# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit +# instructions and achieve "64-bit" performance even in 31-bit legacy +# application context. The feature is not specific to any particular +# processor, as long as it's "z-CPU". Latter implies that the code +# remains z/Architecture specific. On z990 it was measured to perform +# 50% better than code generated by gcc 4.3. 
+ +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; @@ -42,7 +62,12 @@ $code.=<<___; .type RC4,\@function .align 64 RC4: - stmg %r6,%r11,48($sp) + stm${g} %r6,%r11,6*$SIZE_T($sp) +___ +$code.=<<___ if ($flavour =~ /3[12]/); + llgfr $len,$len +___ +$code.=<<___; llgc $XX[0],0($key) llgc $YY,1($key) la $XX[0],1($XX[0]) @@ -93,7 +118,7 @@ $code.=<<___; xgr $acc,$TX[1] stg $acc,0($out) la $out,8($out) - brct $cnt,.Loop8 + brctg $cnt,.Loop8 .Lshort: lghi $acc,7 @@ -125,7 +150,7 @@ $code.=<<___; ahi $XX[0],-1 stc $XX[0],0($key) stc $YY,1($key) - lmg %r6,%r11,48($sp) + lm${g} %r6,%r11,6*$SIZE_T($sp) br $rp .size RC4,.-RC4 .string "RC4 for s390x, CRYPTOGAMS by " @@ -150,7 +175,7 @@ $code.=<<___; .type RC4_set_key,\@function .align 64 RC4_set_key: - stmg %r6,%r8,48($sp) + stm${g} %r6,%r8,6*$SIZE_T($sp) lhi $cnt,256 la $idx,0(%r0) sth $idx,0($key) @@ -183,7 +208,7 @@ RC4_set_key: la $iinp,0(%r0) j .L2ndloop .Ldone: - lmg %r6,%r8,48($sp) + lm${g} %r6,%r8,6*$SIZE_T($sp) br $rp .size RC4_set_key,.-RC4_set_key diff --git a/crypto/s390xcpuid.S b/crypto/s390xcpuid.S index 82312d8b4e..06815347e6 100644 --- a/crypto/s390xcpuid.S +++ b/crypto/s390xcpuid.S @@ -62,6 +62,9 @@ OPENSSL_wipe_cpu: .type OPENSSL_cleanse,@function .align 16 OPENSSL_cleanse: +#if !defined(__s390x__) && !defined(__s390x) + llgfr %r3,%r3 +#endif lghi %r4,15 lghi %r0,0 clgr %r3,%r4 diff --git a/crypto/sha/asm/sha1-s390x.pl b/crypto/sha/asm/sha1-s390x.pl index 0e38f8e36d..9193dda45e 100644 --- a/crypto/sha/asm/sha1-s390x.pl +++ b/crypto/sha/asm/sha1-s390x.pl @@ -21,8 +21,27 @@ # instructions to favour dual-issue z10 pipeline. On z10 hardware is # "only" ~2.3x faster than software. +# November 2010. +# +# Adapt for -m31 build. 
If kernel supports what's called "highgprs" +# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit +# instructions and achieve "64-bit" performance even in 31-bit legacy +# application context. The feature is not specific to any particular +# processor, as long as it's "z-CPU". Latter implies that the code +# remains z/Architecture specific. + $kimdfunc=1; # magic function code for kimd instruction +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; @@ -42,13 +61,14 @@ $t1="%r11"; @X=("%r12","%r13","%r14"); $sp="%r15"; -$frame=160+16*4; +$stdframe=16*$SIZE_T+4*8; +$frame=$stdframe+16*4; sub Xupdate { my $i=shift; $code.=<<___ if ($i==15); - lg $prefetch,160($sp) ### Xupdate(16) warm-up + lg $prefetch,$stdframe($sp) ### Xupdate(16) warm-up lr $X[0],$X[2] ___ return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle @@ -58,8 +78,8 @@ $code.=<<___ if ($i<16); ___ $code.=<<___ if ($i>=16); xgr $X[0],$prefetch ### Xupdate($i) - lg $prefetch,`160+4*(($i+2)%16)`($sp) - xg $X[0],`160+4*(($i+8)%16)`($sp) + lg $prefetch,`$stdframe+4*(($i+2)%16)`($sp) + xg $X[0],`$stdframe+4*(($i+8)%16)`($sp) xgr $X[0],$prefetch rll $X[0],$X[0],1 rllg $X[1],$X[0],32 @@ -68,7 +88,7 @@ $code.=<<___ if ($i>=16); lr $X[2],$X[1] # feedback ___ $code.=<<___ if ($i<=70); - stg $X[0],`160+4*($i%16)`($sp) + stg $X[0],`$stdframe+4*($i%16)`($sp) ___ unshift(@X,pop(@X)); } @@ -148,9 +168,9 @@ $code.=<<___ if ($kimdfunc); tmhl %r0,0x4000 # check for message-security assist jz .Lsoftware lghi %r0,0 - la %r1,16($sp) + la %r1,`2*$SIZE_T`($sp) .long 0xb93e0002 # kimd %r0,%r2 - lg %r0,16($sp) + lg %r0,`2*$SIZE_T`($sp) tmhh %r0,`0x8000>>$kimdfunc` jz .Lsoftware lghi %r0,$kimdfunc @@ -165,11 +185,11 @@ $code.=<<___ if ($kimdfunc); ___ $code.=<<___; lghi %r1,-$frame - stg $ctx,16($sp) - stmg %r6,%r15,48($sp) + st${g} $ctx,`2*$SIZE_T`($sp) + stm${g} 
%r6,%r15,`6*$SIZE_T`($sp) lgr %r0,$sp la $sp,0(%r1,$sp) - stg %r0,0($sp) + st${g} %r0,0($sp) larl $t0,Ktable llgf $A,0($ctx) @@ -199,7 +219,7 @@ ___ for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } $code.=<<___; - lg $ctx,`$frame+16`($sp) + l${g} $ctx,`$frame+2*$SIZE_T`($sp) la $inp,64($inp) al $A,0($ctx) al $B,4($ctx) @@ -211,9 +231,9 @@ $code.=<<___; st $C,8($ctx) st $D,12($ctx) st $E,16($ctx) - brct $len,.Lloop + brct${g} $len,.Lloop - lmg %r6,%r15,`$frame+48`($sp) + lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp) br %r14 .size sha1_block_data_order,.-sha1_block_data_order .string "SHA1 block transform for s390x, CRYPTOGAMS by " diff --git a/crypto/sha/asm/sha512-s390x.pl b/crypto/sha/asm/sha512-s390x.pl index 3a358a4860..079a3fc78a 100644 --- a/crypto/sha/asm/sha512-s390x.pl +++ b/crypto/sha/asm/sha512-s390x.pl @@ -26,6 +26,26 @@ # favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster # than software. +# November 2010. +# +# Adapt for -m31 build. If kernel supports what's called "highgprs" +# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit +# instructions and achieve "64-bit" performance even in 31-bit legacy +# application context. The feature is not specific to any particular +# processor, as long as it's "z-CPU". Latter implies that the code +# remains z/Architecture specific. On z900 SHA256 was measured to +# perform 2.4x and SHA512 - 13x better than code generated by gcc 4.3. 
+ +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + $t0="%r0"; $t1="%r1"; $ctx="%r2"; $t2="%r2"; @@ -78,7 +98,8 @@ if ($output =~ /512/) { } $Func="sha${label}_block_data_order"; $Table="K${label}"; -$frame=160+16*$SZ; +$stdframe=16*$SIZE_T+4*8; +$frame=$stdframe+16*$SZ; sub BODY_00_15 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; @@ -93,9 +114,9 @@ $code.=<<___; xgr $t0,$t1 $ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]` xgr $t2,$g - $ST $T1,`160+$SZ*($i%16)`($sp) + $ST $T1,`$stdframe+$SZ*($i%16)`($sp) xgr $t0,$t1 # Sigma1(e) - la $T1,0($T1,$h) # T1+=h + algr $T1,$h # T1+=h ngr $t2,$e lgr $t1,$a algr $T1,$t0 # T1+=Sigma1(e) @@ -113,7 +134,7 @@ $code.=<<___; ngr $t2,$b algr $h,$T1 # h+=T1 ogr $t2,$t1 # Maj(a,b,c) - la $d,0($d,$T1) # d+=T1 + algr $d,$T1 # d+=T1 algr $h,$t2 # h+=Maj(a,b,c) ___ } @@ -122,19 +143,19 @@ sub BODY_16_XX { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; $code.=<<___; - $LD $T1,`160+$SZ*(($i+1)%16)`($sp) ### $i - $LD $t1,`160+$SZ*(($i+14)%16)`($sp) + $LD $T1,`$stdframe+$SZ*(($i+1)%16)`($sp) ### $i + $LD $t1,`$stdframe+$SZ*(($i+14)%16)`($sp) $ROT $t0,$T1,$sigma0[0] $SHR $T1,$sigma0[2] $ROT $t2,$t0,`$sigma0[1]-$sigma0[0]` xgr $T1,$t0 $ROT $t0,$t1,$sigma1[0] - xgr $T1,$t2 # sigma0(X[i+1]) + xgr $T1,$t2 # sigma0(X[i+1]) $SHR $t1,$sigma1[2] - $ADD $T1,`160+$SZ*($i%16)`($sp) # +=X[i] + $ADD $T1,`$stdframe+$SZ*($i%16)`($sp) # +=X[i] xgr $t1,$t0 $ROT $t0,$t0,`$sigma1[1]-$sigma1[0]` - $ADD $T1,`160+$SZ*(($i+9)%16)`($sp) # +=X[i+9] + $ADD $T1,`$stdframe+$SZ*(($i+9)%16)`($sp) # +=X[i+9] xgr $t1,$t0 # sigma1(X[i+14]) algr $T1,$t1 # +=sigma1(X[i+14]) ___ @@ -212,6 +233,7 @@ $code.=<<___; .globl $Func .type $Func,\@function $Func: + sllg $len,$len,`log(16*$SZ)/log(2)` ___ $code.=<<___ if ($kimdfunc); larl %r1,OPENSSL_s390xcap_P @@ -219,15 +241,15 @@ $code.=<<___ if ($kimdfunc); tmhl %r0,0x4000 # check for message-security assist jz .Lsoftware lghi %r0,0 - la %r1,16($sp) + la %r1,`2*$SIZE_T`($sp) .long 0xb93e0002 # kimd 
%r0,%r2 - lg %r0,16($sp) + lg %r0,`2*$SIZE_T`($sp) tmhh %r0,`0x8000>>$kimdfunc` jz .Lsoftware lghi %r0,$kimdfunc lgr %r1,$ctx lgr %r2,$inp - sllg %r3,$len,`log(16*$SZ)/log(2)` + lgr %r3,$len .long 0xb93e0002 # kimd %r0,%r2 brc 1,.-4 # pay attention to "partial completion" br %r14 @@ -235,13 +257,12 @@ $code.=<<___ if ($kimdfunc); .Lsoftware: ___ $code.=<<___; - sllg $len,$len,`log(16*$SZ)/log(2)` lghi %r1,-$frame - agr $len,$inp - stmg $ctx,%r15,16($sp) + la $len,0($len,$inp) + stm${g} $ctx,%r15,`2*$SIZE_T`($sp) lgr %r0,$sp la $sp,0(%r1,$sp) - stg %r0,0($sp) + st${g} %r0,0($sp) larl $tbl,$Table $LD $A,`0*$SZ`($ctx) @@ -265,7 +286,7 @@ $code.=<<___; clgr $len,$t0 jne .Lrounds_16_xx - lg $ctx,`$frame+16`($sp) + l${g} $ctx,`$frame+2*$SIZE_T`($sp) la $inp,`16*$SZ`($inp) $ADD $A,`0*$SZ`($ctx) $ADD $B,`1*$SZ`($ctx) @@ -283,10 +304,10 @@ $code.=<<___; $ST $F,`5*$SZ`($ctx) $ST $G,`6*$SZ`($ctx) $ST $H,`7*$SZ`($ctx) - clg $inp,`$frame+32`($sp) + cl${g} $inp,`$frame+4*$SIZE_T`($sp) jne .Lloop - lmg %r6,%r15,`$frame+48`($sp) + lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp) br %r14 .size $Func,.-$Func .string "SHA${label} block transform for s390x, CRYPTOGAMS by " -- 2.40.0