From 313e6ec11fb8a7bda1676ce5804bee8755664141 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Thu, 2 Apr 2015 10:17:42 +0200 Subject: [PATCH] Add assembly support for 32-bit iOS. Reviewed-by: Matt Caswell Reviewed-by: Richard Levitte --- Configurations/10-main.conf | 9 ++ crypto/Makefile | 1 + crypto/aes/asm/aes-armv4.pl | 38 ++++- crypto/aes/asm/bsaes-armv7.pl | 44 +++++- crypto/armcap.c | 2 +- crypto/{armv4cpuid.S => armv4cpuid.pl} | 27 +++- crypto/bn/asm/armv4-gf2m.pl | 20 ++- crypto/bn/asm/armv4-mont.pl | 22 ++- crypto/ec/asm/ecp_nistz256-armv4.pl | 184 +++++++++++++------------ crypto/modes/asm/ghash-armv4.pl | 37 +++-- crypto/perlasm/arm-xlate.pl | 2 +- crypto/sha/asm/sha1-armv4-large.pl | 19 ++- crypto/sha/asm/sha256-armv4.pl | 32 ++++- crypto/sha/asm/sha512-armv4.pl | 26 +++- 14 files changed, 327 insertions(+), 136 deletions(-) rename crypto/{armv4cpuid.S => armv4cpuid.pl} (88%) diff --git a/Configurations/10-main.conf b/Configurations/10-main.conf index a26fc6eaa9..025bd86ac7 100644 --- a/Configurations/10-main.conf +++ b/Configurations/10-main.conf @@ -1391,6 +1391,15 @@ cflags => "-isysroot \$(CROSS_TOP)/SDKs/\$(CROSS_SDK) -fno-common", sys_id => "iOS", }, + "ios-cross" => { + inherit_from => [ "darwin-common", asm("armv4_asm") ], + # It should be possible to go below iOS 6 and even add -arch armv6, + # thus targeting iPhone pre-3GS, but it's assumed to be irrelevant + # at this point (and impossible to download SDK for). + cflags => "-arch armv7 -mios-version-min=6.0.0 -isysroot \$(CROSS_TOP)/SDKs/\$(CROSS_SDK) -fno-common", + sys_id => "iOS", + perlasm_scheme => "ios32", + }, "ios64-cross" => { inherit_from => [ "darwin-common", asm("aarch64_asm") ], cflags => "-arch arm64 -mios-version-min=7.0.0 -isysroot \$(CROSS_TOP)/SDKs/\$(CROSS_SDK) -fno-common", diff --git a/crypto/Makefile b/crypto/Makefile index ec5af47a37..5270d75e19 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -84,6 +84,7 @@ alphacpuid.s: alphacpuid.pl $(PERL) alphacpuid.pl > $$preproc && \ $(CC) -E -P $$preproc > $@ && rm $$preproc) arm64cpuid.S: arm64cpuid.pl; $(PERL) arm64cpuid.pl $(PERLASM_SCHEME) > $@ +armv4cpuid.S: armv4cpuid.pl; $(PERL) armv4cpuid.pl $(PERLASM_SCHEME) > $@ subdirs: @target=all; $(RECURSIVE_MAKE) diff --git a/crypto/aes/asm/aes-armv4.pl b/crypto/aes/asm/aes-armv4.pl index a620a7cddb..0f7ec39d56 100644 --- a/crypto/aes/asm/aes-armv4.pl +++ b/crypto/aes/asm/aes-armv4.pl @@ -32,8 +32,20 @@ # Profiler-assisted and platform-specific optimization resulted in 16% # improvement on Cortex A8 core and ~21.5 cycles per byte. 
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $s0="r0"; $s1="r1"; @@ -62,7 +74,7 @@ $code=<<___; .code 32 #else .syntax unified -# ifdef __thumb2__ +# if defined(__thumb2__) && !defined(__APPLE__) .thumb # else .code 32 @@ -187,9 +199,13 @@ AES_encrypt: adr r3,AES_encrypt #endif stmdb sp!,{r1,r4-r12,lr} +#ifdef __APPLE__ + adr $tbl,AES_Te +#else + sub $tbl,r3,#AES_encrypt-AES_Te @ Te +#endif mov $rounds,r0 @ inp mov $key,r2 - sub $tbl,r3,#AES_encrypt-AES_Te @ Te #if __ARM_ARCH__<7 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral ldrb $t1,[$rounds,#2] @ manner... @@ -457,12 +473,16 @@ _armv4_AES_set_encrypt_key: bne .Labrt .Lok: stmdb sp!,{r4-r12,lr} - sub $tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4 - mov $rounds,r0 @ inp mov lr,r1 @ bits mov $key,r2 @ key +#ifdef __APPLE__ + adr $tbl,AES_Te+1024 @ Te4 +#else + sub $tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4 +#endif + #if __ARM_ARCH__<7 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral ldrb $t1,[$rounds,#2] @ manner... @@ -955,9 +975,13 @@ AES_decrypt: adr r3,AES_decrypt #endif stmdb sp!,{r1,r4-r12,lr} +#ifdef __APPLE__ + adr $tbl,AES_Td +#else + sub $tbl,r3,#AES_decrypt-AES_Td @ Td +#endif mov $rounds,r0 @ inp mov $key,r2 - sub $tbl,r3,#AES_decrypt-AES_Td @ Td #if __ARM_ARCH__<7 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral ldrb $t1,[$rounds,#2] @ manner... 
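A note on the preamble that recurs in aes-armv4.pl above and in each of the perlasm
files below: the first argument to every ARMv4 script may now be a perlasm flavour
(the new "ios32", or whatever the build passes as $(PERLASM_SCHEME)); if it instead
looks like a file name, the script falls back to the old output-file-only
invocation; and any flavour other than "void" pipes the generated source through
arm-xlate.pl, which rewrites the GNU-style assembly for the target assembler. The
following is a minimal standalone sketch of just that dispatch logic, with the
translator pipe replaced by a print so it can run outside the tree; it mirrors the
control flow added above and is illustrative only.

#!/usr/bin/env perl
# Standalone sketch of the flavour/output dispatch this patch adds to each
# ARMv4 perlasm script; illustrative only, not OpenSSL code.
use strict;
use warnings;

my $flavour = shift;    # e.g. "ios32", "linux32", "void", or an output file
my $output;

if (defined $flavour && $flavour =~ /^\w[\w\-]*\.\w+$/) {
    $output  = $flavour;        # legacy invocation: only an output file given
    $flavour = undef;
} else {
    while (defined($output = shift) && $output !~ /^\w[\w\-]*\.\w+$/) { }
}

if ($flavour && $flavour ne "void") {
    # the real scripts do: open STDOUT,"| \"$^X\" arm-xlate.pl $flavour $output";
    printf "would pipe generated code through arm-xlate.pl (%s) into %s\n",
           $flavour, defined $output ? $output : "stdout";
} else {
    # the real scripts open $output (or stdout) and emit untranslated assembly
    printf "would write untranslated output to %s\n",
           defined $output ? $output : "stdout";
}

With the new ios-cross target (perlasm_scheme => "ios32") the scripts end up
invoked along the lines of "perl aes-armv4.pl ios32 aes-armv4.S" (the file names
here are examples only), while a bare output-file argument or the "void" flavour
takes the untranslated path and so leaves existing builds unchanged.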
diff --git a/crypto/aes/asm/bsaes-armv7.pl b/crypto/aes/asm/bsaes-armv7.pl index a4d3856e7d..043fa383b7 100644 --- a/crypto/aes/asm/bsaes-armv7.pl +++ b/crypto/aes/asm/bsaes-armv7.pl @@ -47,8 +47,20 @@ # # -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} my ($inp,$out,$len,$key)=("r0","r1","r2","r3"); my @XMM=map("q$_",(0..15)); @@ -715,7 +727,7 @@ $code.=<<___; .text .syntax unified @ ARMv7-capable assembler is expected to handle this -#ifdef __thumb2__ +#if defined(__thumb2__) && !defined(__APPLE__) .thumb #else .code 32 @@ -726,7 +738,11 @@ $code.=<<___; _bsaes_decrypt8: adr $const,_bsaes_decrypt8 vldmia $key!, {@XMM[9]} @ round 0 key +#ifdef __APPLE__ + adr $const,.LM0ISR +#else add $const,$const,#.LM0ISR-_bsaes_decrypt8 +#endif vldmia $const!, {@XMM[8]} @ .LM0ISR veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key @@ -821,7 +837,11 @@ _bsaes_const: _bsaes_encrypt8: adr $const,_bsaes_encrypt8 vldmia $key!, {@XMM[9]} @ round 0 key +#ifdef __APPLE__ + adr $const,.LM0SR +#else sub $const,$const,#_bsaes_encrypt8-.LM0SR +#endif vldmia $const!, {@XMM[8]} @ .LM0SR _bsaes_encrypt8_alt: @@ -925,7 +945,11 @@ $code.=<<___; _bsaes_key_convert: adr $const,_bsaes_key_convert vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key +#ifdef __APPLE__ + adr $const,.LM0 +#else sub $const,$const,#_bsaes_key_convert-.LM0 +#endif vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key vmov.i8 @XMM[8], #0x01 @ bit masks @@ -1392,7 +1416,12 @@ bsaes_ctr32_encrypt_blocks: vstmia r12, {@XMM[7]} @ save last round key vld1.8 {@XMM[0]}, [$ctr] @ load counter +#ifdef __APPLE__ + mov $ctr, #.LREVM0SR-.LM0 + add $ctr, $const, $ctr +#else add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr +#endif vldmia $keysched, {@XMM[4]} @ load round0 key #else ldr r12, [$key, #244] @@ -1449,7 +1478,12 @@ bsaes_ctr32_encrypt_blocks: vldmia $ctr, {@XMM[8]} @ .LREVM0SR mov r5, $rounds @ pass rounds vstmia $fp, {@XMM[10]} @ save next counter +#ifdef __APPLE__ + mov $const, #.LREVM0SR-.LSR + sub $const, $ctr, $const +#else sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants +#endif bl _bsaes_encrypt8_alt @@ -1550,7 +1584,7 @@ bsaes_ctr32_encrypt_blocks: rev r8, r8 #endif sub sp, sp, #0x10 - vst1.8 {@XMM[1]}, [sp,:64] @ copy counter value + vst1.8 {@XMM[1]}, [sp] @ copy counter value sub sp, sp, #0x10 .Lctr_enc_short_loop: @@ -1561,7 +1595,7 @@ bsaes_ctr32_encrypt_blocks: bl AES_encrypt vld1.8 {@XMM[0]}, [r4]! 
@ load input - vld1.8 {@XMM[1]}, [sp,:64] @ load encrypted counter + vld1.8 {@XMM[1]}, [sp] @ load encrypted counter add r8, r8, #1 #ifdef __ARMEL__ rev r0, r8 diff --git a/crypto/armcap.c b/crypto/armcap.c index 3dbe5748ea..1afbc9fcd0 100644 --- a/crypto/armcap.c +++ b/crypto/armcap.c @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include "arm_arch.h" diff --git a/crypto/armv4cpuid.S b/crypto/armv4cpuid.pl similarity index 88% rename from crypto/armv4cpuid.S rename to crypto/armv4cpuid.pl index 65010ae4fe..1c447187c7 100644 --- a/crypto/armv4cpuid.S +++ b/crypto/armv4cpuid.pl @@ -1,3 +1,17 @@ +#!/usr/bin/env perl + +$flavour = shift; +$output = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +$code.=<<___; #include "arm_arch.h" .text @@ -91,7 +105,11 @@ _armv7_neon_probe: .global _armv7_tick .type _armv7_tick,%function _armv7_tick: +#ifdef __APPLE__ + mrrc p15,0,r0,r1,c14 @ CNTPCT +#else mrrc p15,1,r0,r1,c14 @ CNTVCT +#endif bx lr .size _armv7_tick,.-_armv7_tick @@ -130,6 +148,9 @@ OPENSSL_wipe_cpu: ldr r0,.LOPENSSL_armcap adr r1,.LOPENSSL_armcap ldr r0,[r1,r0] +#ifdef __APPLE__ + ldr r0,[r0] +#endif #endif eor r2,r2,r2 eor r3,r3,r3 @@ -190,7 +211,7 @@ OPENSSL_instrument_bus2: .align 5 #if __ARM_MAX_ARCH__>=7 .LOPENSSL_armcap: -.word OPENSSL_armcap_P-.LOPENSSL_armcap +.word OPENSSL_armcap_P-. #endif #if __ARM_ARCH__>=6 .align 5 @@ -207,3 +228,7 @@ atomic_add_spinlock: .comm OPENSSL_armcap_P,4,4 .hidden OPENSSL_armcap_P +___ + +print $code; +close STDOUT; diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl index 8f529c95cf..f05461a8f0 100644 --- a/crypto/bn/asm/armv4-gf2m.pl +++ b/crypto/bn/asm/armv4-gf2m.pl @@ -32,8 +32,20 @@ # # http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $code=<<___; #include "arm_arch.h" @@ -213,8 +225,8 @@ $code.=<<___; .align 5 .LNEON: ldr r12, [sp] @ 5th argument - vmov.32 $a, r2, r1 - vmov.32 $b, r12, r3 + vmov $a, r2, r1 + vmov $b, r12, r3 vmov.i64 $k48, #0x0000ffffffffffff vmov.i64 $k32, #0x00000000ffffffff vmov.i64 $k16, #0x000000000000ffff diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl index 1d330e9f8a..59f218b5cf 100644 --- a/crypto/bn/asm/armv4-mont.pl +++ b/crypto/bn/asm/armv4-mont.pl @@ -38,8 +38,20 @@ # for execution on all NEON-capable processors, because gain on # others outweighs the marginal loss on Cortex-A9. 
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $num="r0"; # starts as num argument, but holds &tp[num-1] $ap="r1"; @@ -75,7 +87,7 @@ $code=<<___; #if __ARM_MAX_ARCH__>=7 .align 5 .LOPENSSL_armcap: -.word OPENSSL_armcap_P-bn_mul_mont +.word OPENSSL_armcap_P-.Lbn_mul_mont #endif .global bn_mul_mont @@ -83,6 +95,7 @@ $code=<<___; .align 5 bn_mul_mont: +.Lbn_mul_mont: ldr ip,[sp,#4] @ load num stmdb sp!,{r0,r2} @ sp points at argument block #if __ARM_MAX_ARCH__>=7 @@ -91,6 +104,9 @@ bn_mul_mont: adr r0,bn_mul_mont ldr r2,.LOPENSSL_armcap ldr r0,[r0,r2] +#ifdef __APPLE__ + ldr r0,[r0] +#endif tst r0,#1 @ NEON available? ldmia sp, {r0,r2} beq .Lialu diff --git a/crypto/ec/asm/ecp_nistz256-armv4.pl b/crypto/ec/asm/ecp_nistz256-armv4.pl index 9f5500ebf3..b49b77ea3e 100755 --- a/crypto/ec/asm/ecp_nistz256-armv4.pl +++ b/crypto/ec/asm/ecp_nistz256-armv4.pl @@ -27,15 +27,19 @@ # operation. Keep in mind that +200% means 3x improvement. $flavour = shift; -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or -die "can't locate arm-xlate.pl"; - -open OUT,"| \"$^X\" $xlate $flavour $output"; -*STDOUT=*OUT; +if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $code.=<<___; #include "arm_arch.h" @@ -129,7 +133,7 @@ ecp_nistz256_from_mont: .align 4 ecp_nistz256_mul_by_2: stmdb sp!,{r4-r12,lr} - bl _ecp_nistz256_mul_by_2 + bl __ecp_nistz256_mul_by_2 #if __ARM_ARCH__>=5 || !defined(__thumb__) ldmia sp!,{r4-r12,pc} #else @@ -138,9 +142,9 @@ ecp_nistz256_mul_by_2: #endif .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 -.type _ecp_nistz256_mul_by_2,%function +.type __ecp_nistz256_mul_by_2,%function .align 4 -_ecp_nistz256_mul_by_2: +__ecp_nistz256_mul_by_2: ldr $a0,[$a_ptr,#0] ldr $a1,[$a_ptr,#4] ldr $a2,[$a_ptr,#8] @@ -161,7 +165,7 @@ _ecp_nistz256_mul_by_2: movcs $ff,#-1 @ $ff = carry ? 
-1 : 0 b .Lreduce_by_sub -.size _ecp_nistz256_mul_by_2,.-_ecp_nistz256_mul_by_2 +.size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2 @ void ecp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8], @ const BN_ULONG r2[8]); @@ -170,7 +174,7 @@ _ecp_nistz256_mul_by_2: .align 4 ecp_nistz256_add: stmdb sp!,{r4-r12,lr} - bl _ecp_nistz256_add + bl __ecp_nistz256_add #if __ARM_ARCH__>=5 || !defined(__thumb__) ldmia sp!,{r4-r12,pc} #else @@ -179,9 +183,9 @@ ecp_nistz256_add: #endif .size ecp_nistz256_add,.-ecp_nistz256_add -.type _ecp_nistz256_add,%function +.type __ecp_nistz256_add,%function .align 4 -_ecp_nistz256_add: +__ecp_nistz256_add: str lr,[sp,#-4]! @ push lr ldr $a0,[$a_ptr,#0] @@ -239,7 +243,7 @@ _ecp_nistz256_add: str $a7,[$r_ptr,#28] mov pc,lr -.size _ecp_nistz256_add,.-_ecp_nistz256_add +.size __ecp_nistz256_add,.-__ecp_nistz256_add @ void ecp_nistz256_mul_by_3(BN_ULONG r0[8],const BN_ULONG r1[8]); .globl ecp_nistz256_mul_by_3 @@ -247,7 +251,7 @@ _ecp_nistz256_add: .align 4 ecp_nistz256_mul_by_3: stmdb sp!,{r4-r12,lr} - bl _ecp_nistz256_mul_by_3 + bl __ecp_nistz256_mul_by_3 #if __ARM_ARCH__>=5 || !defined(__thumb__) ldmia sp!,{r4-r12,pc} #else @@ -256,13 +260,13 @@ ecp_nistz256_mul_by_3: #endif .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 -.type _ecp_nistz256_mul_by_3,%function +.type __ecp_nistz256_mul_by_3,%function .align 4 -_ecp_nistz256_mul_by_3: +__ecp_nistz256_mul_by_3: str lr,[sp,#-4]! @ push lr @ As multiplication by 3 is performed as 2*n+n, below are inline - @ copies of _ecp_nistz256_mul_by_2 and _ecp_nistz256_add, see + @ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see @ corresponding subroutines for details. ldr $a0,[$a_ptr,#0] @@ -326,7 +330,7 @@ _ecp_nistz256_mul_by_3: .align 4 ecp_nistz256_div_by_2: stmdb sp!,{r4-r12,lr} - bl _ecp_nistz256_div_by_2 + bl __ecp_nistz256_div_by_2 #if __ARM_ARCH__>=5 || !defined(__thumb__) ldmia sp!,{r4-r12,pc} #else @@ -335,9 +339,9 @@ ecp_nistz256_div_by_2: #endif .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 -.type _ecp_nistz256_div_by_2,%function +.type __ecp_nistz256_div_by_2,%function .align 4 -_ecp_nistz256_div_by_2: +__ecp_nistz256_div_by_2: @ ret = (a is odd ? a+mod : a) >> 1 ldr $a0,[$a_ptr,#0] @@ -392,16 +396,16 @@ _ecp_nistz256_div_by_2: str $a7,[$r_ptr,#28] mov pc,lr -.size _ecp_nistz256_div_by_2,.-_ecp_nistz256_div_by_2 +.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 @ void ecp_nistz256_sub(BN_ULONG r0[8],const BN_ULONG r1[8], -@ const BN_ULONG r2[8]); +@ const BN_ULONG r2[8]); .globl ecp_nistz256_sub .type ecp_nistz256_sub,%function .align 4 ecp_nistz256_sub: stmdb sp!,{r4-r12,lr} - bl _ecp_nistz256_sub + bl __ecp_nistz256_sub #if __ARM_ARCH__>=5 || !defined(__thumb__) ldmia sp!,{r4-r12,pc} #else @@ -410,9 +414,9 @@ ecp_nistz256_sub: #endif .size ecp_nistz256_sub,.-ecp_nistz256_sub -.type _ecp_nistz256_sub,%function +.type __ecp_nistz256_sub,%function .align 4 -_ecp_nistz256_sub: +__ecp_nistz256_sub: str lr,[sp,#-4]! 
@ push lr ldr $a0,[$a_ptr,#0] @@ -469,7 +473,7 @@ _ecp_nistz256_sub: str $a7,[$r_ptr,#28] mov pc,lr -.size _ecp_nistz256_sub,.-_ecp_nistz256_sub +.size __ecp_nistz256_sub,.-__ecp_nistz256_sub @ void ecp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]); .globl ecp_nistz256_neg @@ -477,7 +481,7 @@ _ecp_nistz256_sub: .align 4 ecp_nistz256_neg: stmdb sp!,{r4-r12,lr} - bl _ecp_nistz256_neg + bl __ecp_nistz256_neg #if __ARM_ARCH__>=5 || !defined(__thumb__) ldmia sp!,{r4-r12,pc} #else @@ -486,9 +490,9 @@ ecp_nistz256_neg: #endif .size ecp_nistz256_neg,.-ecp_nistz256_neg -.type _ecp_nistz256_neg,%function +.type __ecp_nistz256_neg,%function .align 4 -_ecp_nistz256_neg: +__ecp_nistz256_neg: ldr $a0,[$a_ptr,#0] eor $ff,$ff,$ff ldr $a1,[$a_ptr,#4] @@ -509,7 +513,7 @@ _ecp_nistz256_neg: sbc $ff,$ff,$ff b .Lreduce_by_add -.size _ecp_nistz256_neg,.-_ecp_nistz256_neg +.size __ecp_nistz256_neg,.-__ecp_nistz256_neg ___ { my @acc=map("r$_",(3..11)); @@ -533,7 +537,7 @@ ecp_nistz256_sqr_mont: ecp_nistz256_mul_mont: .Lecp_nistz256_mul_mont: stmdb sp!,{r4-r12,lr} - bl _ecp_nistz256_mul_mont + bl __ecp_nistz256_mul_mont #if __ARM_ARCH__>=5 || !defined(__thumb__) ldmia sp!,{r4-r12,pc} #else @@ -542,9 +546,9 @@ ecp_nistz256_mul_mont: #endif .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont -.type _ecp_nistz256_mul_mont,%function +.type __ecp_nistz256_mul_mont,%function .align 4 -_ecp_nistz256_mul_mont: +__ecp_nistz256_mul_mont: stmdb sp!,{r0-r2,lr} @ make a copy of arguments too ldr $bj,[$b_ptr,#0] @ b[0] @@ -675,14 +679,14 @@ $code.=<<___; @ "other way around", namely subtract modulus from result @ and if it borrowed, add modulus back. - subs @acc[1],@acc[1],#-1 @ compare to modulus - sbcs @acc[2],@acc[2],#-1 - sbcs @acc[3],@acc[3],#-1 + adds @acc[1],@acc[1],#1 @ subs @acc[1],@acc[1],#-1 + adcs @acc[2],@acc[2],#0 @ sbcs @acc[2],@acc[2],#-1 + adcs @acc[3],@acc[3],#0 @ sbcs @acc[3],@acc[3],#-1 sbcs @acc[4],@acc[4],#0 sbcs @acc[5],@acc[5],#0 sbcs @acc[6],@acc[6],#0 sbcs @acc[7],@acc[7],#1 - sbcs @acc[8],@acc[8],#-1 + adcs @acc[8],@acc[8],#0 @ sbcs @acc[8],@acc[8],#-1 ldr lr,[sp,#44] @ restore lr sbc @acc[0],@acc[0],#0 @ broadcast borrow bit add sp,sp,#48 @@ -710,7 +714,7 @@ $code.=<<___; str @acc[8],[$r_ptr,#28] mov pc,lr -.size _ecp_nistz256_mul_mont,.-_ecp_nistz256_mul_mont +.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont ___ } @@ -1064,7 +1068,7 @@ ___ {{{ ######################################################################## # Below $aN assignment matches order in which 256-bit result appears in -# register bank at return from _ecp_nistz256_mul_mont, so that we can +# register bank at return from __ecp_nistz256_mul_mont, so that we can # skip over reloading it from memory. This means that below functions # use custom calling sequence accepting 256-bit input in registers, # output pointer in r0, $r_ptr, and optional pointer in r2, $b_ptr. 
@@ -1164,9 +1168,9 @@ __ecp_nistz256_sub_morf: mov pc,lr .size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf -.type __ecp_nistz256_mul_by_2,%function +.type __ecp_nistz256_add_self,%function .align 4 -__ecp_nistz256_mul_by_2: +__ecp_nistz256_add_self: adds $a0,$a0,$a0 @ a[0:7]+=a[0:7] adcs $a1,$a1,$a1 adcs $a2,$a2,$a2 @@ -1196,7 +1200,7 @@ __ecp_nistz256_mul_by_2: str $a7,[$r_ptr,#28] mov pc,lr -.size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2 +.size __ecp_nistz256_add_self,.-__ecp_nistz256_add_self ___ @@ -1227,69 +1231,69 @@ ecp_nistz256_point_double: stmia r3,{r4-r11} add $r_ptr,sp,#$S - bl _ecp_nistz256_mul_by_2 @ p256_mul_by_2(S, in_y); + bl __ecp_nistz256_mul_by_2 @ p256_mul_by_2(S, in_y); add $b_ptr,$a_ptr,#32 add $a_ptr,$a_ptr,#32 add $r_ptr,sp,#$Zsqr - bl _ecp_nistz256_mul_mont @ p256_sqr_mont(Zsqr, in_z); + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Zsqr, in_z); add $a_ptr,sp,#$S add $b_ptr,sp,#$S add $r_ptr,sp,#$S - bl _ecp_nistz256_mul_mont @ p256_sqr_mont(S, S); + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(S, S); ldr $b_ptr,[sp,#32*5+4] add $a_ptr,$b_ptr,#32 add $b_ptr,$b_ptr,#64 add $r_ptr,sp,#$tmp0 - bl _ecp_nistz256_mul_mont @ p256_mul_mont(tmp0, in_z, in_y); + bl __ecp_nistz256_mul_mont @ p256_mul_mont(tmp0, in_z, in_y); ldr $r_ptr,[sp,#32*5] add $r_ptr,$r_ptr,#64 - bl __ecp_nistz256_mul_by_2 @ p256_mul_by_2(res_z, tmp0); + bl __ecp_nistz256_add_self @ p256_mul_by_2(res_z, tmp0); add $a_ptr,sp,#$in_x add $b_ptr,sp,#$Zsqr add $r_ptr,sp,#$M - bl _ecp_nistz256_add @ p256_add(M, in_x, Zsqr); + bl __ecp_nistz256_add @ p256_add(M, in_x, Zsqr); add $a_ptr,sp,#$in_x add $b_ptr,sp,#$Zsqr add $r_ptr,sp,#$Zsqr - bl _ecp_nistz256_sub @ p256_sub(Zsqr, in_x, Zsqr); + bl __ecp_nistz256_sub @ p256_sub(Zsqr, in_x, Zsqr); add $a_ptr,sp,#$S add $b_ptr,sp,#$S add $r_ptr,sp,#$tmp0 - bl _ecp_nistz256_mul_mont @ p256_sqr_mont(tmp0, S); + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(tmp0, S); add $a_ptr,sp,#$Zsqr add $b_ptr,sp,#$M add $r_ptr,sp,#$M - bl _ecp_nistz256_mul_mont @ p256_mul_mont(M, M, Zsqr); + bl __ecp_nistz256_mul_mont @ p256_mul_mont(M, M, Zsqr); ldr $r_ptr,[sp,#32*5] add $a_ptr,sp,#$tmp0 add $r_ptr,$r_ptr,#32 - bl _ecp_nistz256_div_by_2 @ p256_div_by_2(res_y, tmp0); + bl __ecp_nistz256_div_by_2 @ p256_div_by_2(res_y, tmp0); add $a_ptr,sp,#$M add $r_ptr,sp,#$M - bl _ecp_nistz256_mul_by_3 @ p256_mul_by_3(M, M); + bl __ecp_nistz256_mul_by_3 @ p256_mul_by_3(M, M); add $a_ptr,sp,#$in_x add $b_ptr,sp,#$S add $r_ptr,sp,#$S - bl _ecp_nistz256_mul_mont @ p256_mul_mont(S, S, in_x); + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, in_x); add $r_ptr,sp,#$tmp0 - bl __ecp_nistz256_mul_by_2 @ p256_mul_by_2(tmp0, S); + bl __ecp_nistz256_add_self @ p256_mul_by_2(tmp0, S); ldr $r_ptr,[sp,#32*5] add $a_ptr,sp,#$M add $b_ptr,sp,#$M - bl _ecp_nistz256_mul_mont @ p256_sqr_mont(res_x, M); + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(res_x, M); add $b_ptr,sp,#$tmp0 bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, tmp0); @@ -1300,7 +1304,7 @@ ecp_nistz256_point_double: add $a_ptr,sp,#$M add $b_ptr,sp,#$S - bl _ecp_nistz256_mul_mont @ p256_mul_mont(S, S, M); + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, M); ldr $r_ptr,[sp,#32*5] add $b_ptr,$r_ptr,#32 @@ -1398,32 +1402,32 @@ ecp_nistz256_point_add: add $a_ptr,sp,#$in2_z add $b_ptr,sp,#$in2_z add $r_ptr,sp,#$Z2sqr - bl _ecp_nistz256_mul_mont @ p256_sqr_mont(Z2sqr, in2_z); + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z2sqr, in2_z); add $a_ptr,sp,#$in1_z add $b_ptr,sp,#$in1_z add $r_ptr,sp,#$Z1sqr - bl _ecp_nistz256_mul_mont @ 
p256_sqr_mont(Z1sqr, in1_z); + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z); add $a_ptr,sp,#$in2_z add $b_ptr,sp,#$Z2sqr add $r_ptr,sp,#$S1 - bl _ecp_nistz256_mul_mont @ p256_mul_mont(S1, Z2sqr, in2_z); + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, Z2sqr, in2_z); add $a_ptr,sp,#$in1_z add $b_ptr,sp,#$Z1sqr add $r_ptr,sp,#$S2 - bl _ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z); + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z); add $a_ptr,sp,#$in1_y add $b_ptr,sp,#$S1 add $r_ptr,sp,#$S1 - bl _ecp_nistz256_mul_mont @ p256_mul_mont(S1, S1, in1_y); + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, S1, in1_y); add $a_ptr,sp,#$in2_y add $b_ptr,sp,#$S2 add $r_ptr,sp,#$S2 - bl _ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y); + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y); add $b_ptr,sp,#$S1 add $r_ptr,sp,#$R @@ -1441,12 +1445,12 @@ ecp_nistz256_point_add: str $a0,[sp,#32*18+12] add $r_ptr,sp,#$U1 - bl _ecp_nistz256_mul_mont @ p256_mul_mont(U1, in1_x, Z2sqr); + bl __ecp_nistz256_mul_mont @ p256_mul_mont(U1, in1_x, Z2sqr); add $a_ptr,sp,#$in2_x add $b_ptr,sp,#$Z1sqr add $r_ptr,sp,#$U2 - bl _ecp_nistz256_mul_mont @ p256_mul_mont(U2, in2_x, Z1sqr); + bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in2_x, Z1sqr); add $b_ptr,sp,#$U1 add $r_ptr,sp,#$H @@ -1489,35 +1493,35 @@ ecp_nistz256_point_add: add $a_ptr,sp,#$R add $b_ptr,sp,#$R add $r_ptr,sp,#$Rsqr - bl _ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R); + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R); add $a_ptr,sp,#$H add $b_ptr,sp,#$in1_z add $r_ptr,sp,#$res_z - bl _ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z); + bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z); add $a_ptr,sp,#$H add $b_ptr,sp,#$H add $r_ptr,sp,#$Hsqr - bl _ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H); + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H); add $a_ptr,sp,#$in2_z add $b_ptr,sp,#$res_z add $r_ptr,sp,#$res_z - bl _ecp_nistz256_mul_mont @ p256_mul_mont(res_z, res_z, in2_z); + bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, res_z, in2_z); add $a_ptr,sp,#$H add $b_ptr,sp,#$Hsqr add $r_ptr,sp,#$Hcub - bl _ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H); + bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H); add $a_ptr,sp,#$Hsqr add $b_ptr,sp,#$U1 add $r_ptr,sp,#$U2 - bl _ecp_nistz256_mul_mont @ p256_mul_mont(U2, U1, Hsqr); + bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, U1, Hsqr); add $r_ptr,sp,#$Hsqr - bl __ecp_nistz256_mul_by_2 @ p256_mul_by_2(Hsqr, U2); + bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2); add $b_ptr,sp,#$Rsqr add $r_ptr,sp,#$res_x @@ -1533,12 +1537,12 @@ ecp_nistz256_point_add: add $a_ptr,sp,#$Hcub add $b_ptr,sp,#$S1 add $r_ptr,sp,#$S2 - bl _ecp_nistz256_mul_mont @ p256_mul_mont(S2, S1, Hcub); + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S1, Hcub); add $a_ptr,sp,#$R add $b_ptr,sp,#$res_y add $r_ptr,sp,#$res_y - bl _ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R); + bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R); add $b_ptr,sp,#$S2 bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2); @@ -1663,12 +1667,12 @@ ecp_nistz256_point_add_affine: add $a_ptr,sp,#$in1_z add $b_ptr,sp,#$in1_z add $r_ptr,sp,#$Z1sqr - bl _ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z); + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z); add $a_ptr,sp,#$Z1sqr add $b_ptr,sp,#$in2_x add $r_ptr,sp,#$U2 - bl _ecp_nistz256_mul_mont @ p256_mul_mont(U2, Z1sqr, in2_x); + bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, Z1sqr, in2_x); add $b_ptr,sp,#$in1_x 
add $r_ptr,sp,#$H @@ -1677,17 +1681,17 @@ ecp_nistz256_point_add_affine: add $a_ptr,sp,#$Z1sqr add $b_ptr,sp,#$in1_z add $r_ptr,sp,#$S2 - bl _ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z); + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z); add $a_ptr,sp,#$H add $b_ptr,sp,#$in1_z add $r_ptr,sp,#$res_z - bl _ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z); + bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z); add $a_ptr,sp,#$in2_y add $b_ptr,sp,#$S2 add $r_ptr,sp,#$S2 - bl _ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y); + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y); add $b_ptr,sp,#$in1_y add $r_ptr,sp,#$R @@ -1696,25 +1700,25 @@ ecp_nistz256_point_add_affine: add $a_ptr,sp,#$H add $b_ptr,sp,#$H add $r_ptr,sp,#$Hsqr - bl _ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H); + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H); add $a_ptr,sp,#$R add $b_ptr,sp,#$R add $r_ptr,sp,#$Rsqr - bl _ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R); + bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R); add $a_ptr,sp,#$H add $b_ptr,sp,#$Hsqr add $r_ptr,sp,#$Hcub - bl _ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H); + bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H); add $a_ptr,sp,#$Hsqr add $b_ptr,sp,#$in1_x add $r_ptr,sp,#$U2 - bl _ecp_nistz256_mul_mont @ p256_mul_mont(U2, in1_x, Hsqr); + bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in1_x, Hsqr); add $r_ptr,sp,#$Hsqr - bl __ecp_nistz256_mul_by_2 @ p256_mul_by_2(Hsqr, U2); + bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2); add $b_ptr,sp,#$Rsqr add $r_ptr,sp,#$res_x @@ -1730,12 +1734,12 @@ ecp_nistz256_point_add_affine: add $a_ptr,sp,#$Hcub add $b_ptr,sp,#$in1_y add $r_ptr,sp,#$S2 - bl _ecp_nistz256_mul_mont @ p256_mul_mont(S2, in1_y, Hcub); + bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, in1_y, Hcub); add $a_ptr,sp,#$R add $b_ptr,sp,#$res_y add $r_ptr,sp,#$res_y - bl _ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R); + bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R); add $b_ptr,sp,#$S2 bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2); diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl index 44521f8984..7311ad2966 100644 --- a/crypto/modes/asm/ghash-armv4.pl +++ b/crypto/modes/asm/ghash-armv4.pl @@ -71,8 +71,20 @@ # *native* byte order on current platform. See gcm128.c for working # example... -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $Xi="r0"; # argument block $Htbl="r1"; @@ -126,6 +138,11 @@ $code=<<___; .text .code 32 +#ifdef __APPLE__ +#define ldrplb ldrbpl +#define ldrneb ldrbne +#endif + .type rem_4bit,%object .align 5 rem_4bit: @@ -373,9 +390,9 @@ $code.=<<___; .type gcm_init_neon,%function .align 4 gcm_init_neon: - vld1.64 $IN#hi,[r1,:64]! @ load H + vld1.64 $IN#hi,[r1]! 
@ load H vmov.i8 $t0,#0xe1 - vld1.64 $IN#lo,[r1,:64] + vld1.64 $IN#lo,[r1] vshl.i64 $t0#hi,#57 vshr.u64 $t0#lo,#63 @ t0=0xc2....01 vdup.8 $t1,$IN#hi[7] @@ -394,8 +411,8 @@ gcm_init_neon: .type gcm_gmult_neon,%function .align 4 gcm_gmult_neon: - vld1.64 $IN#hi,[$Xi,:64]! @ load Xi - vld1.64 $IN#lo,[$Xi,:64]! + vld1.64 $IN#hi,[$Xi]! @ load Xi + vld1.64 $IN#lo,[$Xi]! vmov.i64 $k48,#0x0000ffffffffffff vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H vmov.i64 $k32,#0x00000000ffffffff @@ -412,8 +429,8 @@ gcm_gmult_neon: .type gcm_ghash_neon,%function .align 4 gcm_ghash_neon: - vld1.64 $Xl#hi,[$Xi,:64]! @ load Xi - vld1.64 $Xl#lo,[$Xi,:64]! + vld1.64 $Xl#hi,[$Xi]! @ load Xi + vld1.64 $Xl#lo,[$Xi]! vmov.i64 $k48,#0x0000ffffffffffff vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H vmov.i64 $k32,#0x00000000ffffffff @@ -468,8 +485,8 @@ $code.=<<___; vrev64.8 $Xl,$Xl #endif sub $Xi,#16 - vst1.64 $Xl#hi,[$Xi,:64]! @ write out Xi - vst1.64 $Xl#lo,[$Xi,:64] + vst1.64 $Xl#hi,[$Xi]! @ write out Xi + vst1.64 $Xl#lo,[$Xi] ret @ bx lr .size gcm_ghash_neon,.-gcm_ghash_neon diff --git a/crypto/perlasm/arm-xlate.pl b/crypto/perlasm/arm-xlate.pl index 22dc7e4ecc..81ceb3142c 100755 --- a/crypto/perlasm/arm-xlate.pl +++ b/crypto/perlasm/arm-xlate.pl @@ -154,7 +154,7 @@ while($line=<>) { $line = &$opcode($arg); } elsif ($mnemonic) { $line = $c.$mnemonic; - $line.= "\t$arg" if ($arg); + $line.= "\t$arg" if ($arg ne ""); } } diff --git a/crypto/sha/asm/sha1-armv4-large.pl b/crypto/sha/asm/sha1-armv4-large.pl index 61307b7c61..356b52fc1b 100644 --- a/crypto/sha/asm/sha1-armv4-large.pl +++ b/crypto/sha/asm/sha1-armv4-large.pl @@ -68,8 +68,20 @@ # # Add ARMv8 code path performing at 2.35 cpb on Apple A7. -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $ctx="r0"; $inp="r1"; @@ -180,6 +192,9 @@ sha1_block_data_order: sub r3,pc,#8 @ sha1_block_data_order ldr r12,.LOPENSSL_armcap ldr r12,[r3,r12] @ OPENSSL_armcap_P +#ifdef __APPLE__ + ldr r12,[r12] +#endif tst r12,#ARMV8_SHA1 bne .LARMv8 tst r12,#ARMV7_NEON diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl index fac0533ea6..efee1fb1f3 100644 --- a/crypto/sha/asm/sha256-armv4.pl +++ b/crypto/sha/asm/sha256-armv4.pl @@ -37,8 +37,20 @@ # # Add ARMv8 code path performing at 2.0 cpb on Apple A7. 
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $ctx="r0"; $t0="r0"; $inp="r1"; $t4="r1"; @@ -167,7 +179,7 @@ $code=<<___; .code 32 #else .syntax unified -# ifdef __thumb2__ +# if defined(__thumb2__) && !defined(__APPLE__) # define adrl adr .thumb # else @@ -198,13 +210,14 @@ K256: .word 0 @ terminator #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) .LOPENSSL_armcap: -.word OPENSSL_armcap_P-sha256_block_data_order +.word OPENSSL_armcap_P-.Lsha256_block_data_order #endif .align 5 .global sha256_block_data_order .type sha256_block_data_order,%function sha256_block_data_order: +.Lsha256_block_data_order: #if __ARM_ARCH__<7 sub r3,pc,#8 @ sha256_block_data_order #else @@ -213,6 +226,9 @@ sha256_block_data_order: #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) ldr r12,.LOPENSSL_armcap ldr r12,[r3,r12] @ OPENSSL_armcap_P +#ifdef __APPLE__ + ldr r12,[r12] +#endif tst r12,#ARMV8_SHA256 bne .LARMv8 tst r12,#ARMV7_NEON @@ -463,7 +479,7 @@ sha256_block_data_order_neon: stmdb sp!,{r4-r12,lr} sub $H,sp,#16*4+16 - adrl $Ktbl,K256 + adr $Ktbl,K256 bic $H,$H,#15 @ align for 128-bit stores mov $t2,sp mov sp,$H @ alloca @@ -583,7 +599,7 @@ my $Ktbl="r3"; $code.=<<___; #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -# ifdef __thumb2__ +# if defined(__thumb2__) && !defined(__APPLE__) # define INST(a,b,c,d) .byte c,d|0xc,a,b # else # define INST(a,b,c,d) .byte a,b,c,d @@ -594,7 +610,9 @@ $code.=<<___; sha256_block_data_order_armv8: .LARMv8: vld1.32 {$ABCD,$EFGH},[$ctx] -# ifdef __thumb2__ +# ifdef __APPLE__ + sub $Ktbl,$Ktbl,#256+32 +# elif defined(__thumb2__) adr $Ktbl,.LARMv8 sub $Ktbl,$Ktbl,#.LARMv8-K256 # else diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl index a2b11a8443..77d6c5eae9 100644 --- a/crypto/sha/asm/sha512-armv4.pl +++ b/crypto/sha/asm/sha512-armv4.pl @@ -50,8 +50,20 @@ $hi="HI"; $lo="LO"; # ==================================================================== -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $ctx="r0"; # parameter block $inp="r1"; @@ -200,7 +212,7 @@ $code=<<___; #endif .text -#if __ARM_ARCH__<7 +#if __ARM_ARCH__<7 || defined(__APPLE__) .code 32 #else .syntax unified @@ -258,7 +270,7 @@ WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) .size K512,.-K512 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) .LOPENSSL_armcap: -.word OPENSSL_armcap_P-sha512_block_data_order +.word OPENSSL_armcap_P-.Lsha512_block_data_order .skip 32-4 #else .skip 32 @@ -267,6 +279,7 @@ WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) 
.global sha512_block_data_order .type sha512_block_data_order,%function sha512_block_data_order: +.Lsha512_block_data_order: #if __ARM_ARCH__<7 sub r3,pc,#8 @ sha512_block_data_order #else @@ -275,6 +288,9 @@ sha512_block_data_order: #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) ldr r12,.LOPENSSL_armcap ldr r12,[r3,r12] @ OPENSSL_armcap_P +#ifdef __APPLE__ + ldr r12,[r12] +#endif tst r12,#1 bne .LNEON #endif @@ -593,8 +609,8 @@ sha512_block_data_order_neon: .LNEON: dmb @ errata #451034 on early Cortex A8 add $len,$inp,$len,lsl#7 @ len to point at the end of inp + adr $Ktbl,K512 VFP_ABI_PUSH - adrl $Ktbl,K512 vldmia $ctx,{$A-$H} @ load context .Loop_neon: ___ -- 2.40.0
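Taken as a whole, the Apple-specific conditionals above reduce to a few recurring
patterns rather than per-algorithm rework: Thumb-2 is only selected when
__thumb2__ is defined and __APPLE__ is not; lookup tables are located with adr
under __APPLE__ instead of sub/add rX,rY,#sym1-sym2 immediates, apparently because
Apple's assembler is stricter about which symbol-difference immediate expressions
it accepts; the pre-UAL conditional loads ldrplb/ldrneb are #defined to
ldrbpl/ldrbne in ghash-armv4.pl; a few [...,:64] alignment hints on vld1/vst1 are
dropped; and OPENSSL_armcap_P is reached through a PC-relative offset to a local
label plus, under __APPLE__, one extra load, presumably because the Mach-O
reference resolves through a non-lazy symbol pointer rather than pointing at the
variable itself. The sketch below just prints that last pattern, copied from the
sha256 path above; it is illustration, not part of the patch, and assumes r3
already holds the address of .Lsha256_block_data_order as set up a few lines
earlier in the real code.

#!/usr/bin/env perl
# Illustrative re-emission of the OPENSSL_armcap_P access pattern installed by
# this patch (shown here in its sha256 form); prints the assembly fragment.
use strict;
use warnings;

my $code = <<'___';
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lsha256_block_data_order	@ anchored to a local label

	@ ... function prologue elided; r3 = .Lsha256_block_data_order ...
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
#ifdef __APPLE__
	ldr	r12,[r12]		@ one more indirection on Mach-O
#endif
	tst	r12,#ARMV8_SHA256
___
print $code;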