From 3e181369dd635c8ce5d133e346928d6670cf83f6 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Wed, 18 Apr 2012 13:01:36 +0000 Subject: [PATCH] C64x+ assembler pack. linux-c64xplus build is *not* tested nor can it be tested, because kernel is not in shape to handle it *yet*. The code is committed mostly to stimulate the kernel development. --- Configure | 3 + TABLE | 33 + crypto/aes/asm/aes-c64xplus.pl | 1361 ++++++++++++++++++++++++++++ crypto/bn/asm/bn-c64xplus.asm | 333 +++++++ crypto/bn/asm/c64xplus-gf2m.pl | 146 +++ crypto/c64xpluscpuid.pl | 246 +++++ crypto/modes/asm/ghash-c64xplus.pl | 231 +++++ crypto/sha/asm/sha1-c64xplus.pl | 323 +++++++ crypto/sha/asm/sha256-c64xplus.pl | 302 ++++++ crypto/sha/asm/sha512-c64xplus.pl | 421 +++++++++ 10 files changed, 3399 insertions(+) create mode 100644 crypto/aes/asm/aes-c64xplus.pl create mode 100644 crypto/bn/asm/bn-c64xplus.asm create mode 100644 crypto/bn/asm/c64xplus-gf2m.pl create mode 100644 crypto/c64xpluscpuid.pl create mode 100644 crypto/modes/asm/ghash-c64xplus.pl create mode 100644 crypto/sha/asm/sha1-c64xplus.pl create mode 100644 crypto/sha/asm/sha256-c64xplus.pl create mode 100644 crypto/sha/asm/sha512-c64xplus.pl diff --git a/Configure b/Configure index 5900e18313..bae77cbebc 100755 --- a/Configure +++ b/Configure @@ -399,6 +399,9 @@ my %table=( "linux-alpha+bwx-gcc","gcc:-O3 -DL_ENDIAN -DTERMIO::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL:${alpha_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", "linux-alpha-ccc","ccc:-fast -readonly_strings -DL_ENDIAN -DTERMIO::-D_REENTRANT:::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL:${alpha_asm}", "linux-alpha+bwx-ccc","ccc:-fast -readonly_strings -DL_ENDIAN -DTERMIO::-D_REENTRANT:::SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL:${alpha_asm}", +# +# TI_CGT_C6000_7.3.x is a requirement +"linux-c64xplus","cl6x:--linux --strip_coff_underscore -ea=.s -eo=.o 
-mv6400+ -o2 -ox -ms -pden -DOPENSSL_SMALL_FOOTPRINT::-D_REENTRANT:::BN_LLONG:c64xpluscpuid.o:bn-c64xplus.o c64xplus-gf2m.o::aes-c64xplus.o aes_cbc.o aes_ctr.o:::sha1-c64xplus.o sha256-c64xplus.o sha512-c64xplus.o:::::::ghash-c64xplus.o::void:dlfcn:linux-shared:--pic:-z --sysv --shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):true", # Android: linux-* but without -DTERMIO and pointers to headers and libs. "android","gcc:-mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", diff --git a/TABLE b/TABLE index 74c891ac93..d5f0b1fc99 100644 --- a/TABLE +++ b/TABLE @@ -3927,6 +3927,39 @@ $ranlib = $arflags = $multilib = +*** linux-c64xplus +$cc = cl6x +$cflags = --linux --strip_coff_underscore -ea=.s -eo=.o -mv6400+ -o2 -ox -ms -pden -DOPENSSL_SMALL_FOOTPRINT +$unistd = +$thread_cflag = -D_REENTRANT +$sys_id = +$lflags = +$bn_ops = BN_LLONG +$cpuid_obj = c64xpluscpuid.o +$bn_obj = bn-c64xplus.o c64xplus-gf2m.o +$des_obj = +$aes_obj = aes-c64xplus.o aes_cbc.o aes_ctr.o +$bf_obj = +$md5_obj = +$sha1_obj = sha1-c64xplus.o sha256-c64xplus.o sha512-c64xplus.o +$cast_obj = +$rc4_obj = +$rmd160_obj = +$rc5_obj = +$wp_obj = +$cmll_obj = +$modes_obj = ghash-c64xplus.o +$engines_obj = +$perlasm_scheme = void +$dso_scheme = dlfcn +$shared_target= linux-shared +$shared_cflag = --pic +$shared_ldflag = -z --sysv --shared +$shared_extension = .so.$(SHLIB_MAJOR).$(SHLIB_MINOR) +$ranlib = true +$arflags = +$multilib = + *** linux-elf $cc = gcc $cflags = -DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall diff --git a/crypto/aes/asm/aes-c64xplus.pl b/crypto/aes/asm/aes-c64xplus.pl new file mode 100644 index 0000000000..ad0c15a36f --- /dev/null +++ b/crypto/aes/asm/aes-c64xplus.pl @@ -0,0 +1,1361 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy 
Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# [Endian-neutral] AES for C64x+. +# +# Even though SPLOOPs are scheduled for 13 cycles, and thus expected +# performance is ~8.5 cycles per byte processed with 128-bit key, +# measured performance turned out to be ~10 cycles per byte. Discrepancy +# must be caused by limitations of L1D memory banking(*), see SPRU871 +# TI publication for further details. If it's any consolation, it's still +# ~20% faster than TI's linear assembly module anyway... Compared to +# aes_core.c compiled with cl6x 6.0 with -mv6400+ -o2 options this +# code is 3.75x faster and almost 3x smaller (tables included). +# +# (*) This means that there might be a subtle correlation between data +# and timing and one can wonder if it can be ... attacked:-( +# On the other hand this also means that *if* one chooses to +# implement *4* T-tables variant [instead of 1 T-table as in +# this implementation, or in addition to], then one ought to +# *interleave* them. Even though it complicates addressing, +# references to interleaved tables would be guaranteed not to +# clash. I reckon that it should be possible to break 8 cycles +# per byte "barrier," i.e. improve by ~20%, naturally at the +# cost of 8x increased pressure on L1D. 8x because you'd have +# to interleave both Te and Td tables... 
+ +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +($TEA,$TEB)=("A5","B5"); +($KPA,$KPB)=("A3","B1"); +@K=("A6","B6","A7","B7"); +@s=("A8","B8","A9","B9"); +@Te0=@Td0=("A16","B16","A17","B17"); +@Te1=@Td1=("A18","B18","A19","B19"); +@Te2=@Td2=("A20","B20","A21","B21"); +@Te3=@Td3=("A22","B22","A23","B23"); + +$code=<<___; + .text + .if __TI_EABI__ + .nocmp + .endif + + .asg B3,RA + .asg A4,INP + .asg B4,OUT + .asg A6,KEY + .asg A4,RET + .asg B15,SP + + .eval 24,EXT0 + .eval 16,EXT1 + .eval 8,EXT2 + .eval 0,EXT3 + .eval 8,TBL1 + .eval 16,TBL2 + .eval 24,TBL3 + + .if .BIG_ENDIAN + .eval 24-EXT0,EXT0 + .eval 24-EXT1,EXT1 + .eval 24-EXT2,EXT2 + .eval 24-EXT3,EXT3 + .eval 32-TBL1,TBL1 + .eval 32-TBL2,TBL2 + .eval 32-TBL3,TBL3 + .endif + + .global _AES_encrypt +_AES_encrypt: + .asmfunc + MVK 1,B2 +__encrypt: + .if __TI_EABI__ + [B2] LDNDW *INP++,A9:A8 ; load input +|| MVKL \$PCR_OFFSET(AES_Te,_AES_encrypt),$TEA +|| ADDKPC _AES_encrypt,B0 + [B2] LDNDW *INP++,B9:B8 +|| MVKH \$PCR_OFFSET(AES_Te,_AES_encrypt),$TEA +|| ADD 0,KEY,$KPA +|| ADD 4,KEY,$KPB + .else + [B2] LDNDW *INP++,A9:A8 ; load input +|| MVKL (AES_Te-_AES_encrypt),$TEA +|| ADDKPC _AES_encrypt,B0 + [B2] LDNDW *INP++,B9:B8 +|| MVKH (AES_Te-_AES_encrypt),$TEA +|| ADD 0,KEY,$KPA +|| ADD 4,KEY,$KPB + .endif + LDW *$KPA++[2],$Te0[0] ; zero round key +|| LDW *$KPB++[2],$Te0[1] +|| MVK 60,A0 +|| ADD B0,$TEA,$TEA ; AES_Te + LDW *KEY[A0],B0 ; rounds +|| MVK 1024,A0 ; sizeof(AES_Te) + LDW *$KPA++[2],$Te0[2] +|| LDW *$KPB++[2],$Te0[3] +|| MV $TEA,$TEB + NOP + .if .BIG_ENDIAN + MV A9,$s[0] +|| MV A8,$s[1] +|| MV B9,$s[2] +|| MV B8,$s[3] + .else + MV A8,$s[0] +|| MV A9,$s[1] +|| MV B8,$s[2] +|| MV B9,$s[3] + .endif + XOR $Te0[0],$s[0],$s[0] +|| XOR $Te0[1],$s[1],$s[1] +|| LDW *$KPA++[2],$K[0] ; 1st round key +|| LDW *$KPB++[2],$K[1] + SUB B0,2,B0 + + SPLOOPD 13 +|| MVC B0,ILC +|| LDW *$KPA++[2],$K[2] +|| LDW *$KPB++[2],$K[3] 
+;;==================================================================== + EXTU $s[1],EXT1,24,$Te1[1] +|| EXTU $s[0],EXT3,24,$Te3[0] + LDW *${TEB}[$Te1[1]],$Te1[1] ; Te1[s1>>8], t0 +|| LDW *${TEA}[$Te3[0]],$Te3[0] ; Te3[s0>>24], t1 +|| XOR $s[2],$Te0[2],$s[2] ; modulo-scheduled +|| XOR $s[3],$Te0[3],$s[3] ; modulo-scheduled +|| EXTU $s[1],EXT3,24,$Te3[1] +|| EXTU $s[0],EXT1,24,$Te1[0] + LDW *${TEB}[$Te3[1]],$Te3[1] ; Te3[s1>>24], t2 +|| LDW *${TEA}[$Te1[0]],$Te1[0] ; Te1[s0>>8], t3 +|| EXTU $s[2],EXT2,24,$Te2[2] +|| EXTU $s[3],EXT2,24,$Te2[3] + LDW *${TEA}[$Te2[2]],$Te2[2] ; Te2[s2>>16], t0 +|| LDW *${TEB}[$Te2[3]],$Te2[3] ; Te2[s3>>16], t1 +|| EXTU $s[3],EXT3,24,$Te3[3] +|| EXTU $s[2],EXT1,24,$Te1[2] + LDW *${TEB}[$Te3[3]],$Te3[3] ; Te3[s3>>24], t0 +|| LDW *${TEA}[$Te1[2]],$Te1[2] ; Te1[s2>>8], t1 +|| EXTU $s[0],EXT2,24,$Te2[0] +|| EXTU $s[1],EXT2,24,$Te2[1] + LDW *${TEA}[$Te2[0]],$Te2[0] ; Te2[s0>>16], t2 +|| LDW *${TEB}[$Te2[1]],$Te2[1] ; Te2[s1>>16], t3 +|| EXTU $s[3],EXT1,24,$Te1[3] +|| EXTU $s[2],EXT3,24,$Te3[2] + LDW *${TEB}[$Te1[3]],$Te1[3] ; Te1[s3>>8], t2 +|| LDW *${TEA}[$Te3[2]],$Te3[2] ; Te3[s2>>24], t3 +|| ROTL $Te1[1],TBL1,$Te3[0] ; t0 +|| ROTL $Te3[0],TBL3,$Te1[1] ; t1 +|| EXTU $s[0],EXT0,24,$Te0[0] +|| EXTU $s[1],EXT0,24,$Te0[1] + LDW *${TEA}[$Te0[0]],$Te0[0] ; Te0[s0], t0 +|| LDW *${TEB}[$Te0[1]],$Te0[1] ; Te0[s1], t1 +|| ROTL $Te3[1],TBL3,$Te1[0] ; t2 +|| ROTL $Te1[0],TBL1,$Te3[1] ; t3 +|| EXTU $s[2],EXT0,24,$Te0[2] +|| EXTU $s[3],EXT0,24,$Te0[3] + LDW *${TEA}[$Te0[2]],$Te0[2] ; Te0[s2], t2 +|| LDW *${TEB}[$Te0[3]],$Te0[3] ; Te0[s3], t3 +|| ROTL $Te2[2],TBL2,$Te2[2] ; t0 +|| ROTL $Te2[3],TBL2,$Te2[3] ; t1 +|| XOR $K[0],$Te3[0],$s[0] +|| XOR $K[1],$Te1[1],$s[1] + ROTL $Te3[3],TBL3,$Te1[2] ; t0 +|| ROTL $Te1[2],TBL1,$Te3[3] ; t1 +|| XOR $K[2],$Te1[0],$s[2] +|| XOR $K[3],$Te3[1],$s[3] +|| LDW *$KPA++[2],$K[0] ; next round key +|| LDW *$KPB++[2],$K[1] + ROTL $Te2[0],TBL2,$Te2[0] ; t2 +|| ROTL $Te2[1],TBL2,$Te2[1] ; t3 +|| XOR $s[0],$Te2[2],$s[0] +|| 
XOR $s[1],$Te2[3],$s[1] +|| LDW *$KPA++[2],$K[2] +|| LDW *$KPB++[2],$K[3] + ROTL $Te1[3],TBL1,$Te3[2] ; t2 +|| ROTL $Te3[2],TBL3,$Te1[3] ; t3 +|| XOR $s[0],$Te1[2],$s[0] +|| XOR $s[1],$Te3[3],$s[1] + XOR $s[2],$Te2[0],$s[2] +|| XOR $s[3],$Te2[1],$s[3] +|| XOR $s[0],$Te0[0],$s[0] +|| XOR $s[1],$Te0[1],$s[1] + SPKERNEL +|| XOR.L $s[2],$Te3[2],$s[2] +|| XOR.L $s[3],$Te1[3],$s[3] +;;==================================================================== + ADD.D ${TEA},A0,${TEA} ; point to Te4 +|| ADD.D ${TEB},A0,${TEB} +|| EXTU $s[1],EXT1,24,$Te1[1] +|| EXTU $s[0],EXT3,24,$Te3[0] + LDBU *${TEB}[$Te1[1]],$Te1[1] ; Te1[s1>>8], t0 +|| LDBU *${TEA}[$Te3[0]],$Te3[0] ; Te3[s0>>24], t1 +|| XOR $s[2],$Te0[2],$s[2] ; modulo-scheduled +|| XOR $s[3],$Te0[3],$s[3] ; modulo-scheduled +|| EXTU $s[0],EXT0,24,$Te0[0] +|| EXTU $s[1],EXT0,24,$Te0[1] + LDBU *${TEA}[$Te0[0]],$Te0[0] ; Te0[s0], t0 +|| LDBU *${TEB}[$Te0[1]],$Te0[1] ; Te0[s1], t1 +|| EXTU $s[3],EXT3,24,$Te3[3] +|| EXTU $s[2],EXT1,24,$Te1[2] + LDBU *${TEB}[$Te3[3]],$Te3[3] ; Te3[s3>>24], t0 +|| LDBU *${TEA}[$Te1[2]],$Te1[2] ; Te1[s2>>8], t1 +|| EXTU $s[2],EXT2,24,$Te2[2] +|| EXTU $s[3],EXT2,24,$Te2[3] + LDBU *${TEA}[$Te2[2]],$Te2[2] ; Te2[s2>>16], t0 +|| LDBU *${TEB}[$Te2[3]],$Te2[3] ; Te2[s3>>16], t1 +|| EXTU $s[1],EXT3,24,$Te3[1] +|| EXTU $s[0],EXT1,24,$Te1[0] + LDBU *${TEB}[$Te3[1]],$Te3[1] ; Te3[s1>>24], t2 +|| LDBU *${TEA}[$Te1[0]],$Te1[0] ; Te1[s0>>8], t3 +|| EXTU $s[3],EXT1,24,$Te1[3] +|| EXTU $s[2],EXT3,24,$Te3[2] + LDBU *${TEB}[$Te1[3]],$Te1[3] ; Te1[s3>>8], t2 +|| LDBU *${TEA}[$Te3[2]],$Te3[2] ; Te3[s2>>24], t3 +|| EXTU $s[2],EXT0,24,$Te0[2] +|| EXTU $s[3],EXT0,24,$Te0[3] + LDBU *${TEA}[$Te0[2]],$Te0[2] ; Te0[s2], t2 +|| LDBU *${TEB}[$Te0[3]],$Te0[3] ; Te0[s3], t3 +|| EXTU $s[0],EXT2,24,$Te2[0] +|| EXTU $s[1],EXT2,24,$Te2[1] + LDBU *${TEA}[$Te2[0]],$Te2[0] ; Te2[s0>>16], t2 +|| LDBU *${TEB}[$Te2[1]],$Te2[1] ; Te2[s1>>16], t3 + + .if .BIG_ENDIAN + PACK2 $Te0[0],$Te1[1],$Te0[0] +|| PACK2 $Te0[1],$Te1[2],$Te0[1] + PACK2 
$Te2[2],$Te3[3],$Te2[2] +|| PACK2 $Te2[3],$Te3[0],$Te2[3] + PACKL4 $Te0[0],$Te2[2],$Te0[0] +|| PACKL4 $Te0[1],$Te2[3],$Te0[1] + XOR $K[0],$Te0[0],$Te0[0] ; s[0] +|| XOR $K[1],$Te0[1],$Te0[1] ; s[1] + + PACK2 $Te0[2],$Te1[3],$Te0[2] +|| PACK2 $Te0[3],$Te1[0],$Te0[3] + PACK2 $Te2[0],$Te3[1],$Te2[0] +|| PACK2 $Te2[1],$Te3[2],$Te2[1] +|| BNOP RA + PACKL4 $Te0[2],$Te2[0],$Te0[2] +|| PACKL4 $Te0[3],$Te2[1],$Te0[3] + XOR $K[2],$Te0[2],$Te0[2] ; s[2] +|| XOR $K[3],$Te0[3],$Te0[3] ; s[3] + + MV $Te0[0],A9 +|| MV $Te0[1],A8 + MV $Te0[2],B9 +|| MV $Te0[3],B8 +|| [B2] STNDW A9:A8,*OUT++ + [B2] STNDW B9:B8,*OUT++ + .else + PACK2 $Te1[1],$Te0[0],$Te1[1] +|| PACK2 $Te1[2],$Te0[1],$Te1[2] + PACK2 $Te3[3],$Te2[2],$Te3[3] +|| PACK2 $Te3[0],$Te2[3],$Te3[0] + PACKL4 $Te3[3],$Te1[1],$Te1[1] +|| PACKL4 $Te3[0],$Te1[2],$Te1[2] + XOR $K[0],$Te1[1],$Te1[1] ; s[0] +|| XOR $K[1],$Te1[2],$Te1[2] ; s[1] + + PACK2 $Te1[3],$Te0[2],$Te1[3] +|| PACK2 $Te1[0],$Te0[3],$Te1[0] + PACK2 $Te3[1],$Te2[0],$Te3[1] +|| PACK2 $Te3[2],$Te2[1],$Te3[2] +|| BNOP RA + PACKL4 $Te3[1],$Te1[3],$Te1[3] +|| PACKL4 $Te3[2],$Te1[0],$Te1[0] + XOR $K[2],$Te1[3],$Te1[3] ; s[2] +|| XOR $K[3],$Te1[0],$Te1[0] ; s[3] + + MV $Te1[1],A8 +|| MV $Te1[2],A9 + MV $Te1[3],B8 +|| MV $Te1[0],B9 +|| [B2] STNDW A9:A8,*OUT++ + [B2] STNDW B9:B8,*OUT++ + .endif + .endasmfunc + + .global _AES_decrypt +_AES_decrypt: + .asmfunc + MVK 1,B2 +__decrypt: + .if __TI_EABI__ + [B2] LDNDW *INP++,A9:A8 ; load input +|| MVKL \$PCR_OFFSET(AES_Td,_AES_decrypt),$TEA +|| ADDKPC _AES_decrypt,B0 + [B2] LDNDW *INP++,B9:B8 +|| MVKH \$PCR_OFFSET(AES_Td,_AES_decrypt),$TEA +|| ADD 0,KEY,$KPA +|| ADD 4,KEY,$KPB + .else + [B2] LDNDW *INP++,A9:A8 ; load input +|| MVKL (AES_Td-_AES_decrypt),$TEA +|| ADDKPC _AES_decrypt,B0 + [B2] LDNDW *INP++,B9:B8 +|| MVKH (AES_Td-_AES_decrypt),$TEA +|| ADD 0,KEY,$KPA +|| ADD 4,KEY,$KPB + .endif + LDW *$KPA++[2],$Td0[0] ; zero round key +|| LDW *$KPB++[2],$Td0[1] +|| MVK 60,A0 +|| ADD B0,$TEA,$TEA ; AES_Td + LDW *KEY[A0],B0 ; rounds 
+|| MVK 1024,A0 ; sizeof(AES_Td) + LDW *$KPA++[2],$Td0[2] +|| LDW *$KPB++[2],$Td0[3] +|| MV $TEA,$TEB + NOP + .if .BIG_ENDIAN + MV A9,$s[0] +|| MV A8,$s[1] +|| MV B9,$s[2] +|| MV B8,$s[3] + .else + MV A8,$s[0] +|| MV A9,$s[1] +|| MV B8,$s[2] +|| MV B9,$s[3] + .endif + XOR $Td0[0],$s[0],$s[0] +|| XOR $Td0[1],$s[1],$s[1] +|| LDW *$KPA++[2],$K[0] ; 1st round key +|| LDW *$KPB++[2],$K[1] + SUB B0,2,B0 + + SPLOOPD 13 +|| MVC B0,ILC +|| LDW *$KPA++[2],$K[2] +|| LDW *$KPB++[2],$K[3] +;;==================================================================== + EXTU $s[1],EXT3,24,$Td3[1] +|| EXTU $s[0],EXT1,24,$Td1[0] + LDW *${TEB}[$Td3[1]],$Td3[1] ; Td3[s1>>24], t0 +|| LDW *${TEA}[$Td1[0]],$Td1[0] ; Td1[s0>>8], t1 +|| XOR $s[2],$Td0[2],$s[2] ; modulo-scheduled +|| XOR $s[3],$Td0[3],$s[3] ; modulo-scheduled +|| EXTU $s[1],EXT1,24,$Td1[1] +|| EXTU $s[0],EXT3,24,$Td3[0] + LDW *${TEB}[$Td1[1]],$Td1[1] ; Td1[s1>>8], t2 +|| LDW *${TEA}[$Td3[0]],$Td3[0] ; Td3[s0>>24], t3 +|| EXTU $s[2],EXT2,24,$Td2[2] +|| EXTU $s[3],EXT2,24,$Td2[3] + LDW *${TEA}[$Td2[2]],$Td2[2] ; Td2[s2>>16], t0 +|| LDW *${TEB}[$Td2[3]],$Td2[3] ; Td2[s3>>16], t1 +|| EXTU $s[3],EXT1,24,$Td1[3] +|| EXTU $s[2],EXT3,24,$Td3[2] + LDW *${TEB}[$Td1[3]],$Td1[3] ; Td1[s3>>8], t0 +|| LDW *${TEA}[$Td3[2]],$Td3[2] ; Td3[s2>>24], t1 +|| EXTU $s[0],EXT2,24,$Td2[0] +|| EXTU $s[1],EXT2,24,$Td2[1] + LDW *${TEA}[$Td2[0]],$Td2[0] ; Td2[s0>>16], t2 +|| LDW *${TEB}[$Td2[1]],$Td2[1] ; Td2[s1>>16], t3 +|| EXTU $s[3],EXT3,24,$Td3[3] +|| EXTU $s[2],EXT1,24,$Td1[2] + LDW *${TEB}[$Td3[3]],$Td3[3] ; Td3[s3>>24], t2 +|| LDW *${TEA}[$Td1[2]],$Td1[2] ; Td1[s2>>8], t3 +|| ROTL $Td3[1],TBL3,$Td1[0] ; t0 +|| ROTL $Td1[0],TBL1,$Td3[1] ; t1 +|| EXTU $s[0],EXT0,24,$Td0[0] +|| EXTU $s[1],EXT0,24,$Td0[1] + LDW *${TEA}[$Td0[0]],$Td0[0] ; Td0[s0], t0 +|| LDW *${TEB}[$Td0[1]],$Td0[1] ; Td0[s1], t1 +|| ROTL $Td1[1],TBL1,$Td3[0] ; t2 +|| ROTL $Td3[0],TBL3,$Td1[1] ; t3 +|| EXTU $s[2],EXT0,24,$Td0[2] +|| EXTU $s[3],EXT0,24,$Td0[3] + LDW *${TEA}[$Td0[2]],$Td0[2] 
; Td0[s2], t2 +|| LDW *${TEB}[$Td0[3]],$Td0[3] ; Td0[s3], t3 +|| ROTL $Td2[2],TBL2,$Td2[2] ; t0 +|| ROTL $Td2[3],TBL2,$Td2[3] ; t1 +|| XOR $K[0],$Td1[0],$s[0] +|| XOR $K[1],$Td3[1],$s[1] + ROTL $Td1[3],TBL1,$Td3[2] ; t0 +|| ROTL $Td3[2],TBL3,$Td1[3] ; t1 +|| XOR $K[2],$Td3[0],$s[2] +|| XOR $K[3],$Td1[1],$s[3] +|| LDW *$KPA++[2],$K[0] ; next round key +|| LDW *$KPB++[2],$K[1] + ROTL $Td2[0],TBL2,$Td2[0] ; t2 +|| ROTL $Td2[1],TBL2,$Td2[1] ; t3 +|| XOR $s[0],$Td2[2],$s[0] +|| XOR $s[1],$Td2[3],$s[1] +|| LDW *$KPA++[2],$K[2] +|| LDW *$KPB++[2],$K[3] + ROTL $Td3[3],TBL3,$Td1[2] ; t2 +|| ROTL $Td1[2],TBL1,$Td3[3] ; t3 +|| XOR $s[0],$Td3[2],$s[0] +|| XOR $s[1],$Td1[3],$s[1] + XOR $s[2],$Td2[0],$s[2] +|| XOR $s[3],$Td2[1],$s[3] +|| XOR $s[0],$Td0[0],$s[0] +|| XOR $s[1],$Td0[1],$s[1] + SPKERNEL +|| XOR.L $s[2],$Td1[2],$s[2] +|| XOR.L $s[3],$Td3[3],$s[3] +;;==================================================================== + ADD.D ${TEA},A0,${TEA} ; point to Td4 +|| ADD.D ${TEB},A0,${TEB} +|| EXTU $s[1],EXT3,24,$Td3[1] +|| EXTU $s[0],EXT1,24,$Td1[0] + LDBU *${TEB}[$Td3[1]],$Td3[1] ; Td3[s1>>24], t0 +|| LDBU *${TEA}[$Td1[0]],$Td1[0] ; Td1[s0>>8], t1 +|| XOR $s[2],$Td0[2],$s[2] ; modulo-scheduled +|| XOR $s[3],$Td0[3],$s[3] ; modulo-scheduled +|| EXTU $s[0],EXT0,24,$Td0[0] +|| EXTU $s[1],EXT0,24,$Td0[1] + LDBU *${TEA}[$Td0[0]],$Td0[0] ; Td0[s0], t0 +|| LDBU *${TEB}[$Td0[1]],$Td0[1] ; Td0[s1], t1 +|| EXTU $s[2],EXT2,24,$Td2[2] +|| EXTU $s[3],EXT2,24,$Td2[3] + LDBU *${TEA}[$Td2[2]],$Td2[2] ; Td2[s2>>16], t0 +|| LDBU *${TEB}[$Td2[3]],$Td2[3] ; Td2[s3>>16], t1 +|| EXTU $s[3],EXT1,24,$Td1[3] +|| EXTU $s[2],EXT3,24,$Td3[2] + LDBU *${TEB}[$Td1[3]],$Td1[3] ; Td1[s3>>8], t0 +|| LDBU *${TEA}[$Td3[2]],$Td3[2] ; Td3[s2>>24], t1 +|| EXTU $s[1],EXT1,24,$Td1[1] +|| EXTU $s[0],EXT3,24,$Td3[0] + LDBU *${TEB}[$Td1[1]],$Td1[1] ; Td1[s1>>8], t2 +|| LDBU *${TEA}[$Td3[0]],$Td3[0] ; Td3[s0>>24], t3 +|| EXTU $s[0],EXT2,24,$Td2[0] +|| EXTU $s[1],EXT2,24,$Td2[1] + LDBU *${TEA}[$Td2[0]],$Td2[0] ; 
Td2[s0>>16], t2 +|| LDBU *${TEB}[$Td2[1]],$Td2[1] ; Td2[s1>>16], t3 +|| EXTU $s[3],EXT3,24,$Td3[3] +|| EXTU $s[2],EXT1,24,$Td1[2] + LDBU *${TEB}[$Td3[3]],$Td3[3] ; Td3[s3>>24], t2 +|| LDBU *${TEA}[$Td1[2]],$Td1[2] ; Td1[s2>>8], t3 +|| EXTU $s[2],EXT0,24,$Td0[2] +|| EXTU $s[3],EXT0,24,$Td0[3] + LDBU *${TEA}[$Td0[2]],$Td0[2] ; Td0[s2], t2 +|| LDBU *${TEB}[$Td0[3]],$Td0[3] ; Td0[s3], t3 + + .if .BIG_ENDIAN + PACK2 $Td0[0],$Td1[3],$Td0[0] +|| PACK2 $Td0[1],$Td1[0],$Td0[1] + PACK2 $Td2[2],$Td3[1],$Td2[2] +|| PACK2 $Td2[3],$Td3[2],$Td2[3] + PACKL4 $Td0[0],$Td2[2],$Td0[0] +|| PACKL4 $Td0[1],$Td2[3],$Td0[1] + XOR $K[0],$Td0[0],$Td0[0] ; s[0] +|| XOR $K[1],$Td0[1],$Td0[1] ; s[1] + + PACK2 $Td0[2],$Td1[1],$Td0[2] +|| PACK2 $Td0[3],$Td1[2],$Td0[3] + PACK2 $Td2[0],$Td3[3],$Td2[0] +|| PACK2 $Td2[1],$Td3[0],$Td2[1] +|| BNOP RA + PACKL4 $Td0[2],$Td2[0],$Td0[2] +|| PACKL4 $Td0[3],$Td2[1],$Td0[3] + XOR $K[2],$Td0[2],$Td0[2] ; s[2] +|| XOR $K[3],$Td0[3],$Td0[3] ; s[3] + + MV $Td0[0],A9 +|| MV $Td0[1],A8 + MV $Td0[2],B9 +|| MV $Td0[3],B8 +|| [B2] STNDW A9:A8,*OUT++ + [B2] STNDW B9:B8,*OUT++ + .else + PACK2 $Td1[3],$Td0[0],$Td1[3] +|| PACK2 $Td1[0],$Td0[1],$Td1[0] + PACK2 $Td3[1],$Td2[2],$Td3[1] +|| PACK2 $Td3[2],$Td2[3],$Td3[2] + PACKL4 $Td3[1],$Td1[3],$Td1[3] +|| PACKL4 $Td3[2],$Td1[0],$Td1[0] + XOR $K[0],$Td1[3],$Td1[3] ; s[0] +|| XOR $K[1],$Td1[0],$Td1[0] ; s[1] + + PACK2 $Td1[1],$Td0[2],$Td1[1] +|| PACK2 $Td1[2],$Td0[3],$Td1[2] + PACK2 $Td3[3],$Td2[0],$Td3[3] +|| PACK2 $Td3[0],$Td2[1],$Td3[0] +|| BNOP RA + PACKL4 $Td3[3],$Td1[1],$Td1[1] +|| PACKL4 $Td3[0],$Td1[2],$Td1[2] + XOR $K[2],$Td1[1],$Td1[1] ; s[2] +|| XOR $K[3],$Td1[2],$Td1[2] ; s[3] + + MV $Td1[3],A8 +|| MV $Td1[0],A9 + MV $Td1[1],B8 +|| MV $Td1[2],B9 +|| [B2] STNDW A9:A8,*OUT++ + [B2] STNDW B9:B8,*OUT++ + .endif + .endasmfunc +___ +{ +my @K=(@K,@s); # extended key +my @Te4=map("B$_",(16..19)); + +my @Kx9=@Te0; # used in AES_set_decrypt_key +my @KxB=@Te1; +my @KxD=@Te2; +my @KxE=@Te3; + +$code.=<<___; + .asg OUT,BITS + + 
.global _AES_set_encrypt_key +_AES_set_encrypt_key: +__set_encrypt_key: + .asmfunc + MV INP,A0 +|| SHRU BITS,5,BITS ; 128-192-256 -> 4-6-8 +|| MV KEY,A1 + [!A0] B RA +||[!A0] MVK -1,RET +||[!A0] MVK 1,A1 ; only one B RA + [!A1] B RA +||[!A1] MVK -1,RET +||[!A1] MVK 0,A0 +|| MVK 0,B0 +|| MVK 0,A1 + [A0] LDNDW *INP++,A9:A8 +|| [A0] CMPEQ 4,BITS,B0 +|| [A0] CMPLT 3,BITS,A1 + [B0] B key128? +|| [A1] LDNDW *INP++,B9:B8 +|| [A0] CMPEQ 6,BITS,B0 +|| [A0] CMPLT 5,BITS,A1 + [B0] B key192? +|| [A1] LDNDW *INP++,B17:B16 +|| [A0] CMPEQ 8,BITS,B0 +|| [A0] CMPLT 7,BITS,A1 + [B0] B key256? +|| [A1] LDNDW *INP++,B19:B18 + + .if __TI_EABI__ + [A0] ADD 0,KEY,$KPA +|| [A0] ADD 4,KEY,$KPB +|| [A0] MVKL \$PCR_OFFSET(AES_Te4,_AES_set_encrypt_key),$TEA +|| [A0] ADDKPC _AES_set_encrypt_key,B6 + [A0] MVKH \$PCR_OFFSET(AES_Te4,_AES_set_encrypt_key),$TEA + [A0] ADD B6,$TEA,$TEA ; AES_Te4 + .else + [A0] ADD 0,KEY,$KPA +|| [A0] ADD 4,KEY,$KPB +|| [A0] MVKL (AES_Te4-_AES_set_encrypt_key),$TEA +|| [A0] ADDKPC _AES_set_encrypt_key,B6 + [A0] MVKH (AES_Te4-_AES_set_encrypt_key),$TEA + [A0] ADD B6,$TEA,$TEA ; AES_Te4 + .endif + NOP + NOP + + BNOP RA,5 +|| MVK -2,RET ; unknown bit lenght +|| MVK 0,B0 ; redundant +;;==================================================================== +;;==================================================================== +key128?: + .if .BIG_ENDIAN + MV A9,$K[0] +|| MV A8,$K[1] +|| MV B9,$Te4[2] +|| MV B8,$K[3] + .else + MV A8,$K[0] +|| MV A9,$K[1] +|| MV B8,$Te4[2] +|| MV B9,$K[3] + .endif + + MVK 256,A0 +|| MVK 9,B0 + + SPLOOPD 14 +|| MVC B0,ILC +|| MV $TEA,$TEB +|| ADD $TEA,A0,A30 ; rcon +;;==================================================================== + LDW *A30++[1],A31 ; rcon[i] +|| MV $Te4[2],$K[2] +|| EXTU $K[3],EXT1,24,$Te4[0] + LDBU *${TEB}[$Te4[0]],$Te4[0] +|| MV $K[3],A0 +|| EXTU $K[3],EXT2,24,$Te4[1] + LDBU *${TEB}[$Te4[1]],$Te4[1] +|| EXTU A0,EXT3,24,A0 +|| EXTU $K[3],EXT0,24,$Te4[3] + .if .BIG_ENDIAN + LDBU *${TEA}[A0],$Te4[3] +|| LDBU 
*${TEB}[$Te4[3]],A0 + .else + LDBU *${TEA}[A0],A0 +|| LDBU *${TEB}[$Te4[3]],$Te4[3] + .endif + + STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + + XOR A31,$K[0],$K[0] ; ^=rcon[i] + .if .BIG_ENDIAN + PACK2 $Te4[0],$Te4[1],$Te4[1] + PACK2 $Te4[3],A0,$Te4[3] + PACKL4 $Te4[1],$Te4[3],$Te4[3] + .else + PACK2 $Te4[1],$Te4[0],$Te4[1] + PACK2 $Te4[3],A0,$Te4[3] + PACKL4 $Te4[3],$Te4[1],$Te4[3] + .endif + XOR $Te4[3],$K[0],$Te4[0] ; K[0] + XOR $Te4[0],$K[1],$K[1] ; K[1] + MV $Te4[0],$K[0] +|| XOR $K[1],$K[2],$Te4[2] ; K[2] + XOR $Te4[2],$K[3],$K[3] ; K[3] + SPKERNEL +;;==================================================================== + BNOP RA + MV $Te4[2],$K[2] +|| STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + MVK 10,B0 ; rounds + STW B0,*++${KPB}[15] + MVK 0,RET +;;==================================================================== +;;==================================================================== +key192?: + .if .BIG_ENDIAN + MV A9,$K[0] +|| MV A8,$K[1] +|| MV B9,$K[2] +|| MV B8,$K[3] + MV B17,$Te4[2] +|| MV B16,$K[5] + .else + MV A8,$K[0] +|| MV A9,$K[1] +|| MV B8,$K[2] +|| MV B9,$K[3] + MV B16,$Te4[2] +|| MV B17,$K[5] + .endif + + MVK 256,A0 +|| MVK 6,B0 + MV $TEA,$TEB +|| ADD $TEA,A0,A30 ; rcon +;;==================================================================== +loop192?: + LDW *A30++[1],A31 ; rcon[i] +|| MV $Te4[2],$K[4] +|| EXTU $K[5],EXT1,24,$Te4[0] + LDBU *${TEB}[$Te4[0]],$Te4[0] +|| MV $K[5],A0 +|| EXTU $K[5],EXT2,24,$Te4[1] + LDBU *${TEB}[$Te4[1]],$Te4[1] +|| EXTU A0,EXT3,24,A0 +|| EXTU $K[5],EXT0,24,$Te4[3] + .if .BIG_ENDIAN + LDBU *${TEA}[A0],$Te4[3] +|| LDBU *${TEB}[$Te4[3]],A0 + .else + LDBU *${TEA}[A0],A0 +|| LDBU *${TEB}[$Te4[3]],$Te4[3] + .endif + + STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + STW $K[4],*$KPA++[2] +|| STW $K[5],*$KPB++[2] + + XOR A31,$K[0],$K[0] ; ^=rcon[i] + .if .BIG_ENDIAN + 
PACK2 $Te4[0],$Te4[1],$Te4[1] +|| PACK2 $Te4[3],A0,$Te4[3] + PACKL4 $Te4[1],$Te4[3],$Te4[3] + .else + PACK2 $Te4[1],$Te4[0],$Te4[1] +|| PACK2 $Te4[3],A0,$Te4[3] + PACKL4 $Te4[3],$Te4[1],$Te4[3] + .endif + BDEC loop192?,B0 +|| XOR $Te4[3],$K[0],$Te4[0] ; K[0] + XOR $Te4[0],$K[1],$K[1] ; K[1] + MV $Te4[0],$K[0] +|| XOR $K[1],$K[2],$Te4[2] ; K[2] + XOR $Te4[2],$K[3],$K[3] ; K[3] + MV $Te4[2],$K[2] +|| XOR $K[3],$K[4],$Te4[2] ; K[4] + XOR $Te4[2],$K[5],$K[5] ; K[5] +;;==================================================================== + BNOP RA + STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + MVK 12,B0 ; rounds + STW B0,*++${KPB}[7] + MVK 0,RET +;;==================================================================== +;;==================================================================== +key256?: + .if .BIG_ENDIAN + MV A9,$K[0] +|| MV A8,$K[1] +|| MV B9,$K[2] +|| MV B8,$K[3] + MV B17,$K[4] +|| MV B16,$K[5] +|| MV B19,$Te4[2] +|| MV B18,$K[7] + .else + MV A8,$K[0] +|| MV A9,$K[1] +|| MV B8,$K[2] +|| MV B9,$K[3] + MV B16,$K[4] +|| MV B17,$K[5] +|| MV B18,$Te4[2] +|| MV B19,$K[7] + .endif + + MVK 256,A0 +|| MVK 6,B0 + MV $TEA,$TEB +|| ADD $TEA,A0,A30 ; rcon +;;==================================================================== +loop256?: + LDW *A30++[1],A31 ; rcon[i] +|| MV $Te4[2],$K[6] +|| EXTU $K[7],EXT1,24,$Te4[0] + LDBU *${TEB}[$Te4[0]],$Te4[0] +|| MV $K[7],A0 +|| EXTU $K[7],EXT2,24,$Te4[1] + LDBU *${TEB}[$Te4[1]],$Te4[1] +|| EXTU A0,EXT3,24,A0 +|| EXTU $K[7],EXT0,24,$Te4[3] + .if .BIG_ENDIAN + LDBU *${TEA}[A0],$Te4[3] +|| LDBU *${TEB}[$Te4[3]],A0 + .else + LDBU *${TEA}[A0],A0 +|| LDBU *${TEB}[$Te4[3]],$Te4[3] + .endif + + STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + STW $K[4],*$KPA++[2] +|| STW $K[5],*$KPB++[2] + STW $K[6],*$KPA++[2] +|| STW $K[7],*$KPB++[2] +|| XOR A31,$K[0],$K[0] ; ^=rcon[i] + .if .BIG_ENDIAN + PACK2 $Te4[0],$Te4[1],$Te4[1] +|| PACK2 
$Te4[3],A0,$Te4[3] + PACKL4 $Te4[1],$Te4[3],$Te4[3] +||[!B0] B done256? + .else + PACK2 $Te4[1],$Te4[0],$Te4[1] +|| PACK2 $Te4[3],A0,$Te4[3] + PACKL4 $Te4[3],$Te4[1],$Te4[3] +||[!B0] B done256? + .endif + XOR $Te4[3],$K[0],$Te4[0] ; K[0] + XOR $Te4[0],$K[1],$K[1] ; K[1] + MV $Te4[0],$K[0] +|| XOR $K[1],$K[2],$Te4[2] ; K[2] + XOR $Te4[2],$K[3],$K[3] ; K[3] + + MV $Te4[2],$K[2] +|| [B0] EXTU $K[3],EXT0,24,$Te4[0] +|| [B0] SUB B0,1,B0 + LDBU *${TEB}[$Te4[0]],$Te4[0] +|| MV $K[3],A0 +|| EXTU $K[3],EXT1,24,$Te4[1] + LDBU *${TEB}[$Te4[1]],$Te4[1] +|| EXTU A0,EXT2,24,A0 +|| EXTU $K[3],EXT3,24,$Te4[3] + + .if .BIG_ENDIAN + LDBU *${TEA}[A0],$Te4[3] +|| LDBU *${TEB}[$Te4[3]],A0 + NOP 3 + PACK2 $Te4[0],$Te4[1],$Te4[1] + PACK2 $Te4[3],A0,$Te4[3] +|| B loop256? + PACKL4 $Te4[1],$Te4[3],$Te4[3] + .else + LDBU *${TEA}[A0],A0 +|| LDBU *${TEB}[$Te4[3]],$Te4[3] + NOP 3 + PACK2 $Te4[1],$Te4[0],$Te4[1] + PACK2 $Te4[3],A0,$Te4[3] +|| B loop256? + PACKL4 $Te4[3],$Te4[1],$Te4[3] + .endif + + XOR $Te4[3],$K[4],$Te4[0] ; K[4] + XOR $Te4[0],$K[5],$K[5] ; K[5] + MV $Te4[0],$K[4] +|| XOR $K[5],$K[6],$Te4[2] ; K[6] + XOR $Te4[2],$K[7],$K[7] ; K[7] +;;==================================================================== +done256?: + BNOP RA + STW $K[0],*$KPA++[2] +|| STW $K[1],*$KPB++[2] + STW $K[2],*$KPA++[2] +|| STW $K[3],*$KPB++[2] + MVK 14,B0 ; rounds + STW B0,*--${KPB}[1] + MVK 0,RET + .endasmfunc + + .global _AES_set_decrypt_key +_AES_set_decrypt_key: + .asmfunc + B __set_encrypt_key ; guarantee local call + MV KEY,B30 ; B30 is not modified + MV RA, B31 ; B31 is not modified + ADDKPC ret?,RA,2 +ret?: ; B0 holds rounds or zero + [!B0] BNOP B31 ; return if zero + [B0] SHL B0,4,A0 ; offset to last round key + [B0] SHRU B0,1,B1 + [B0] SUB B1,1,B1 + [B0] MVK 0x0000001B,B3 ; AES polynomial + [B0] MVKH 0x07000000,B3 + + SPLOOPD 9 ; flip round keys +|| MVC B1,ILC +|| MV B30,$KPA +|| ADD B30,A0,$KPB +|| MVK 16,A0 ; sizeof(round key) 
+;;==================================================================== + LDW *${KPA}[0],A16 +|| LDW *${KPB}[0],B16 + LDW *${KPA}[1],A17 +|| LDW *${KPB}[1],B17 + LDW *${KPA}[2],A18 +|| LDW *${KPB}[2],B18 + LDW *${KPA}[3],A19 +|| ADD $KPA,A0,$KPA +|| LDW *${KPB}[3],B19 +|| SUB $KPB,A0,$KPB + NOP + STW B16,*${KPA}[-4] +|| STW A16,*${KPB}[4] + STW B17,*${KPA}[-3] +|| STW A17,*${KPB}[5] + STW B18,*${KPA}[-2] +|| STW A18,*${KPB}[6] + STW B19,*${KPA}[-1] +|| STW A19,*${KPB}[7] + SPKERNEL +;;==================================================================== + SUB B0,1,B0 ; skip last round +|| ADD B30,A0,$KPA ; skip first round +|| ADD B30,A0,$KPB +|| MVC GFPGFR,B30 ; save GFPGFR + LDW *${KPA}[0],$K[0] +|| LDW *${KPB}[1],$K[1] +|| MVC B3,GFPGFR + LDW *${KPA}[2],$K[2] +|| LDW *${KPB}[3],$K[3] + MVK 0x00000909,A24 +|| MVK 0x00000B0B,B24 + MVKH 0x09090000,A24 +|| MVKH 0x0B0B0000,B24 + MVC B0,ILC +|| SUB B0,1,B0 + + GMPY4 $K[0],A24,$Kx9[0] ; ·0x09 +|| GMPY4 $K[1],A24,$Kx9[1] +|| MVK 0x00000D0D,A25 +|| MVK 0x00000E0E,B25 + GMPY4 $K[2],A24,$Kx9[2] +|| GMPY4 $K[3],A24,$Kx9[3] +|| MVKH 0x0D0D0000,A25 +|| MVKH 0x0E0E0000,B25 + + GMPY4 $K[0],B24,$KxB[0] ; ·0x0B +|| GMPY4 $K[1],B24,$KxB[1] + GMPY4 $K[2],B24,$KxB[2] +|| GMPY4 $K[3],B24,$KxB[3] + + SPLOOP 11 ; InvMixColumns +;;==================================================================== + GMPY4 $K[0],A25,$KxD[0] ; ·0x0D +|| GMPY4 $K[1],A25,$KxD[1] +|| SWAP2 $Kx9[0],$Kx9[0] ; rotate by 16 +|| SWAP2 $Kx9[1],$Kx9[1] +|| MV $K[0],$s[0] ; this or DINT +|| MV $K[1],$s[1] +|| [B0] LDW *${KPA}[4],$K[0] +|| [B0] LDW *${KPB}[5],$K[1] + GMPY4 $K[2],A25,$KxD[2] +|| GMPY4 $K[3],A25,$KxD[3] +|| SWAP2 $Kx9[2],$Kx9[2] +|| SWAP2 $Kx9[3],$Kx9[3] +|| MV $K[2],$s[2] +|| MV $K[3],$s[3] +|| [B0] LDW *${KPA}[6],$K[2] +|| [B0] LDW *${KPB}[7],$K[3] + + GMPY4 $s[0],B25,$KxE[0] ; ·0x0E +|| GMPY4 $s[1],B25,$KxE[1] +|| XOR $Kx9[0],$KxB[0],$KxB[0] +|| XOR $Kx9[1],$KxB[1],$KxB[1] + GMPY4 $s[2],B25,$KxE[2] +|| GMPY4 $s[3],B25,$KxE[3] +|| XOR 
$Kx9[2],$KxB[2],$KxB[2] +|| XOR $Kx9[3],$KxB[3],$KxB[3] + + ROTL $KxB[0],TBL3,$KxB[0] +|| ROTL $KxB[1],TBL3,$KxB[1] +|| SWAP2 $KxD[0],$KxD[0] ; rotate by 16 +|| SWAP2 $KxD[1],$KxD[1] + ROTL $KxB[2],TBL3,$KxB[2] +|| ROTL $KxB[3],TBL3,$KxB[3] +|| SWAP2 $KxD[2],$KxD[2] +|| SWAP2 $KxD[3],$KxD[3] + + XOR $KxE[0],$KxD[0],$KxE[0] +|| XOR $KxE[1],$KxD[1],$KxE[1] +|| [B0] GMPY4 $K[0],A24,$Kx9[0] ; ·0x09 +|| [B0] GMPY4 $K[1],A24,$Kx9[1] +|| ADDAW $KPA,4,$KPA + XOR $KxE[2],$KxD[2],$KxE[2] +|| XOR $KxE[3],$KxD[3],$KxE[3] +|| [B0] GMPY4 $K[2],A24,$Kx9[2] +|| [B0] GMPY4 $K[3],A24,$Kx9[3] +|| ADDAW $KPB,4,$KPB + + XOR $KxB[0],$KxE[0],$KxE[0] +|| XOR $KxB[1],$KxE[1],$KxE[1] +|| [B0] GMPY4 $K[0],B24,$KxB[0] ; ·0x0B +|| [B0] GMPY4 $K[1],B24,$KxB[1] + XOR $KxB[2],$KxE[2],$KxE[2] +|| XOR $KxB[3],$KxE[3],$KxE[3] +|| [B0] GMPY4 $K[2],B24,$KxB[2] +|| [B0] GMPY4 $K[3],B24,$KxB[3] +|| STW $KxE[0],*${KPA}[-4] +|| STW $KxE[1],*${KPB}[-3] + STW $KxE[2],*${KPA}[-2] +|| STW $KxE[3],*${KPB}[-1] +|| [B0] SUB B0,1,B0 + SPKERNEL +;;==================================================================== + BNOP B31,3 + MVC B30,GFPGFR ; restore GFPGFR(*) + MVK 0,RET + .endasmfunc +___ +# (*) Even though ABI doesn't specify GFPGFR as non-volatile, there +# are code samples out there that *assume* its default value. 
+} +{ +my ($inp,$out,$blocks,$key,$ivp)=("A4","B4","A6","B6","A8"); +$code.=<<___; + .global _AES_ctr32_encrypt +_AES_ctr32_encrypt: + .asmfunc + LDNDW *${ivp}[0],A31:A30 ; load counter value +|| MV $blocks,A2 ; reassign $blocks +|| DMV RA,$key,B27:B26 ; reassign RA and $key + LDNDW *${ivp}[1],B31:B30 +|| MVK 0,B2 ; don't let __encrypt load input +|| MVK 0,A1 ; and postpone writing output + .if .BIG_ENDIAN + NOP + .else + NOP 4 + SWAP2 B31,B31 ; keep least significant 32 bits + SWAP4 B31,B31 ; in host byte order + .endif +ctr32_loop?: + [A2] BNOP __encrypt +|| [A1] XOR A29,A9,A9 ; input^Ek(counter) +|| [A1] XOR A28,A8,A8 +|| [A2] LDNDW *INP++,A29:A28 ; load input + [!A2] BNOP B27 ; return +|| [A1] XOR B29,B9,B9 +|| [A1] XOR B28,B8,B8 +|| [A2] LDNDW *INP++,B29:B28 + .if .BIG_ENDIAN + [A1] STNDW A9:A8,*OUT++ ; save output +|| [A2] DMV A31,A30,A9:A8 ; pass counter value to __encrypt + [A1] STNDW B9:B8,*OUT++ +|| [A2] DMV B31,B30,B9:B8 +|| [A2] ADD B30,1,B30 ; counter++ + .else + [A1] STNDW A9:A8,*OUT++ ; save output +|| [A2] DMV A31,A30,A9:A8 +|| [A2] SWAP2 B31,B0 +|| [A2] ADD B31,1,B31 ; counter++ + [A1] STNDW B9:B8,*OUT++ +|| [A2] MV B30,B8 +|| [A2] SWAP4 B0,B9 + .endif + [A2] ADDKPC ctr32_loop?,RA ; return to ctr32_loop? 
+|| [A2] MV B26,KEY ; pass $key +|| [A2] SUB A2,1,A2 ; $blocks-- +||[!A1] MVK 1,A1 + NOP + NOP + .endasmfunc +___ +} +# Tables are kept in endian-neutral manner +$code.=<<___; + .sect ".const:aes_asm" + .align 128 +AES_Te: + .byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 + .byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d + .byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd + .byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54 + .byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03 + .byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d + .byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62 + .byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a + .byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d + .byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87 + .byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb + .byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b + .byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67 + .byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea + .byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7 + .byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b + .byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c + .byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a + .byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41 + .byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f + .byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4 + .byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08 + .byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73 + .byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f + .byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52 + .byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e + .byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1 + .byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5 + .byte 0x0e,0x07,0x07,0x09, 0x24,0x12,0x12,0x36 + .byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d + .byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69 + .byte 0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f + .byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,0x83,0x9e + .byte 0x58,0x2c,0x2c,0x74, 0x34,0x1a,0x1a,0x2e + .byte 0x36,0x1b,0x1b,0x2d, 0xdc,0x6e,0x6e,0xb2 + .byte 0xb4,0x5a,0x5a,0xee, 0x5b,0xa0,0xa0,0xfb + .byte 
0xa4,0x52,0x52,0xf6, 0x76,0x3b,0x3b,0x4d + .byte 0xb7,0xd6,0xd6,0x61, 0x7d,0xb3,0xb3,0xce + .byte 0x52,0x29,0x29,0x7b, 0xdd,0xe3,0xe3,0x3e + .byte 0x5e,0x2f,0x2f,0x71, 0x13,0x84,0x84,0x97 + .byte 0xa6,0x53,0x53,0xf5, 0xb9,0xd1,0xd1,0x68 + .byte 0x00,0x00,0x00,0x00, 0xc1,0xed,0xed,0x2c + .byte 0x40,0x20,0x20,0x60, 0xe3,0xfc,0xfc,0x1f + .byte 0x79,0xb1,0xb1,0xc8, 0xb6,0x5b,0x5b,0xed + .byte 0xd4,0x6a,0x6a,0xbe, 0x8d,0xcb,0xcb,0x46 + .byte 0x67,0xbe,0xbe,0xd9, 0x72,0x39,0x39,0x4b + .byte 0x94,0x4a,0x4a,0xde, 0x98,0x4c,0x4c,0xd4 + .byte 0xb0,0x58,0x58,0xe8, 0x85,0xcf,0xcf,0x4a + .byte 0xbb,0xd0,0xd0,0x6b, 0xc5,0xef,0xef,0x2a + .byte 0x4f,0xaa,0xaa,0xe5, 0xed,0xfb,0xfb,0x16 + .byte 0x86,0x43,0x43,0xc5, 0x9a,0x4d,0x4d,0xd7 + .byte 0x66,0x33,0x33,0x55, 0x11,0x85,0x85,0x94 + .byte 0x8a,0x45,0x45,0xcf, 0xe9,0xf9,0xf9,0x10 + .byte 0x04,0x02,0x02,0x06, 0xfe,0x7f,0x7f,0x81 + .byte 0xa0,0x50,0x50,0xf0, 0x78,0x3c,0x3c,0x44 + .byte 0x25,0x9f,0x9f,0xba, 0x4b,0xa8,0xa8,0xe3 + .byte 0xa2,0x51,0x51,0xf3, 0x5d,0xa3,0xa3,0xfe + .byte 0x80,0x40,0x40,0xc0, 0x05,0x8f,0x8f,0x8a + .byte 0x3f,0x92,0x92,0xad, 0x21,0x9d,0x9d,0xbc + .byte 0x70,0x38,0x38,0x48, 0xf1,0xf5,0xf5,0x04 + .byte 0x63,0xbc,0xbc,0xdf, 0x77,0xb6,0xb6,0xc1 + .byte 0xaf,0xda,0xda,0x75, 0x42,0x21,0x21,0x63 + .byte 0x20,0x10,0x10,0x30, 0xe5,0xff,0xff,0x1a + .byte 0xfd,0xf3,0xf3,0x0e, 0xbf,0xd2,0xd2,0x6d + .byte 0x81,0xcd,0xcd,0x4c, 0x18,0x0c,0x0c,0x14 + .byte 0x26,0x13,0x13,0x35, 0xc3,0xec,0xec,0x2f + .byte 0xbe,0x5f,0x5f,0xe1, 0x35,0x97,0x97,0xa2 + .byte 0x88,0x44,0x44,0xcc, 0x2e,0x17,0x17,0x39 + .byte 0x93,0xc4,0xc4,0x57, 0x55,0xa7,0xa7,0xf2 + .byte 0xfc,0x7e,0x7e,0x82, 0x7a,0x3d,0x3d,0x47 + .byte 0xc8,0x64,0x64,0xac, 0xba,0x5d,0x5d,0xe7 + .byte 0x32,0x19,0x19,0x2b, 0xe6,0x73,0x73,0x95 + .byte 0xc0,0x60,0x60,0xa0, 0x19,0x81,0x81,0x98 + .byte 0x9e,0x4f,0x4f,0xd1, 0xa3,0xdc,0xdc,0x7f + .byte 0x44,0x22,0x22,0x66, 0x54,0x2a,0x2a,0x7e + .byte 0x3b,0x90,0x90,0xab, 0x0b,0x88,0x88,0x83 + .byte 0x8c,0x46,0x46,0xca, 
0xc7,0xee,0xee,0x29 + .byte 0x6b,0xb8,0xb8,0xd3, 0x28,0x14,0x14,0x3c + .byte 0xa7,0xde,0xde,0x79, 0xbc,0x5e,0x5e,0xe2 + .byte 0x16,0x0b,0x0b,0x1d, 0xad,0xdb,0xdb,0x76 + .byte 0xdb,0xe0,0xe0,0x3b, 0x64,0x32,0x32,0x56 + .byte 0x74,0x3a,0x3a,0x4e, 0x14,0x0a,0x0a,0x1e + .byte 0x92,0x49,0x49,0xdb, 0x0c,0x06,0x06,0x0a + .byte 0x48,0x24,0x24,0x6c, 0xb8,0x5c,0x5c,0xe4 + .byte 0x9f,0xc2,0xc2,0x5d, 0xbd,0xd3,0xd3,0x6e + .byte 0x43,0xac,0xac,0xef, 0xc4,0x62,0x62,0xa6 + .byte 0x39,0x91,0x91,0xa8, 0x31,0x95,0x95,0xa4 + .byte 0xd3,0xe4,0xe4,0x37, 0xf2,0x79,0x79,0x8b + .byte 0xd5,0xe7,0xe7,0x32, 0x8b,0xc8,0xc8,0x43 + .byte 0x6e,0x37,0x37,0x59, 0xda,0x6d,0x6d,0xb7 + .byte 0x01,0x8d,0x8d,0x8c, 0xb1,0xd5,0xd5,0x64 + .byte 0x9c,0x4e,0x4e,0xd2, 0x49,0xa9,0xa9,0xe0 + .byte 0xd8,0x6c,0x6c,0xb4, 0xac,0x56,0x56,0xfa + .byte 0xf3,0xf4,0xf4,0x07, 0xcf,0xea,0xea,0x25 + .byte 0xca,0x65,0x65,0xaf, 0xf4,0x7a,0x7a,0x8e + .byte 0x47,0xae,0xae,0xe9, 0x10,0x08,0x08,0x18 + .byte 0x6f,0xba,0xba,0xd5, 0xf0,0x78,0x78,0x88 + .byte 0x4a,0x25,0x25,0x6f, 0x5c,0x2e,0x2e,0x72 + .byte 0x38,0x1c,0x1c,0x24, 0x57,0xa6,0xa6,0xf1 + .byte 0x73,0xb4,0xb4,0xc7, 0x97,0xc6,0xc6,0x51 + .byte 0xcb,0xe8,0xe8,0x23, 0xa1,0xdd,0xdd,0x7c + .byte 0xe8,0x74,0x74,0x9c, 0x3e,0x1f,0x1f,0x21 + .byte 0x96,0x4b,0x4b,0xdd, 0x61,0xbd,0xbd,0xdc + .byte 0x0d,0x8b,0x8b,0x86, 0x0f,0x8a,0x8a,0x85 + .byte 0xe0,0x70,0x70,0x90, 0x7c,0x3e,0x3e,0x42 + .byte 0x71,0xb5,0xb5,0xc4, 0xcc,0x66,0x66,0xaa + .byte 0x90,0x48,0x48,0xd8, 0x06,0x03,0x03,0x05 + .byte 0xf7,0xf6,0xf6,0x01, 0x1c,0x0e,0x0e,0x12 + .byte 0xc2,0x61,0x61,0xa3, 0x6a,0x35,0x35,0x5f + .byte 0xae,0x57,0x57,0xf9, 0x69,0xb9,0xb9,0xd0 + .byte 0x17,0x86,0x86,0x91, 0x99,0xc1,0xc1,0x58 + .byte 0x3a,0x1d,0x1d,0x27, 0x27,0x9e,0x9e,0xb9 + .byte 0xd9,0xe1,0xe1,0x38, 0xeb,0xf8,0xf8,0x13 + .byte 0x2b,0x98,0x98,0xb3, 0x22,0x11,0x11,0x33 + .byte 0xd2,0x69,0x69,0xbb, 0xa9,0xd9,0xd9,0x70 + .byte 0x07,0x8e,0x8e,0x89, 0x33,0x94,0x94,0xa7 + .byte 0x2d,0x9b,0x9b,0xb6, 0x3c,0x1e,0x1e,0x22 + .byte 
0x15,0x87,0x87,0x92, 0xc9,0xe9,0xe9,0x20 + .byte 0x87,0xce,0xce,0x49, 0xaa,0x55,0x55,0xff + .byte 0x50,0x28,0x28,0x78, 0xa5,0xdf,0xdf,0x7a + .byte 0x03,0x8c,0x8c,0x8f, 0x59,0xa1,0xa1,0xf8 + .byte 0x09,0x89,0x89,0x80, 0x1a,0x0d,0x0d,0x17 + .byte 0x65,0xbf,0xbf,0xda, 0xd7,0xe6,0xe6,0x31 + .byte 0x84,0x42,0x42,0xc6, 0xd0,0x68,0x68,0xb8 + .byte 0x82,0x41,0x41,0xc3, 0x29,0x99,0x99,0xb0 + .byte 0x5a,0x2d,0x2d,0x77, 0x1e,0x0f,0x0f,0x11 + .byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc + .byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a +AES_Te4: + .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 + .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 + .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 + .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 + .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc + .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 + .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a + .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 + .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 + .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 + .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b + .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf + .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 + .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 + .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 + .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 + .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 + .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 + .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 + .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb + .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c + .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 + .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 + .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 + .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 + .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a + .byte 0x70, 0x3e, 0xb5, 
0x66, 0x48, 0x03, 0xf6, 0x0e + .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e + .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 + .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf + .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 + .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 +rcon: + .byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 + .byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00 + .byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00 + .byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 + .byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00 + .align 128 +AES_Td: + .byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 + .byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96 + .byte 0x3b,0xab,0x6b,0xcb, 0x1f,0x9d,0x45,0xf1 + .byte 0xac,0xfa,0x58,0xab, 0x4b,0xe3,0x03,0x93 + .byte 0x20,0x30,0xfa,0x55, 0xad,0x76,0x6d,0xf6 + .byte 0x88,0xcc,0x76,0x91, 0xf5,0x02,0x4c,0x25 + .byte 0x4f,0xe5,0xd7,0xfc, 0xc5,0x2a,0xcb,0xd7 + .byte 0x26,0x35,0x44,0x80, 0xb5,0x62,0xa3,0x8f + .byte 0xde,0xb1,0x5a,0x49, 0x25,0xba,0x1b,0x67 + .byte 0x45,0xea,0x0e,0x98, 0x5d,0xfe,0xc0,0xe1 + .byte 0xc3,0x2f,0x75,0x02, 0x81,0x4c,0xf0,0x12 + .byte 0x8d,0x46,0x97,0xa3, 0x6b,0xd3,0xf9,0xc6 + .byte 0x03,0x8f,0x5f,0xe7, 0x15,0x92,0x9c,0x95 + .byte 0xbf,0x6d,0x7a,0xeb, 0x95,0x52,0x59,0xda + .byte 0xd4,0xbe,0x83,0x2d, 0x58,0x74,0x21,0xd3 + .byte 0x49,0xe0,0x69,0x29, 0x8e,0xc9,0xc8,0x44 + .byte 0x75,0xc2,0x89,0x6a, 0xf4,0x8e,0x79,0x78 + .byte 0x99,0x58,0x3e,0x6b, 0x27,0xb9,0x71,0xdd + .byte 0xbe,0xe1,0x4f,0xb6, 0xf0,0x88,0xad,0x17 + .byte 0xc9,0x20,0xac,0x66, 0x7d,0xce,0x3a,0xb4 + .byte 0x63,0xdf,0x4a,0x18, 0xe5,0x1a,0x31,0x82 + .byte 0x97,0x51,0x33,0x60, 0x62,0x53,0x7f,0x45 + .byte 0xb1,0x64,0x77,0xe0, 0xbb,0x6b,0xae,0x84 + .byte 0xfe,0x81,0xa0,0x1c, 0xf9,0x08,0x2b,0x94 + .byte 0x70,0x48,0x68,0x58, 0x8f,0x45,0xfd,0x19 + .byte 0x94,0xde,0x6c,0x87, 0x52,0x7b,0xf8,0xb7 + .byte 0xab,0x73,0xd3,0x23, 0x72,0x4b,0x02,0xe2 + .byte 0xe3,0x1f,0x8f,0x57, 0x66,0x55,0xab,0x2a + .byte 0xb2,0xeb,0x28,0x07, 0x2f,0xb5,0xc2,0x03 
+ .byte 0x86,0xc5,0x7b,0x9a, 0xd3,0x37,0x08,0xa5 + .byte 0x30,0x28,0x87,0xf2, 0x23,0xbf,0xa5,0xb2 + .byte 0x02,0x03,0x6a,0xba, 0xed,0x16,0x82,0x5c + .byte 0x8a,0xcf,0x1c,0x2b, 0xa7,0x79,0xb4,0x92 + .byte 0xf3,0x07,0xf2,0xf0, 0x4e,0x69,0xe2,0xa1 + .byte 0x65,0xda,0xf4,0xcd, 0x06,0x05,0xbe,0xd5 + .byte 0xd1,0x34,0x62,0x1f, 0xc4,0xa6,0xfe,0x8a + .byte 0x34,0x2e,0x53,0x9d, 0xa2,0xf3,0x55,0xa0 + .byte 0x05,0x8a,0xe1,0x32, 0xa4,0xf6,0xeb,0x75 + .byte 0x0b,0x83,0xec,0x39, 0x40,0x60,0xef,0xaa + .byte 0x5e,0x71,0x9f,0x06, 0xbd,0x6e,0x10,0x51 + .byte 0x3e,0x21,0x8a,0xf9, 0x96,0xdd,0x06,0x3d + .byte 0xdd,0x3e,0x05,0xae, 0x4d,0xe6,0xbd,0x46 + .byte 0x91,0x54,0x8d,0xb5, 0x71,0xc4,0x5d,0x05 + .byte 0x04,0x06,0xd4,0x6f, 0x60,0x50,0x15,0xff + .byte 0x19,0x98,0xfb,0x24, 0xd6,0xbd,0xe9,0x97 + .byte 0x89,0x40,0x43,0xcc, 0x67,0xd9,0x9e,0x77 + .byte 0xb0,0xe8,0x42,0xbd, 0x07,0x89,0x8b,0x88 + .byte 0xe7,0x19,0x5b,0x38, 0x79,0xc8,0xee,0xdb + .byte 0xa1,0x7c,0x0a,0x47, 0x7c,0x42,0x0f,0xe9 + .byte 0xf8,0x84,0x1e,0xc9, 0x00,0x00,0x00,0x00 + .byte 0x09,0x80,0x86,0x83, 0x32,0x2b,0xed,0x48 + .byte 0x1e,0x11,0x70,0xac, 0x6c,0x5a,0x72,0x4e + .byte 0xfd,0x0e,0xff,0xfb, 0x0f,0x85,0x38,0x56 + .byte 0x3d,0xae,0xd5,0x1e, 0x36,0x2d,0x39,0x27 + .byte 0x0a,0x0f,0xd9,0x64, 0x68,0x5c,0xa6,0x21 + .byte 0x9b,0x5b,0x54,0xd1, 0x24,0x36,0x2e,0x3a + .byte 0x0c,0x0a,0x67,0xb1, 0x93,0x57,0xe7,0x0f + .byte 0xb4,0xee,0x96,0xd2, 0x1b,0x9b,0x91,0x9e + .byte 0x80,0xc0,0xc5,0x4f, 0x61,0xdc,0x20,0xa2 + .byte 0x5a,0x77,0x4b,0x69, 0x1c,0x12,0x1a,0x16 + .byte 0xe2,0x93,0xba,0x0a, 0xc0,0xa0,0x2a,0xe5 + .byte 0x3c,0x22,0xe0,0x43, 0x12,0x1b,0x17,0x1d + .byte 0x0e,0x09,0x0d,0x0b, 0xf2,0x8b,0xc7,0xad + .byte 0x2d,0xb6,0xa8,0xb9, 0x14,0x1e,0xa9,0xc8 + .byte 0x57,0xf1,0x19,0x85, 0xaf,0x75,0x07,0x4c + .byte 0xee,0x99,0xdd,0xbb, 0xa3,0x7f,0x60,0xfd + .byte 0xf7,0x01,0x26,0x9f, 0x5c,0x72,0xf5,0xbc + .byte 0x44,0x66,0x3b,0xc5, 0x5b,0xfb,0x7e,0x34 + .byte 0x8b,0x43,0x29,0x76, 0xcb,0x23,0xc6,0xdc + .byte 0xb6,0xed,0xfc,0x68, 
0xb8,0xe4,0xf1,0x63 + .byte 0xd7,0x31,0xdc,0xca, 0x42,0x63,0x85,0x10 + .byte 0x13,0x97,0x22,0x40, 0x84,0xc6,0x11,0x20 + .byte 0x85,0x4a,0x24,0x7d, 0xd2,0xbb,0x3d,0xf8 + .byte 0xae,0xf9,0x32,0x11, 0xc7,0x29,0xa1,0x6d + .byte 0x1d,0x9e,0x2f,0x4b, 0xdc,0xb2,0x30,0xf3 + .byte 0x0d,0x86,0x52,0xec, 0x77,0xc1,0xe3,0xd0 + .byte 0x2b,0xb3,0x16,0x6c, 0xa9,0x70,0xb9,0x99 + .byte 0x11,0x94,0x48,0xfa, 0x47,0xe9,0x64,0x22 + .byte 0xa8,0xfc,0x8c,0xc4, 0xa0,0xf0,0x3f,0x1a + .byte 0x56,0x7d,0x2c,0xd8, 0x22,0x33,0x90,0xef + .byte 0x87,0x49,0x4e,0xc7, 0xd9,0x38,0xd1,0xc1 + .byte 0x8c,0xca,0xa2,0xfe, 0x98,0xd4,0x0b,0x36 + .byte 0xa6,0xf5,0x81,0xcf, 0xa5,0x7a,0xde,0x28 + .byte 0xda,0xb7,0x8e,0x26, 0x3f,0xad,0xbf,0xa4 + .byte 0x2c,0x3a,0x9d,0xe4, 0x50,0x78,0x92,0x0d + .byte 0x6a,0x5f,0xcc,0x9b, 0x54,0x7e,0x46,0x62 + .byte 0xf6,0x8d,0x13,0xc2, 0x90,0xd8,0xb8,0xe8 + .byte 0x2e,0x39,0xf7,0x5e, 0x82,0xc3,0xaf,0xf5 + .byte 0x9f,0x5d,0x80,0xbe, 0x69,0xd0,0x93,0x7c + .byte 0x6f,0xd5,0x2d,0xa9, 0xcf,0x25,0x12,0xb3 + .byte 0xc8,0xac,0x99,0x3b, 0x10,0x18,0x7d,0xa7 + .byte 0xe8,0x9c,0x63,0x6e, 0xdb,0x3b,0xbb,0x7b + .byte 0xcd,0x26,0x78,0x09, 0x6e,0x59,0x18,0xf4 + .byte 0xec,0x9a,0xb7,0x01, 0x83,0x4f,0x9a,0xa8 + .byte 0xe6,0x95,0x6e,0x65, 0xaa,0xff,0xe6,0x7e + .byte 0x21,0xbc,0xcf,0x08, 0xef,0x15,0xe8,0xe6 + .byte 0xba,0xe7,0x9b,0xd9, 0x4a,0x6f,0x36,0xce + .byte 0xea,0x9f,0x09,0xd4, 0x29,0xb0,0x7c,0xd6 + .byte 0x31,0xa4,0xb2,0xaf, 0x2a,0x3f,0x23,0x31 + .byte 0xc6,0xa5,0x94,0x30, 0x35,0xa2,0x66,0xc0 + .byte 0x74,0x4e,0xbc,0x37, 0xfc,0x82,0xca,0xa6 + .byte 0xe0,0x90,0xd0,0xb0, 0x33,0xa7,0xd8,0x15 + .byte 0xf1,0x04,0x98,0x4a, 0x41,0xec,0xda,0xf7 + .byte 0x7f,0xcd,0x50,0x0e, 0x17,0x91,0xf6,0x2f + .byte 0x76,0x4d,0xd6,0x8d, 0x43,0xef,0xb0,0x4d + .byte 0xcc,0xaa,0x4d,0x54, 0xe4,0x96,0x04,0xdf + .byte 0x9e,0xd1,0xb5,0xe3, 0x4c,0x6a,0x88,0x1b + .byte 0xc1,0x2c,0x1f,0xb8, 0x46,0x65,0x51,0x7f + .byte 0x9d,0x5e,0xea,0x04, 0x01,0x8c,0x35,0x5d + .byte 0xfa,0x87,0x74,0x73, 0xfb,0x0b,0x41,0x2e + .byte 
0xb3,0x67,0x1d,0x5a, 0x92,0xdb,0xd2,0x52 + .byte 0xe9,0x10,0x56,0x33, 0x6d,0xd6,0x47,0x13 + .byte 0x9a,0xd7,0x61,0x8c, 0x37,0xa1,0x0c,0x7a + .byte 0x59,0xf8,0x14,0x8e, 0xeb,0x13,0x3c,0x89 + .byte 0xce,0xa9,0x27,0xee, 0xb7,0x61,0xc9,0x35 + .byte 0xe1,0x1c,0xe5,0xed, 0x7a,0x47,0xb1,0x3c + .byte 0x9c,0xd2,0xdf,0x59, 0x55,0xf2,0x73,0x3f + .byte 0x18,0x14,0xce,0x79, 0x73,0xc7,0x37,0xbf + .byte 0x53,0xf7,0xcd,0xea, 0x5f,0xfd,0xaa,0x5b + .byte 0xdf,0x3d,0x6f,0x14, 0x78,0x44,0xdb,0x86 + .byte 0xca,0xaf,0xf3,0x81, 0xb9,0x68,0xc4,0x3e + .byte 0x38,0x24,0x34,0x2c, 0xc2,0xa3,0x40,0x5f + .byte 0x16,0x1d,0xc3,0x72, 0xbc,0xe2,0x25,0x0c + .byte 0x28,0x3c,0x49,0x8b, 0xff,0x0d,0x95,0x41 + .byte 0x39,0xa8,0x01,0x71, 0x08,0x0c,0xb3,0xde + .byte 0xd8,0xb4,0xe4,0x9c, 0x64,0x56,0xc1,0x90 + .byte 0x7b,0xcb,0x84,0x61, 0xd5,0x32,0xb6,0x70 + .byte 0x48,0x6c,0x5c,0x74, 0xd0,0xb8,0x57,0x42 +AES_Td4: + .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 + .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb + .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 + .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb + .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d + .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e + .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 + .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 + .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 + .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 + .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda + .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 + .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a + .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 + .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 + .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b + .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea + .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 + .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 + .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e + .byte 0x47, 
0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 + .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b + .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 + .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 + .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 + .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f + .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d + .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef + .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 + .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 + .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 + .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d + .cstring "AES for C64x+, CRYPTOGAMS by " + .align 4 +___ + +print $code; diff --git a/crypto/bn/asm/bn-c64xplus.asm b/crypto/bn/asm/bn-c64xplus.asm new file mode 100644 index 0000000000..161547c3b0 --- /dev/null +++ b/crypto/bn/asm/bn-c64xplus.asm @@ -0,0 +1,333 @@ +;;==================================================================== +;; Written by Andy Polyakov for the OpenSSL +;; project. +;; +;; Rights for redistribution and usage in source and binary forms are +;; granted according to the OpenSSL license. Warranty of any kind is +;; disclaimed. +;;==================================================================== +;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n +;; being the number of 32-bit words, addition - 8*n. Corresponding 4x +;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler +;; SPLOOPs spin at ... 2*n cycles [plus epilogue]. 
+;;====================================================================
+	.text
+;; Aliases: RA = return address, ARGn = n-th argument, RET = result.
+	.asg	B3,RA
+	.asg	A4,ARG0
+	.asg	B4,ARG1
+	.asg	A6,ARG2
+	.asg	B6,ARG3
+	.asg	A8,ARG4
+	.asg	B8,ARG5
+	.asg	A4,RET
+	.asg	A15,FP
+	.asg	B14,DP
+	.asg	B15,SP
+;; bn_mul_add_words(rp,ap,num,w): rp[i]+=ap[i]*w, returns carry word.
+	.global	_bn_mul_add_words
+_bn_mul_add_words:
+	.asmfunc
+	MV	ARG2,B0			; B0 = num
+  [!B0]	BNOP	RA			; num==0: return 0 at once
+||[!B0]	MVK	0,RET
+   [B0]	MVC	B0,ILC			; inner loop count = num
+   [B0]	ZERO	A19		; high part of accumulator
+|| [B0]	MV	ARG0,A2			; A2 stores rp[i], ARG0 keeps loading it
+|| [B0]	MV	ARG3,A3			; A3 = w
+	NOP	3
+
+	SPLOOP	2		; 2*n+10
+;;====================================================================
+	LDW	*ARG1++,B7	; ap[i]
+	NOP	3
+	LDW	*ARG0++,A7	; rp[i]
+	MPY32U	B7,A3,A17:A16		; A17:A16 = ap[i]*w
+	NOP	3		; [2,0] in epilogue
+	ADDU	A16,A7,A21:A20		; lo(product) + rp[i]
+	ADDU	A19,A21:A20,A19:A18	; + carry-in, A18 = new rp[i]
+||	MV.S	A17,A23			; hi(product)
+	SPKERNEL 2,1		; leave slot for "return value"
+||	STW	A18,*A2++	; rp[i]
+||	ADD	A19,A23,A19		; carry-out for the next word
+;;====================================================================
+	BNOP	RA,4
+	MV	A19,RET		; return value
+	.endasmfunc
+;; bn_mul_words(rp,ap,num,w): rp[i]=ap[i]*w+carry, returns carry word.
+	.global	_bn_mul_words
+_bn_mul_words:
+	.asmfunc
+	MV	ARG2,B0			; B0 = num
+  [!B0]	BNOP	RA			; num==0: return 0 at once
+||[!B0]	MVK	0,RET
+   [B0]	MVC	B0,ILC			; inner loop count = num
+   [B0]	ZERO	A19		; high part of accumulator
+	NOP	3
+
+	SPLOOP	2		; 2*n+10
+;;====================================================================
+	LDW	*ARG1++,A7	; ap[i]
+	NOP	4
+	MPY32U	A7,ARG3,A17:A16		; A17:A16 = ap[i]*w
+	NOP	4		; [2,0] in epilogue
+	ADDU	A19,A16,A19:A18		; lo(product) + carry-in
+||	MV.S	A17,A21			; hi(product)
+	SPKERNEL 2,1		; leave slot for "return value"
+||	STW	A18,*ARG0++	; rp[i]
+||	ADD.L	A19,A21,A19		; carry-out for the next word
+;;====================================================================
+	BNOP	RA,4
+	MV	A19,RET		; return value
+	.endasmfunc
+;; bn_sqr_words(rp,ap,num): rp[2*i+1]:rp[2*i]=ap[i]^2, no carry chain.
+	.global	_bn_sqr_words
+_bn_sqr_words:
+	.asmfunc
+	MV	ARG2,B0			; B0 = num
+  [!B0]	BNOP	RA			; num==0: nothing to do
+||[!B0]	MVK	0,RET
+   [B0]	MVC	B0,ILC			; inner loop count = num
+   [B0]	MV	ARG0,B2			; B2 walks rp[2*i]
+|| [B0]	ADD	4,ARG0,ARG0		; ARG0 walks rp[2*i+1]
+	NOP	3
+
+	SPLOOP	2		; 2*n+10
+;;====================================================================
+	LDW	*ARG1++,B7	; ap[i]
+	NOP	4
+	MPY32U	B7,B7,B1:B0		; B1:B0 = ap[i]^2
+	NOP	3		; [2,0] in epilogue
+	STW	B0,*B2++(8)	; rp[2*i]
+	MV	B1,A1			; copy high word to the A file
+	SPKERNEL 2,0		; fully overlap BNOP RA,5
+||	STW	A1,*ARG0++(8)	; rp[2*i+1]
+;;====================================================================
+	BNOP	RA,5
+	.endasmfunc
+;; bn_add_words(rp,ap,bp,num): rp[i]=ap[i]+bp[i]+carry, returns carry.
+	.global	_bn_add_words
+_bn_add_words:
+	.asmfunc
+	MV	ARG3,B0			; B0 = num
+  [!B0]	BNOP	RA			; num==0: return 0 at once
+||[!B0]	MVK	0,RET
+   [B0]	MVC	B0,ILC			; inner loop count = num
+   [B0]	ZERO	A1		; carry flag
+|| [B0]	MV	ARG0,A3			; A3 = rp (ARG0 doubles as RET)
+	NOP	3
+
+	SPLOOP	2		; 2*n+6
+;;====================================================================
+	LDW	*ARG2++,A7	; bp[i]
+||	LDW	*ARG1++,B7	; ap[i]
+	NOP	4
+	ADDU	A7,B7,A9:A8		; ap[i]+bp[i]
+	ADDU	A1,A9:A8,A1:A0		; + carry-in, A1 = carry-out
+	SPKERNEL 0,0		; fully overlap BNOP RA,5
+||	STW	A0,*A3++	; write result
+||	MV	A1,RET		; keep carry flag in RET
+;;====================================================================
+	BNOP	RA,5
+	.endasmfunc
+;; bn_sub_words(rp,ap,bp,num): rp[i]=ap[i]-bp[i]-borrow, returns borrow.
+	.global	_bn_sub_words
+_bn_sub_words:
+	.asmfunc
+	MV	ARG3,B0			; B0 = num
+  [!B0]	BNOP	RA			; num==0: return 0 at once
+||[!B0]	MVK	0,RET
+   [B0]	MVC	B0,ILC			; inner loop count = num
+   [B0]	ZERO	A2		; borrow flag
+|| [B0]	MV	ARG0,A3			; A3 = rp (ARG0 doubles as RET)
+	NOP	3
+
+	SPLOOP	2		; 2*n+6
+;;====================================================================
+	LDW	*ARG2++,A7	; bp[i]
+||	LDW	*ARG1++,B7	; ap[i]
+	NOP	4
+	SUBU	B7,A7,A1:A0		; ap[i]-bp[i]
+  [A2]	SUB	A1:A0,1,A1:A0		; - borrow-in
+	SPKERNEL 0,1		; leave slot for "return borrow flag"
+||	STW	A0,*A3++	; write result
+||	AND	1,A1,A2		; pass on borrow flag
+;;====================================================================
+	BNOP	RA,4
+	AND	1,A1,RET	; return borrow flag
+	.endasmfunc
+;; bn_div_words(h,l,d): hands A5:A4 = h:l and B5:B4 = 0:d to __divull.
+	.global	_bn_div_words
+	.global	__divull
+_bn_div_words:
+	.asmfunc
+	CALLP	__divull,A3	; jump to rts64plus.lib
+||	MV	ARG0,A5			; h
+||	MV	ARG1,ARG0		; l
+||	MV	ARG2,ARG1		; d
+||	ZERO	B5
+	.endasmfunc
+
+;;====================================================================
+;; Not really Comba algorithm, just straightforward NxM... Dedicated
+;; fully unrolled real Comba implementations are asymptotically 2x
+;; faster, but naturally larger undertaking. Purpose of this exercise
+;; was rather to learn to master nested SPLOOPs...
+;;====================================================================
+	.global	_bn_sqr_comba8
+	.global	_bn_mul_comba8
+_bn_sqr_comba8:			; (rp,ap): squaring is mul with bp=ap
+	MV	ARG1,ARG2	; bp = ap, then fall through
+_bn_mul_comba8:			; (rp=ARG0, ap=ARG1, bp=ARG2)
+	.asmfunc
+	MVK	8,B0		; N, RILC
+||	MVK	8,A0		; M, outer loop counter
+||	MV	ARG1,A5		; copy ap
+||	MV	ARG0,B4		; copy rp
+||	ZERO	B19		; high part of accumulator
+	MVC	B0,RILC
+||	SUB	B0,2,B1		; N-2, initial ILC
+||	SUB	B0,1,B2		; const B2=N-1
+||	LDW	*A5++,B6	; ap[0]
+||	MV	A0,A3		; const A3=M
+sploopNxM?:			; for best performance arrange M<=N
+   [A0]	SPLOOPD	2		; 2*n+10
+||	MVC	B1,ILC
+||	ADDAW	B4,B0,B5	; B5 = rp + N words
+||	ZERO	B7		; rp[i] counts as 0 on the first pass
+||	LDW	*A5++,A9	; pre-fetch ap[1]
+||	ZERO	A1		; A1=0: skip rp[i] loads on the first pass
+||	SUB	A0,1,A0
+;;====================================================================
+;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
+;; This is because of Advisory 15 from TI publication SPRZ247I.
+	LDW	*ARG2++,A7	; bp[i]
+	NOP	3
+   [A1]	LDW	*B5++,B7	; rp[i]
+	MPY32U	A7,B6,B17:B16	; B17:B16 = bp[i]*ap[j]
+	NOP	3
+	ADDU	B16,B7,B21:B20	; lo(product) + rp[i]
+	ADDU	B19,B21:B20,B19:B18	; + carry-in
+||	MV.S	B17,B23		; hi(product)
+	SPKERNEL
+||	STW	B18,*B4++	; rp[i]
+||	ADD.S	B19,B23,B19	; carry-out for the next word
+;;====================================================================
+outer?:				; m*2*(n+1)+10
+	SUBAW	ARG2,A3,ARG2	; rewind bp to bp[0]
+	SPMASKR
+||	CMPGT	A0,1,A2		; done pre-fetching ap[i+1]?
+	MVD	A9,B6		; move through .M unit(*)
+   [A2]	LDW	*A5++,A9	; pre-fetch ap[i+1]
+	SUBAW	B5,B2,B5	; rewind rp to rp[1]
+	MVK	1,A1		; enable rp[i] loads from now on
+   [A0]	BNOP.S1	outer?,4	; loop while the outer counter is non-zero
+|| [A0]	SUB.L	A0,1,A0
+	STW	B19,*B4--[B2]	; rewind rp to rp[1]
+||	ZERO.S	B19		; high part of accumulator
+;; end of outer?
+	BNOP	RA,5		; return
+	.endasmfunc
+;; (*)	It should be noted that B6 is used as input to MPY32U in
+;;	chronologically next cycle in *preceding* SPLOOP iteration.
+;;	Normally such arrangement would require DINT, but at this
+;;	point SPLOOP is draining and interrupts are disabled
+;;	implicitly.
+;; bn_sqr_comba4(rp,ap), bn_mul_comba4(rp,ap,bp): 4x4 words -> rp[0..7].
+	.global	_bn_sqr_comba4
+	.global	_bn_mul_comba4
+_bn_sqr_comba4:			; squaring is mul with bp=ap
+	MV	ARG1,ARG2	; bp = ap, then fall through
+_bn_mul_comba4:			; (rp=ARG0, ap=ARG1, bp=ARG2)
+	.asmfunc
+	.if	0
+	BNOP	sploopNxM?,3
+	;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
+	;; because of read-after-write penalties, it's rather
+	;; n*2*(n+3)+10, or 66 cycles [plus various overheads]...
+	MVK	4,B0		; N, RILC
+||	MVK	4,A0		; M, outer loop counter
+||	MV	ARG1,A5		; copy ap
+||	MV	ARG0,B4		; copy rp
+||	ZERO	B19		; high part of accumulator
+	MVC	B0,RILC
+||	SUB	B0,2,B1		; first ILC
+||	SUB	B0,1,B2		; const B2=N-1
+||	LDW	*A5++,B6	; ap[0]
+||	MV	A0,A3		; const A3=M
+	.else
+	;; This alternative is exercise in fully unrolled Comba
+	;; algorithm implementation that operates at n*(n+1)+12, or
+	;; as little as 32 cycles...
+	LDW	*ARG1[0],B16	; a[0]
+||	LDW	*ARG2[0],A16	; b[0]
+	LDW	*ARG1[1],B17	; a[1]
+||	LDW	*ARG2[1],A17	; b[1]
+	LDW	*ARG1[2],B18	; a[2]
+||	LDW	*ARG2[2],A18	; b[2]
+	LDW	*ARG1[3],B19	; a[3]
+||	LDW	*ARG2[3],A19	; b[3]
+	NOP
+	MPY32U	A16,B16,A1:A0	; a[0]*b[0]
+	MPY32U	A17,B16,A23:A22	; a[0]*b[1]
+	MPY32U	A16,B17,A25:A24	; a[1]*b[0]
+	MPY32U	A16,B18,A27:A26	; a[2]*b[0]
+	STW	A0,*ARG0[0]	; r[0]
+||	MPY32U	A17,B17,A29:A28	; a[1]*b[1]
+	MPY32U	A18,B16,A31:A30	; a[0]*b[2]
+||	ADDU	A22,A1,A1:A0
+	MV	A23,B0
+||	MPY32U	A19,B16,A21:A20	; a[3]*b[0]
+||	ADDU	A24,A1:A0,A1:A0
+	ADDU	A25,B0,B1:B0
+||	STW	A0,*ARG0[1]	; r[1]
+||	MPY32U	A18,B17,A23:A22	; a[2]*b[1]
+||	ADDU	A26,A1,A9:A8
+	ADDU	A27,B1,B9:B8
+||	MPY32U	A17,B18,A25:A24	; a[1]*b[2]
+||	ADDU	A28,A9:A8,A9:A8
+	ADDU	A29,B9:B8,B9:B8
+||	MPY32U	A16,B19,A27:A26	; a[0]*b[3]
+||	ADDU	A30,A9:A8,A9:A8
+	ADDU	A31,B9:B8,B9:B8
+||	ADDU	B0,A9:A8,A9:A8
+	STW	A8,*ARG0[2]	; r[2]
+||	ADDU	A20,A9,A1:A0
+	ADDU	A21,B9,B1:B0
+||	MPY32U	A19,B17,A21:A20	; a[3]*b[1]
+||	ADDU	A22,A1:A0,A1:A0
+	ADDU	A23,B1:B0,B1:B0
+||	MPY32U	A18,B18,A23:A22	; a[2]*b[2]
+||	ADDU	A24,A1:A0,A1:A0
+	ADDU	A25,B1:B0,B1:B0
+||	MPY32U	A17,B19,A25:A24	; a[1]*b[3]
+||	ADDU	A26,A1:A0,A1:A0
+	ADDU	A27,B1:B0,B1:B0
+||	ADDU	B8,A1:A0,A1:A0
+	STW	A0,*ARG0[3]	; r[3]
+||	MPY32U	A19,B18,A27:A26	; a[3]*b[2]
+||	ADDU	A20,A1,A9:A8
+	ADDU	A21,B1,B9:B8
+||	MPY32U	A18,B19,A29:A28	; a[2]*b[3]
+||	ADDU	A22,A9:A8,A9:A8
+	ADDU	A23,B9:B8,B9:B8
+||	MPY32U	A19,B19,A31:A30	; a[3]*b[3]
+||	ADDU	A24,A9:A8,A9:A8
+	ADDU	A25,B9:B8,B9:B8
+||	ADDU	B0,A9:A8,A9:A8
+	STW	A8,*ARG0[4]	; r[4]
+||	ADDU	A26,A9,A1:A0
+	ADDU	A27,B9,B1:B0
+||	ADDU	A28,A1:A0,A1:A0
+	ADDU	A29,B1:B0,B1:B0
+||	BNOP	RA		; return; the 5 packets below fill its delay slots
+||	ADDU	B8,A1:A0,A1:A0
+	STW	A0,*ARG0[5]	; r[5]
+||	ADDU	A30,A1,A9:A8
+	ADD	A31,B1,B8
+	ADDU	B0,A9:A8,A9:A8	; removed || to avoid cross-path stall below
+	ADD	B8,A9,A9
+||	STW	A8,*ARG0[6]	; r[6]
+	STW	A9,*ARG0[7]	; r[7]
+	.endif
+	.endasmfunc
diff --git a/crypto/bn/asm/c64xplus-gf2m.pl b/crypto/bn/asm/c64xplus-gf2m.pl
new file mode 100644
index 0000000000..cef83942c9
--- /dev/null
+++ b/crypto/bn/asm/c64xplus-gf2m.pl
@@ -0,0 +1,146 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# February 2012
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication
+# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
+# C for the time being... The subroutine runs in 37 cycles, which is
+# 4.5x faster than compiler-generated code. Though comparison is
+# totally unfair, because this module utilizes Galois Field Multiply
+# instruction.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}	# skip arguments until one looks like a file name (name.ext)
+open STDOUT,">$output";	# NOTE(review): result is unchecked, and $output is false when no argument matched
+
+($rp,$a1,$a0,$b1,$b0)=("A4","B4","A6","B6","A8");   # argument vector
+
+($Alo,$Alox0,$Alox1,$Alox2,$Alox3)=map("A$_",(16..20));	# A16: low halfword of $A; A17..A20: its products with bytes 0..3 of $B
+($Ahi,$Ahix0,$Ahix1,$Ahix2,$Ahix3)=map("B$_",(16..20));	# B16..B20: the same for the high halfword of $A
+($B_0,$B_1,$B_2,$B_3)=("B5","A5","A7","B7");	# the four bytes of $B, $B_0 being the least significant
+($A,$B)=($Alo,$B_1);	# staging registers for the driver code: $A is A16, $B is A5
+$xFF="B1";	# register loaded with the 0xFF byte mask by the entry code
+
+sub mul_1x1_upper {	# emits the first half of one 32x32 multiply: split the operands, issue the 8 XORMPYs
+my ($A,$B)=@_;	# operand registers; these shadow the file-level $A/$B
+$code.=<<___;
+	EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
+||	AND	$B,$xFF,$B_0
+||	SHRU	$B,24,$B_3
+	SHRU	$A,16, $Ahi		; smash $A to two halfwords
+||	EXTU	$A,16,16,$Alo
+
+	XORMPY	$Alo,$B_2,$Alox2	; 16x8 bits muliplication
+||	XORMPY	$Ahi,$B_2,$Ahix2
+||	EXTU	$B,16,24,$B_1
+	XORMPY	$Alo,$B_0,$Alox0
+||	XORMPY	$Ahi,$B_0,$Ahix0
+	XORMPY	$Alo,$B_3,$Alox3
+||	XORMPY	$Ahi,$B_3,$Ahix3
+	XORMPY	$Alo,$B_1,$Alox1
+||	XORMPY	$Ahi,$B_1,$Ahix1
+___
+}
+sub mul_1x1_merged {	# emits the fold of the pending multiply into $OUTlo/$OUThi, interleaved with the first half of the next one
+my ($OUTlo,$OUThi,$A,$B)=@_;	# result registers for the pending product, operand registers for the next
+$code.=<<___;
+	EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
+||	AND	$B,$xFF,$B_0
+||	SHRU	$B,24,$B_3
+	SHRU	$A,16, $Ahi		; smash $A to two halfwords
+||	EXTU	$A,16,16,$Alo
+
+	XOR	$Ahix0,$Alox2,$Ahix0
+||	MV	$Ahix2,$OUThi
+||	XORMPY	$Alo,$B_2,$Alox2
+	XORMPY	$Ahi,$B_2,$Ahix2
+||	EXTU	$B,16,24,$B_1
+||	XORMPY	$Alo,$B_0,A1		; $Alox0
+	XOR	$Ahix1,$Alox3,$Ahix1
+||	SHL	$Ahix0,16,$OUTlo
+||	SHRU	$Ahix0,16,$Ahix0
+	XOR	$Alox0,$OUTlo,$OUTlo
+||	XOR	$Ahix0,$OUThi,$OUThi
+||	XORMPY	$Ahi,$B_0,$Ahix0
+||	XORMPY	$Alo,$B_3,$Alox3
+||	SHL	$Alox1,8,$Alox1
+||	SHL	$Ahix3,8,$Ahix3
+	XOR	$Alox1,$OUTlo,$OUTlo
+||	XOR	$Ahix3,$OUThi,$OUThi
+||	XORMPY	$Ahi,$B_3,$Ahix3
+||	SHL	$Ahix1,24,$Alox1
+||	SHRU	$Ahix1,8, $Ahix1
+	XOR	$Alox1,$OUTlo,$OUTlo
+||	XOR	$Ahix1,$OUThi,$OUThi
+||	XORMPY	$Alo,$B_1,$Alox1
+||	XORMPY	$Ahi,$B_1,$Ahix1
+||	MV	A1,$Alox0
+___
+}
+sub mul_1x1_lower {	# emits only the second half: fold the pending partial products into $OUTlo/$OUThi
+my ($OUTlo,$OUThi)=@_;	# result registers (low and high word)
+$code.=<<___;
+	;NOP
+	XOR	$Ahix0,$Alox2,$Ahix0
+||	MV	$Ahix2,$OUThi
+	NOP
+	XOR	$Ahix1,$Alox3,$Ahix1
+||	SHL	$Ahix0,16,$OUTlo
+||	SHRU	$Ahix0,16,$Ahix0
+	XOR	$Alox0,$OUTlo,$OUTlo
+||	XOR	$Ahix0,$OUThi,$OUThi
+||	SHL	$Alox1,8,$Alox1
+||	SHL	$Ahix3,8,$Ahix3
+	XOR	$Alox1,$OUTlo,$OUTlo
+||	XOR	$Ahix3,$OUThi,$OUThi
+||	SHL	$Ahix1,24,$Alox1
+||	SHRU	$Ahix1,8, $Ahix1
+	XOR	$Alox1,$OUTlo,$OUTlo
+||	XOR	$Ahix1,$OUThi,$OUThi
+___
+}
+$code.=<<___;	# driver: three 32x32 multiplies (a0*b0, a1*b1, (a0+a1)*(b0+b1)) combined into rp[0..3]
+	.text
+
+	.global	_bn_GF2m_mul_2x2
+_bn_GF2m_mul_2x2:
+	.asmfunc
+	MVK	0xFF,$xFF
+___
+	&mul_1x1_upper($a0,$b0);		# a0·b0
+$code.=<<___;
+||	MV	$b1,$B
+	MV	$a1,$A
+___
+	&mul_1x1_merged("A28","B28",$A,$B);	# a0·b0/a1·b1
+$code.=<<___;
+||	XOR	$b0,$b1,$B
+	XOR	$a0,$a1,$A
+___
+	&mul_1x1_merged("A31","B31",$A,$B);	# a1·b1/(a0+a1)·(b0+b1)
+$code.=<<___;
+	XOR	A28,A31,A29
+||	XOR	B28,B31,B29			; a0·b0+a1·b1
+___
+	&mul_1x1_lower("A30","B30");		# (a0+a1)·(b0+b1)
+$code.=<<___;
+||	BNOP	B3
+	XOR	A29,A30,A30
+||	XOR	B29,B30,B30			; (a0+a1)·(b0+b1)-a0·b0-a1·b1
+	XOR	B28,A30,A30
+||	STW	A28,*${rp}[0]
+	XOR	B30,A31,A31
+||	STW	A30,*${rp}[1]
+	STW	A31,*${rp}[2]
+	STW	B31,*${rp}[3]
+	.endasmfunc
+___
+
+print $code;	# write out the accumulated assembly
+close STDOUT;	# NOTE(review): result unchecked, so a failed flush would go unnoticed
diff --git a/crypto/c64xpluscpuid.pl b/crypto/c64xpluscpuid.pl
new file mode 100644
index 0000000000..067b693d5c
--- /dev/null
+++ b/crypto/c64xpluscpuid.pl
@@ -0,0 +1,246 @@
+#!/usr/bin/env perl
+# Emits OPENSSL_rdtsc, OPENSSL_cleanse, OPENSSL_atomic_add, OPENSSL_wipe_cpu and OPENSSL_instrument_bus[2] for C64x+.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}	# skip arguments until one looks like a file name (name.ext)
+open STDOUT,">$output";	# NOTE(review): result is unchecked, and $output is false when no argument matched
+
+$code.=<<___;
+	.text
+
+	.asg	B3,RA
+
+	.global	_OPENSSL_rdtsc
+_OPENSSL_rdtsc:
+	.asmfunc
+	B	RA
+	MVC	TSCL,B0
+	MVC	TSCH,B1
+  [!B0]	MVC	B0,TSCL		; start TSC
+	MV	B0,A4
+	MV	B1,A5
+	.endasmfunc
+
+	.global	_OPENSSL_cleanse
+_OPENSSL_cleanse:
+	.asmfunc
+	ZERO	A3:A2
+||	ZERO	B2
+||	SHRU	B4,3,B0		; is length >= 8
+||	ADD	1,A4,B6
+  [!B0]	BNOP	RA
+||	ZERO	A1
+||	ZERO	B1
+   [B0]	MVC	B0,ILC
+||[!B0]	CMPLT	0,B4,A1
+||[!B0]	CMPLT	1,B4,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1]	STB	B2,*B6++[2]
+||[!B0]	CMPLT	2,B4,A1
+||[!B0]	CMPLT	3,B4,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1]	STB	B2,*B6++[2]
+||[!B0]	CMPLT	4,B4,A1
+||[!B0]	CMPLT	5,B4,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1]	STB	B2,*B6++[2]
+||[!B0]	CMPLT	6,B4,A1
+   [A1]	STB	A2,*A4++[2]
+
+	SPLOOP	1
+	STNDW	A3:A2,*A4++
+||	SUB	B4,8,B4
+	SPKERNEL
+
+	MV	B4,B0		; remaining bytes
+||	ADD	1,A4,B6
+||	BNOP	RA
+   [B0]	CMPLT	0,B0,A1
+|| [B0]	CMPLT	1,B0,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1]	STB	B2,*B6++[2]
+|| [B0]	CMPLT	2,B0,A1
+|| [B0]	CMPLT	3,B0,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1]	STB	B2,*B6++[2]
+|| [B0]	CMPLT	4,B0,A1
+|| [B0]	CMPLT	5,B0,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1]	STB	B2,*B6++[2]
+|| [B0]	CMPLT	6,B0,A1
+   [A1]	STB	A2,*A4++[2]
+	.endasmfunc
+
+	.global	_OPENSSL_atomic_add
+_OPENSSL_atomic_add:
+	.asmfunc
+	MV	A4,B0
+atomic_add?:
+	LL	*B0,B5
+	NOP	4
+	ADD	B4,B5,B5
+	SL	B5,*B0
+	CMTL	*B0,B1
+	NOP	4
+  [!B1]	B	atomic_add?
+   [B1]	BNOP	RA,4
+	MV	B5,A4
+	.endasmfunc
+
+	.global	_OPENSSL_wipe_cpu
+_OPENSSL_wipe_cpu:
+	.asmfunc
+	ZERO	A0
+||	ZERO	B0
+||	ZERO	A1
+||	ZERO	B1
+	ZERO	A3:A2
+||	MVD	B0,B2
+||	ZERO	A4
+||	ZERO	B4
+||	ZERO	A5
+||	ZERO	B5
+||	BNOP	RA
+	ZERO	A7:A6
+||	ZERO	B7:B6
+||	ZERO	A8
+||	ZERO	B8
+||	ZERO	A9
+||	ZERO	B9
+	ZERO	A17:A16
+||	ZERO	B17:B16
+||	ZERO	A18
+||	ZERO	B18
+||	ZERO	A19
+||	ZERO	B19
+	ZERO	A21:A20
+||	ZERO	B21:B20
+||	ZERO	A22
+||	ZERO	B22
+||	ZERO	A23
+||	ZERO	B23
+	ZERO	A25:A24
+||	ZERO	B25:B24
+||	ZERO	A26
+||	ZERO	B26
+||	ZERO	A27
+||	ZERO	B27
+	ZERO	A29:A28
+||	ZERO	B29:B28
+||	ZERO	A30
+||	ZERO	B30
+||	ZERO	A31
+||	ZERO	B31
+	.endasmfunc
+
+CLFLUSH	.macro	CONTROL,ADDR,LEN
+	B	passthrough?
+||	STW	ADDR,*CONTROL[0]
+	STW	LEN,*CONTROL[1]
+spinlock?:
+	LDW	*CONTROL[1],A0
+	NOP	3
+passthrough?:
+	NOP
+  [A0]	BNOP	spinlock?,5
+	.endm
+
+	.global	_OPENSSL_instrument_bus
+_OPENSSL_instrument_bus:
+	.asmfunc
+	MV	B4,B0			; reassign sizeof(output)
+||	MV	A4,B4			; reassign output
+||	MVK	0x00004030,A3
+	MV	B0,A4			; return value
+||	MVK	1,A1
+||	MVKH	0x01840000,A3		; L1DWIBAR
+	MVC	TSCL,B8			; collect 1st tick
+||	MVK	0x00004010,A5
+	MV	B8,B9			; lasttick = tick
+||	MVK	0,B7			; lastdiff = 0
+||	MVKH	0x01840000,A5		; L2WIBAR
+	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
+	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
+	LL	*B4,B5
+	NOP	4
+	ADD	B7,B5,B5
+	SL	B5,*B4
+	CMTL	*B4,B1
+	NOP	4
+	STW	B5,*B4
+bus_loop1?:
+	MVC	TSCL,B8
+|| [B0]	SUB	B0,1,B0
+	SUB	B8,B9,B7		; lastdiff = tick - lasttick
+||	MV	B8,B9			; lasttick = tick
+	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
+	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
+	LL	*B4,B5
+	NOP	4
+	ADD	B7,B5,B5
+	SL	B5,*B4
+	CMTL	*B4,B1
+	STW	B5,*B4			; [!B1] is removed to flatten samples
+||	ADDK	4,B4
+|| [B0]	BNOP	bus_loop1?,5
+
+	BNOP	RA,5
+	.endasmfunc
+
+	.global	_OPENSSL_instrument_bus2
+_OPENSSL_instrument_bus2:
+	.asmfunc
+	MV	A6,B0			; reassign max
+||	MV	B4,A6			; reassing sizeof(output)
+||	MVK	0x00004030,A3
+	MV	A4,B4			; reassign output
+||	MVK	0,A4			; return value
+||	MVK	1,A1
+||	MVKH	0x01840000,A3		; L1DWIBAR
+
+	MVC	TSCL,B8			; collect 1st tick
+||	MVK	0x00004010,A5
+	MV	B8,B9			; lasttick = tick
+||	MVK	0,B7			; lastdiff = 0
+||	MVKH	0x01840000,A5		; L2WIBAR
+	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
+	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
+	LL	*B4,B5
+	NOP	4
+	ADD	B7,B5,B5
+	SL	B5,*B4
+	CMTL	*B4,B1
+	NOP	4
+	STW	B5,*B4
+
+	MVC	TSCL,B8			; collect 1st diff
+	SUB	B8,B9,B7		; lastdiff = tick - lasttick
+||	MV	B8,B9			; lasttick = tick
+||	SUB	B0,1,B0
+bus_loop2?:
+	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
+	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
+	LL	*B4,B5
+	NOP	4
+	ADD	B7,B5,B5
+	SL	B5,*B4
+	CMTL	*B4,B1
+	STW	B5,*B4			; [!B1] is removed to flatten samples
+||[!B0]	BNOP	bus_loop2_done?,2
+||	SUB	B0,1,B0
+	MVC	TSCL,B8
+	SUB	B8,B9,B8
+||	MV	B8,B9
+	CMPEQ	B8,B7,B2
+||	MV	B8,B7
+  [!B2]	ADDAW	B4,1,B4
+||[!B2]	ADDK	1,A4
+	CMPEQ	A4,A6,A2
+  [!A2]	BNOP	bus_loop2?,5
+
+bus_loop2_done?:
+	BNOP	RA,5
+	.endasmfunc
+___
+
+print $code;	# write out the accumulated assembly
+close STDOUT;	# NOTE(review): result unchecked, so a failed flush would go unnoticed
diff --git a/crypto/modes/asm/ghash-c64xplus.pl b/crypto/modes/asm/ghash-c64xplus.pl
new file mode 100644
index 0000000000..1ac4d927d0
--- /dev/null
+++ b/crypto/modes/asm/ghash-c64xplus.pl
@@ -0,0 +1,231 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# December 2011
+#
+# The module implements GCM GHASH function and underlying single
+# multiplication operation in GF(2^128). Even though subroutines
+# have _4bit suffix, they are not using any tables, but rely on
+# hardware Galois Field Multiply support. Streamed GHASH processes
+# byte in ~7 cycles, which is >6x faster than "4-bit" table-driven
+# code compiled with TI's cl6x 6.0 with -mv6400+ -o2 flags. We are
+# comparing apples vs. oranges, but compiler surely could have done
+# better, because theoretical [though not necessarily achievable]
+# estimate for "4-bit" table-driven implementation is ~12 cycles.
+ +# Consume arguments up to and including the first one that ends in name.ext; +# STDOUT is then redirected to that file. +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +# Register allocation: the names below are interpolated into the assembly template. +($Xip,$Htable,$inp,$len)=("A4","B4","A6","B6"); # arguments + +($Z0,$Z1,$Z2,$Z3, $H0, $H1, $H2, $H3, + $H0x,$H1x,$H2x,$H3x)=map("A$_",(16..27)); +($H01u,$H01y,$H2u,$H3u, $H0y,$H1y,$H2y,$H3y, + $H0z,$H1z,$H2z,$H3z)=map("B$_",(16..27)); +($FF000000,$E10000)=("B30","B31"); +($xip,$x0,$x1,$xib)=map("B$_",(6..9)); # $xip zaps $len + $xia="A9"; +($rem,$res)=("B4","B5"); # $rem zaps $Htable + +$code.=<<___; + .text + + .asg B3,RA + + .if 0 + .global _gcm_gmult_1bit +_gcm_gmult_1bit: + ADDAD $Htable,2,$Htable + .endif + .global _gcm_gmult_4bit +_gcm_gmult_4bit: + .asmfunc + LDDW *${Htable}[-1],$H1:$H0 ; H.lo + LDDW *${Htable}[-2],$H3:$H2 ; H.hi +|| MV $Xip,${xip} ; reassign Xi +|| MVK 15,B1 ; SPLOOPD constant + + MVK 0xE1,$E10000 +|| LDBU *++${xip}[15],$x1 ; Xi[15] + MVK 0xFF,$FF000000 +|| LDBU *--${xip},$x0 ; Xi[14] + SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial + SHL $FF000000,24,$FF000000 ; upper byte mask +|| BNOP ghash_loop?
+|| MVK 1,B0 ; take a single spin + + PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes + AND $H2,$FF000000,$H2u ; H2's upper byte + AND $H3,$FF000000,$H3u ; H3's upper byte +|| SHRU $H2u,8,$H2u + SHRU $H3u,8,$H3u +|| ZERO $Z1:$Z0 + SHRU2 $xia,8,$H01u +|| ZERO $Z3:$Z2 + .endasmfunc + + .global _gcm_ghash_4bit +_gcm_ghash_4bit: + .asmfunc + LDDW *${Htable}[-1],$H1:$H0 ; H.lo +|| SHRU $len,4,B0 ; reassign len + LDDW *${Htable}[-2],$H3:$H2 ; H.hi +|| MV $Xip,${xip} ; reassign Xi +|| MVK 15,B1 ; SPLOOPD constant + + MVK 0xE1,$E10000 +|| [B0] LDNDW *${inp}[1],$H1x:$H0x + MVK 0xFF,$FF000000 +|| [B0] LDNDW *${inp}++[2],$H3x:$H2x + SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial +|| LDDW *${xip}[1],$Z1:$Z0 + SHL $FF000000,24,$FF000000 ; upper byte mask +|| LDDW *${xip}[0],$Z3:$Z2 + + PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes + AND $H2,$FF000000,$H2u ; H2's upper byte + AND $H3,$FF000000,$H3u ; H3's upper byte +|| SHRU $H2u,8,$H2u + SHRU $H3u,8,$H3u + SHRU2 $xia,8,$H01u + +|| [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp +|| [B0] XOR $H1x,$Z1,$Z1 + .if .LITTLE_ENDIAN + [B0] XOR $H2x,$Z2,$Z2 +|| [B0] XOR $H3x,$Z3,$Z3 +|| [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall + STDW $Z1:$Z0,*${xip}[1] +|| [B0] SHRU $Z1,16,$x0 ; Xi[14] +|| [B0] ZERO $Z1:$Z0 + .else + [B0] XOR $H2x,$Z2,$Z2 +|| [B0] XOR $H3x,$Z3,$Z3 +|| [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall + STDW $Z1:$Z0,*${xip}[1] +|| [B0] SHRU $Z0,8,$x0 ; Xi[14] +|| [B0] ZERO $Z1:$Z0 + .endif + STDW $Z3:$Z2,*${xip}[0] +|| [B0] ZERO $Z3:$Z2 +|| [B0] MV $xia,$x1 + [B0] ADDK 14,${xip} + +ghash_loop?: + SPLOOPD 6 ; 6*16+7 +|| MVC B1,ILC +|| [B0] SUB B0,1,B0 +|| ZERO A0 +|| ADD $x1,$x1,$xib ; SHL $x1,1,$xib +|| SHL $x1,1,$xia +___ + +########____________________________ +# 0 D2. M1 M2 | +# 1 M1 | +# 2 M1 M2 | +# 3 D1. M1 M2 | +# 4 S1. L1 | +# 5 S2 S1x L1 D2 L2 |____________________________ +# 6/0 L1 S1 L2 S2x |D2. M1 M2 | +# 7/1 L1 S1 D1x S2 M2 | M1 | +# 8/2 S1 L1x S2 | M1 M2 | +# 9/3 S1 L1x | D1. 
M1 M2 | +# 10/4 D1x | S1. L1 | +# 11/5 |S2 S1x L1 D2 L2 |____________ +# 12/6/0 D1x __| L1 S1 L2 S2x |D2. .... +# 7/1 L1 S1 D1x S2 M2 | .... +# 8/2 S1 L1x S2 | .... +#####... ................|............ +$code.=<<___; + XORMPY $H0,$xia,$H0x ; 0 ; H·Xi[i] +|| XORMPY $H01u,$xib,$H01y +|| [A0] LDBU *--${xip},$x0 + XORMPY $H1,$xia,$H1x ; 1 + XORMPY $H2,$xia,$H2x ; 2 +|| XORMPY $H2u,$xib,$H2y + XORMPY $H3,$xia,$H3x ; 3 +|| XORMPY $H3u,$xib,$H3y +||[!A0] MVK.D 15,A0 ; *--${xip} counter + XOR.L $H0x,$Z0,$Z0 ; 4 ; Z^=H·Xi[i] +|| [A0] SUB.S A0,1,A0 + XOR.L $H1x,$Z1,$Z1 ; 5 +|| AND.D $H01y,$FF000000,$H0z +|| SWAP2.L $H01y,$H1y ; ; SHL $H01y,16,$H1y +|| SHL $x0,1,$xib +|| SHL $x0,1,$xia + + XOR.L $H2x,$Z2,$Z2 ; 6/0 ; [0,0] in epilogue +|| SHL $Z0,1,$rem ; ; rem=Z<<1 +|| SHRMB.S $Z1,$Z0,$Z0 ; ; Z>>=8 +|| AND.L $H1y,$FF000000,$H1z + XOR.L $H3x,$Z3,$Z3 ; 7/1 +|| SHRMB.S $Z2,$Z1,$Z1 +|| XOR.D $H0z,$Z0,$Z0 ; merge upper byte products +|| AND.S $H2y,$FF000000,$H2z +|| XORMPY $E10000,$rem,$res ; ; implicit rem&0x1FE + XOR.L $H1z,$Z1,$Z1 ; 8/2 +|| SHRMB.S $Z3,$Z2,$Z2 +|| AND.S $H3y,$FF000000,$H3z + XOR.L $H2z,$Z2,$Z2 ; 9/3 +|| SHRU $Z3,8,$Z3 + XOR.D $H3z,$Z3,$Z3 ; 10/4 + NOP ; 11/5 + + SPKERNEL 0,2 +|| XOR.D $res,$Z3,$Z3 ; 12/6/0; Z^=res + + ; input pre-fetch is possible where D1 slot is available... + [B0] LDNDW *${inp}[1],$H1x:$H0x ; 8/- + [B0] LDNDW *${inp}++[2],$H3x:$H2x ; 9/- + NOP ; 10/- + .if .LITTLE_ENDIAN + SWAP2 $Z0,$Z1 ; 11/- +|| SWAP4 $Z1,$Z0 + SWAP4 $Z1,$Z1 ; 12/- +|| SWAP2 $Z0,$Z0 + SWAP2 $Z2,$Z3 +|| SWAP4 $Z3,$Z2 +||[!B0] BNOP RA + SWAP4 $Z3,$Z3 +|| SWAP2 $Z2,$Z2 +|| [B0] BNOP ghash_loop? + [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp +|| [B0] XOR $H1x,$Z1,$Z1 + [B0] XOR $H2x,$Z2,$Z2 +|| [B0] XOR $H3x,$Z3,$Z3 +|| [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall + STDW $Z1:$Z0,*${xip}[1] +|| [B0] SHRU $Z1,16,$x0 ; Xi[14] +|| [B0] ZERO $Z1:$Z0 + .else + [!B0] BNOP RA ; 11/- + [B0] BNOP ghash_loop? 
; 12/- + [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp +|| [B0] XOR $H1x,$Z1,$Z1 + [B0] XOR $H2x,$Z2,$Z2 +|| [B0] XOR $H3x,$Z3,$Z3 +|| [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall + STDW $Z1:$Z0,*${xip}[1] +|| [B0] SHRU $Z0,8,$x0 ; Xi[14] +|| [B0] ZERO $Z1:$Z0 + .endif + STDW $Z3:$Z2,*${xip}[0] +|| [B0] ZERO $Z3:$Z2 +|| [B0] MV $xia,$x1 + [B0] ADDK 14,${xip} + .endasmfunc + + .sect .const + .cstring "GHASH for C64x+, CRYPTOGAMS by " + .align 4 +___ + +print $code; +close STDOUT; diff --git a/crypto/sha/asm/sha1-c64xplus.pl b/crypto/sha/asm/sha1-c64xplus.pl new file mode 100644 index 0000000000..87000d1e8f --- /dev/null +++ b/crypto/sha/asm/sha1-c64xplus.pl @@ -0,0 +1,323 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA1 for C64x+. +# +# November 2011 +# +# If compared to compiler-generated code with similar characteristics, +# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs, +# this implementation is 25% smaller and >2x faster. In absolute terms +# performance is (quite impressive) ~6.5 cycles per processed byte. +# Fully unrolled assembler would be ~5x larger and is likely to be +# ~15% faster. It would be free from references to intermediate ring +# buffer, but put more pressure on L1P [both because the code would be +# larger and won't be using SPLOOP buffer]. There are no plans to +# realize fully unrolled variant though... +# +# !!! Note that this module uses AMR, which means that all interrupt +# service routines are expected to preserve it and for own well-being +# zero it upon entry. 
+ +# Consume arguments up to and including the first one that ends in name.ext; +# STDOUT is then redirected to that file. +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +# Register allocation: the names below are interpolated into the assembly template. +($CTX,$INP,$NUM) = ("A4","B4","A6"); # arguments + +($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25)); +($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27"); +($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31)); +($XPA,$XPB) = ("A5","B5"); # X circular buffer +($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9)); # zaps $NUM + +$code=<<___; + .text + + .asg B3,RA + .asg A15,FP + .asg B15,SP + + .if .BIG_ENDIAN + .asg MV,SWAP2 + .asg MV,SWAP4 + .endif + + .global _sha1_block_data_order +_sha1_block_data_order: + .asmfunc stack_usage(64) + MV $NUM,A0 ; reassign $NUM +|| MVK -64,B0 + [!A0] BNOP RA ; if ($NUM==0) return; +|| [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64) +|| [A0] MV SP,FP + [A0] LDW *${CTX}[0],$A ; load A-E... +|| [A0] AND B0,SP,SP ; align stack at 64 bytes + [A0] LDW *${CTX}[1],$B +|| [A0] SUBAW SP,2,SP ; reserve two words above buffer + [A0] LDW *${CTX}[2],$C +|| [A0] MVK 0x00404,B0 + [A0] LDW *${CTX}[3],$D +|| [A0] MVKH 0x50000,B0 ; 0x050404, 64 bytes for $XP[AB] + [A0] LDW *${CTX}[4],$E +|| [A0] MVC B0,AMR ; setup circular addressing + LDNW *${INP}++,$TX1 ; pre-fetch input + NOP 1 + +loop?: + MVK 0x00007999,$K +|| ADDAW SP,2,$XPA +|| SUB A0,1,A0 +|| MVK 13,B0 + MVKH 0x5a820000,$K ; K_00_19 +|| ADDAW SP,2,$XPB +|| MV $A,$Actx +|| MV $B,$Bctx +;;================================================== + SPLOOPD 5 ; BODY_00_13 +|| MV $C,$Cctx +|| MV $D,$Dctx +|| MV $E,$Ectx +|| MVC B0,ILC + + ROTL $A,5,$Arot +|| AND $C,$B,$F +|| ANDN $D,$B,$F0 +|| ADD $K,$E,$T ; T=E+K + + XOR $F0,$F,$F ; F_00_19(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C +|| SWAP2 $TX1,$TX2 +|| LDNW *${INP}++,$TX1 + + ADD $F,$T,$T ; T+=F_00_19(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| SWAP4 $TX2,$TX3 ; byte swap + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A + + ADD $TX3,$T,$A ; A=T+Xi +|| STW $TX3,*${XPB}++ + SPKERNEL +;;================================================== + ROTL
$A,5,$Arot ; BODY_14 +|| AND $C,$B,$F +|| ANDN $D,$B,$F0 +|| ADD $K,$E,$T ; T=E+K + + XOR $F0,$F,$F ; F_00_19(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C +|| SWAP2 $TX1,$TX2 +|| LDNW *${INP}++,$TX1 + + ADD $F,$T,$T ; T+=F_00_19(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| SWAP4 $TX2,$TX2 ; byte swap +|| LDW *${XPA}++,$X0 ; fetches from X ring buffer are +|| LDW *${XPB}[4],$X2 ; 2 iterations ahead + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ +;;================================================== + ROTL $A,5,$Arot ; BODY_15 +|| AND $C,$B,$F +|| ANDN $D,$B,$F0 +|| ADD $K,$E,$T ; T=E+K + + XOR $F0,$F,$F ; F_00_19(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C +|| SWAP2 $TX1,$TX2 + + ADD $F,$T,$T ; T+=F_00_19(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| SWAP4 $TX2,$TX2 ; byte swap +|| XOR $X0,$X2,$TX0 ; Xupdate XORs are 1 iteration ahead +|| LDW *${XPA}++,$X0 +|| LDW *${XPB}[4],$X2 + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| XOR $X8,$X13,$TX1 +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ +|| XOR $TX0,$TX1,$TX1 +|| MVK 3,B0 +;;================================================== + SPLOOPD 5 ; BODY_16_19 +|| MVC B0,ILC + + ROTL $A,5,$Arot +|| AND $C,$B,$F +|| ANDN $D,$B,$F0 +|| ADD $K,$E,$T ; T=E+K +|| ROTL $TX1,1,$TX2 ; Xupdate output + + XOR $F0,$F,$F ; F_00_19(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C + + ADD $F,$T,$T ; T+=F_00_19(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| XOR $X0,$X2,$TX0 +|| LDW *${XPA}++,$X0 +|| LDW *${XPB}[4],$X2 + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| XOR $X8,$X13,$TX1 +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ +|| XOR $TX0,$TX1,$TX1 + SPKERNEL + + MVK 0xffffeba1,$K +|| MVK 19,B0 + MVKH 0x6ed90000,$K ; 
K_20_39 +___ +# BODY_20_39 appends to $code one SPLOOPD-pipelined run of rounds that use +# F=B^C^D. The iteration count is whatever the preceding code left in B0 and +# the round constant whatever it left in $K. Called with no (or a false) +# argument it also appends the MVK/MVKH pair that loads K_40_59 for the next +# stage; a true argument suppresses that pair (used below for rounds 60-78). +sub BODY_20_39 { +$code.=<<___; +;;================================================== + SPLOOPD 5 ; BODY_20_39 +|| MVC B0,ILC + + ROTL $A,5,$Arot +|| XOR $B,$C,$F +|| ADD $K,$E,$T ; T=E+K +|| ROTL $TX1,1,$TX2 ; Xupdate output + + XOR $D,$F,$F ; F_20_39(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C + + ADD $F,$T,$T ; T+=F_20_39(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| XOR $X0,$X2,$TX0 +|| LDW *${XPA}++,$X0 +|| LDW *${XPB}[4],$X2 + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| XOR $X8,$X13,$TX1 +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ ; last one is redundant +|| XOR $TX0,$TX1,$TX1 + SPKERNEL +___ +$code.=<<___ if (!shift); + MVK 0xffffbcdc,$K + MVKH 0x8f1b0000,$K ; K_40_59 +___ +} &BODY_20_39(); +$code.=<<___; +;;================================================== + SPLOOPD 5 ; BODY_40_59 +|| MVC B0,ILC +|| AND $B,$C,$F +|| AND $B,$D,$F0 + + ROTL $A,5,$Arot +|| XOR $F0,$F,$F +|| AND $C,$D,$F0 +|| ADD $K,$E,$T ; T=E+K +|| ROTL $TX1,1,$TX2 ; Xupdate output + + XOR $F0,$F,$F ; F_40_59(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C + + ADD $F,$T,$T ; T+=F_40_59(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| XOR $X0,$X2,$TX0 +|| LDW *${XPA}++,$X0 +|| LDW *${XPB}[4],$X2 + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| XOR $X8,$X13,$TX1 +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ +|| XOR $TX0,$TX1,$TX1 +|| AND $B,$C,$F +|| AND $B,$D,$F0 + SPKERNEL + + MVK 0xffffc1d6,$K +|| MVK 18,B0 + MVKH 0xca620000,$K ; K_60_79 +___ + &BODY_20_39(-1); # BODY_60_78 +$code.=<<___; +;;================================================== + [A0] B loop?
+|| ROTL $A,5,$Arot ; BODY_79 +|| XOR $B,$C,$F +|| ROTL $TX1,1,$TX2 ; Xupdate output + + [A0] LDNW *${INP}++,$TX1 ; pre-fetch input +|| ADD $K,$E,$T ; T=E+K +|| XOR $D,$F,$F ; F_20_39(B,C,D) + + ADD $F,$T,$T ; T+=F_20_39(B,C,D) +|| ADD $Ectx,$D,$E ; E=D,E+=Ectx +|| ADD $Dctx,$C,$D ; D=C,D+=Dctx +|| ROTL $B,30,$C ; C=ROL(B,30) + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| ADD $Bctx,$A,$B ; B=A,B+=Bctx + + ADD $TX2,$T,$A ; A=T+Xi + + ADD $Actx,$A,$A ; A+=Actx +|| ADD $Cctx,$C,$C ; C+=Cctx +;; end of loop? + + BNOP RA ; return +|| MV FP,SP ; restore stack pointer +|| LDW *FP[0],FP ; restore frame pointer + STW $A,*${CTX}[0] ; emit A-E... +|| MVK 0,B0 + STW $B,*${CTX}[1] +|| MVC B0,AMR ; clear AMR + STW $C,*${CTX}[2] + STW $D,*${CTX}[3] + STW $E,*${CTX}[4] + .endasmfunc + + .sect .const + .cstring "SHA1 block transform for C64x+, CRYPTOGAMS by " + .align 4 +___ + +print $code; +close STDOUT; diff --git a/crypto/sha/asm/sha256-c64xplus.pl b/crypto/sha/asm/sha256-c64xplus.pl new file mode 100644 index 0000000000..5a057868b4 --- /dev/null +++ b/crypto/sha/asm/sha256-c64xplus.pl @@ -0,0 +1,302 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA256 for C64x+. +# +# January 2012 +# +# Performance is just below 10 cycles per processed byte, which is +# almost 40% faster than compiler-generated code. Unroll is unlikely +# to give more than ~8% improvement... +# +# !!! Note that this module uses AMR, which means that all interrupt +# service routines are expected to preserve it and for own well-being +# zero it upon entry. 
+ +# Consume arguments up to and including the first one that ends in name.ext; +# STDOUT is then redirected to that file. +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +# Register allocation: the names below are interpolated into the assembly template. +($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments + $K256="A3"; + +($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14) + =map("A$_",(16..31)); +($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15) + =map("B$_",(16..31)); + +($Xia,$Xib)=("A5","B5"); # circular/ring buffer + $CTXB=$t2e; + +($Xn,$X0,$K)=("B7","B8","B9"); +($Maj,$Ch)=($T2,"B6"); + +$code.=<<___; + .text + .if __TI_EABI__ + .nocmp + .endif + + .asg B3,RA + .asg A15,FP + .asg B15,SP + + ; Big-endian input needs no byte swapping: make SWAP2/SWAP4 assemble as MV. + ; .asg takes the replacement text first and the symbol second (cf. RA/FP/SP above). + .if .BIG_ENDIAN + .asg MV,SWAP2 + .asg MV,SWAP4 + .endif + + .global _sha256_block_data_order +_sha256_block_data_order: + .asmfunc stack_usage(64) + MV $NUM,A0 ; reassign $NUM +|| MVK -64,B0 + [!A0] BNOP RA ; if ($NUM==0) return; +|| [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64) +|| [A0] MV SP,FP + [A0] ADDKPC _sha256_block_data_order,B2 +|| [A0] AND B0,SP,SP ; align stack at 64 bytes + .if __TI_EABI__ + [A0] MVK 0x00404,B1 +|| [A0] MVKL \$PCR_OFFSET(K256,_sha256_block_data_order),$K256 + [A0] MVKH 0x50000,B1 +|| [A0] MVKH \$PCR_OFFSET(K256,_sha256_block_data_order),$K256 + .else + [A0] MVK 0x00404,B1 +|| [A0] MVKL (K256-_sha256_block_data_order),$K256 + [A0] MVKH 0x50000,B1 +|| [A0] MVKH (K256-_sha256_block_data_order),$K256 + .endif + [A0] MVC B1,AMR ; setup circular addressing +|| [A0] MV SP,$Xia + [A0] MV SP,$Xib +|| [A0] ADD B2,$K256,$K256 +|| [A0] MV $CTXA,$CTXB +|| [A0] SUBAW SP,2,SP ; reserve two words above buffer + LDW *${CTXA}[0],$A ; load ctx +|| LDW *${CTXB}[4],$E + LDW *${CTXA}[1],$B +|| LDW *${CTXB}[5],$F + LDW *${CTXA}[2],$C +|| LDW *${CTXB}[6],$G + LDW *${CTXA}[3],$D +|| LDW *${CTXB}[7],$H + + LDNW *$INP++,$Xn ; pre-fetch input + LDW *$K256++,$K ; pre-fetch K256[0] + MVK 14,B0 ; loop counters + MVK 47,B1 +|| ADDAW $Xia,9,$Xia +outerloop?: + SUB A0,1,A0 +|| MV $A,$Actx +|| MV $E,$Ectx +|| MVD $B,$Bctx +|| MVD $F,$Fctx + MV $C,$Cctx +|| MV $G,$Gctx +|| MVD
$D,$Dctx +|| MVD $H,$Hctx +|| SWAP4 $Xn,$X0 + + SPLOOPD 8 ; BODY_00_14 +|| MVC B0,ILC +|| SWAP2 $X0,$X0 + + LDNW *$INP++,$Xn +|| ROTL $A,30,$S0 +|| OR $A,$B,$Maj +|| AND $A,$B,$t2a +|| ROTL $E,26,$S1 +|| AND $F,$E,$Ch +|| ANDN $G,$E,$t2e + ROTL $A,19,$t0a +|| AND $C,$Maj,$Maj +|| ROTL $E,21,$t0e +|| XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g) + ROTL $A,10,$t1a +|| OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b) +|| ROTL $E,7,$t1e +|| ADD $K,$H,$T1 ; T1 = h + K256[i] + ADD $X0,$T1,$T1 ; T1 += X[i]; +|| STW $X0,*$Xib++ +|| XOR $t0a,$S0,$S0 +|| XOR $t0e,$S1,$S1 + XOR $t1a,$S0,$S0 ; Sigma0(a) +|| XOR $t1e,$S1,$S1 ; Sigma1(e) +|| LDW *$K256++,$K ; pre-fetch K256[i+1] +|| ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g) + ADD $S1,$T1,$T1 ; T1 += Sigma1(e) +|| ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c) +|| ROTL $G,0,$H ; h = g +|| MV $F,$G ; g = f +|| MV $X0,$X14 +|| SWAP4 $Xn,$X0 + SWAP2 $X0,$X0 +|| MV $E,$F ; f = e +|| ADD $D,$T1,$E ; e = d + T1 +|| MV $C,$D ; d = c + MV $B,$C ; c = b +|| MV $A,$B ; b = a +|| ADD $T1,$T2,$A ; a = T1 + T2 + SPKERNEL + + ROTL $A,30,$S0 ; BODY_15 +|| OR $A,$B,$Maj +|| AND $A,$B,$t2a +|| ROTL $E,26,$S1 +|| AND $F,$E,$Ch +|| ANDN $G,$E,$t2e +|| LDW *${Xib}[1],$Xn ; modulo-scheduled + ROTL $A,19,$t0a +|| AND $C,$Maj,$Maj +|| ROTL $E,21,$t0e +|| XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g) +|| LDW *${Xib}[2],$X1 ; modulo-scheduled + ROTL $A,10,$t1a +|| OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b) +|| ROTL $E,7,$t1e +|| ADD $K,$H,$T1 ; T1 = h + K256[i] + ADD $X0,$T1,$T1 ; T1 += X[i]; +|| STW $X0,*$Xib++ +|| XOR $t0a,$S0,$S0 +|| XOR $t0e,$S1,$S1 + XOR $t1a,$S0,$S0 ; Sigma0(a) +|| XOR $t1e,$S1,$S1 ; Sigma1(e) +|| LDW *$K256++,$K ; pre-fetch K256[i+1] +|| ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g) + ADD $S1,$T1,$T1 ; T1 += Sigma1(e) +|| ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c) +|| ROTL $G,0,$H ; h = g +|| MV $F,$G ; g = f +|| MV $X0,$X15 + MV $E,$F ; f = e +|| ADD $D,$T1,$E ; e = d + T1 +|| MV $C,$D ; d = c +|| MV $Xn,$X0 ; modulo-scheduled +|| LDW 
*$Xia,$X9 ; modulo-scheduled +|| ROTL $X1,25,$t0e ; modulo-scheduled +|| ROTL $X14,15,$t0a ; modulo-scheduled + SHRU $X1,3,$s0 ; modulo-scheduled +|| SHRU $X14,10,$s1 ; modulo-scheduled +|| ROTL $B,0,$C ; c = b +|| MV $A,$B ; b = a +|| ADD $T1,$T2,$A ; a = T1 + T2 + + SPLOOPD 10 ; BODY_16_63 +|| MVC B1,ILC +|| ROTL $X1,14,$t1e ; modulo-scheduled +|| ROTL $X14,13,$t1a ; modulo-scheduled + + XOR $t0e,$s0,$s0 +|| XOR $t0a,$s1,$s1 +|| MV $X15,$X14 +|| MV $X1,$Xn + XOR $t1e,$s0,$s0 ; sigma0(X[i+1]) +|| XOR $t1a,$s1,$s1 ; sigma1(X[i+14]) +|| LDW *${Xib}[2],$X1 ; modulo-scheduled + ROTL $A,30,$S0 +|| OR $A,$B,$Maj +|| AND $A,$B,$t2a +|| ROTL $E,26,$S1 +|| AND $F,$E,$Ch +|| ANDN $G,$E,$t2e +|| ADD $X9,$X0,$X0 ; X[i] += X[i+9] + ROTL $A,19,$t0a +|| AND $C,$Maj,$Maj +|| ROTL $E,21,$t0e +|| XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g) +|| ADD $s0,$X0,$X0 ; X[i] += sigma0(X[i+1]) + ROTL $A,10,$t1a +|| OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b) +|| ROTL $E,7,$t1e +|| ADD $H,$K,$T1 ; T1 = h + K256[i] +|| ADD $s1,$X0,$X0 ; X[i] += sigma1(X[i+14]) + XOR $t0a,$S0,$S0 +|| XOR $t0e,$S1,$S1 +|| ADD $X0,$T1,$T1 ; T1 += X[i] +|| STW $X0,*$Xib++ + XOR $t1a,$S0,$S0 ; Sigma0(a) +|| XOR $t1e,$S1,$S1 ; Sigma1(e) +|| ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g) +|| MV $X0,$X15 +|| ROTL $G,0,$H ; h = g +|| LDW *$K256++,$K ; pre-fetch K256[i+1] + ADD $S1,$T1,$T1 ; T1 += Sigma1(e) +|| ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c) +|| MV $F,$G ; g = f +|| MV $Xn,$X0 ; modulo-scheduled +|| LDW *++$Xia,$X9 ; modulo-scheduled +|| ROTL $X1,25,$t0e ; modulo-scheduled +|| ROTL $X14,15,$t0a ; modulo-scheduled + ROTL $X1,14,$t1e ; modulo-scheduled +|| ROTL $X14,13,$t1a ; modulo-scheduled +|| MV $E,$F ; f = e +|| ADD $D,$T1,$E ; e = d + T1 +|| MV $C,$D ; d = c +|| MV $B,$C ; c = b + MV $A,$B ; b = a +|| ADD $T1,$T2,$A ; a = T1 + T2 +|| SHRU $X1,3,$s0 ; modulo-scheduled +|| SHRU $X14,10,$s1 ; modulo-scheduled + SPKERNEL + + [A0] B outerloop?
+|| [A0] LDNW *$INP++,$Xn ; pre-fetch input +|| [A0] ADDK -260,$K256 ; rewind K256 +|| ADD $Actx,$A,$A ; accumulate ctx +|| ADD $Ectx,$E,$E +|| ADD $Bctx,$B,$B + ADD $Fctx,$F,$F +|| ADD $Cctx,$C,$C +|| ADD $Gctx,$G,$G +|| ADD $Dctx,$D,$D +|| ADD $Hctx,$H,$H +|| [A0] LDW *$K256++,$K ; pre-fetch K256[0] + + [!A0] BNOP RA +||[!A0] MV $CTXA,$CTXB + [!A0] MV FP,SP ; restore stack pointer +||[!A0] LDW *FP[0],FP ; restore frame pointer + [!A0] STW $A,*${CTXA}[0] ; save ctx +||[!A0] STW $E,*${CTXB}[4] +||[!A0] MVK 0,B0 + [!A0] STW $B,*${CTXA}[1] +||[!A0] STW $F,*${CTXB}[5] +||[!A0] MVC B0,AMR ; clear AMR + STW $C,*${CTXA}[2] +|| STW $G,*${CTXB}[6] + STW $D,*${CTXA}[3] +|| STW $H,*${CTXB}[7] + .endasmfunc + + .sect ".const:sha_asm" + .align 128 +K256: + .uword 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .uword 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .uword 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .uword 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .uword 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .uword 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .uword 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .uword 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .uword 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .uword 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .uword 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .uword 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .uword 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .uword 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .uword 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .uword 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + .cstring "SHA256 block transform for C64x+, CRYPTOGAMS by " + .align 4 + +___ + +print $code; diff --git a/crypto/sha/asm/sha512-c64xplus.pl b/crypto/sha/asm/sha512-c64xplus.pl new file mode 100644 index 0000000000..e4e7c042fd --- /dev/null +++ b/crypto/sha/asm/sha512-c64xplus.pl @@ -0,0 +1,421 @@ +#!/usr/bin/env perl +# +# 
==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA512 for C64x+. +# +# January 2012 +# +# Performance is 19 cycles per processed byte. Compared to block +# transform function from sha512.c compiled with cl6x with -mv6400+ +# -o2 -DOPENSSL_SMALL_FOOTPRINT it's almost 7x faster and 2x smaller. +# Loop unroll won't make it, this implementation, any faster, because +# it's effectively dominated by SHRU||SHL pairs and you can't schedule +# more of them. +# +# !!! Note that this module uses AMR, which means that all interrupt +# service routines are expected to preserve it and for own well-being +# zero it upon entry. + +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments + $K512="A3"; + +($Ahi,$Actxhi,$Bhi,$Bctxhi,$Chi,$Cctxhi,$Dhi,$Dctxhi, + $Ehi,$Ectxhi,$Fhi,$Fctxhi,$Ghi,$Gctxhi,$Hhi,$Hctxhi)=map("A$_",(16..31)); +($Alo,$Actxlo,$Blo,$Bctxlo,$Clo,$Cctxlo,$Dlo,$Dctxlo, + $Elo,$Ectxlo,$Flo,$Fctxlo,$Glo,$Gctxlo,$Hlo,$Hctxlo)=map("B$_",(16..31)); + +($S1hi,$CHhi,$S0hi,$t0hi)=map("A$_",(10..13)); +($S1lo,$CHlo,$S0lo,$t0lo)=map("B$_",(10..13)); +($T1hi, $T2hi)= ("A6","A7"); +($T1lo,$T1carry,$T2lo,$T2carry)=("B6","B7","B8","B9"); +($Khi,$Klo)=("A9","A8"); +($MAJhi,$MAJlo)=($T2hi,$T2lo); +($t1hi,$t1lo)=($Khi,"B2"); + $CTXB=$t1lo; + +($Xihi,$Xilo)=("A5","B5"); # circular/ring buffer + +$code.=<<___; + .text + .if __TI_EABI__ + .nocmp + .endif + + .asg B3,RA + .asg A15,FP + .asg B15,SP + + .if .BIG_ENDIAN + .asg $Khi,KHI + .asg $Klo,KLO + .else + .asg $Khi,KLO + .asg $Klo,KHI + .endif + + .global _sha512_block_data_order +_sha512_block_data_order: + .asmfunc 
stack_usage(40+128) + MV $NUM,A0 ; reassign $NUM +|| MVK -128,B0 + [!A0] BNOP RA ; if ($NUM==0) return; +|| [A0] STW FP,*SP--(40) ; save frame pointer +|| [A0] MV SP,FP + [A0] STDW B13:B12,*SP[4] +|| [A0] MVK 0x00404,B1 + [A0] STDW B11:B10,*SP[3] +|| [A0] STDW A13:A12,*FP[-3] +|| [A0] MVKH 0x60000,B1 + [A0] STDW A11:A10,*SP[1] +|| [A0] MVC B1,AMR ; setup circular addressing +|| [A0] ADD B0,SP,SP ; alloca(128) + .if __TI_EABI__ + [A0] AND B0,SP,SP ; align stack at 128 bytes +|| [A0] ADDKPC _sha512_block_data_order,B1 +|| [A0] MVKL \$PCR_OFFSET(K512,_sha512_block_data_order),$K512 + [A0] MVKH \$PCR_OFFSET(K512,_sha512_block_data_order),$K512 +|| [A0] SUBAW SP,2,SP ; reserve two words above buffer + .else + [A0] AND B0,SP,SP ; align stack at 128 bytes +|| [A0] ADDKPC _sha512_block_data_order,B1 +|| [A0] MVKL (K512-_sha512_block_data_order),$K512 + [A0] MVKH (K512-_sha512_block_data_order),$K512 +|| [A0] SUBAW SP,2,SP ; reserve two words above buffer + .endif + ADDAW SP,3,$Xilo + ADDAW SP,2,$Xihi + +|| MV $CTXA,$CTXB + LDW *${CTXA}[0^.LITTLE_ENDIAN],$Ahi ; load ctx +|| LDW *${CTXB}[1^.LITTLE_ENDIAN],$Alo +|| ADD B1,$K512,$K512 + LDW *${CTXA}[2^.LITTLE_ENDIAN],$Bhi +|| LDW *${CTXB}[3^.LITTLE_ENDIAN],$Blo + LDW *${CTXA}[4^.LITTLE_ENDIAN],$Chi +|| LDW *${CTXB}[5^.LITTLE_ENDIAN],$Clo + LDW *${CTXA}[6^.LITTLE_ENDIAN],$Dhi +|| LDW *${CTXB}[7^.LITTLE_ENDIAN],$Dlo + LDW *${CTXA}[8^.LITTLE_ENDIAN],$Ehi +|| LDW *${CTXB}[9^.LITTLE_ENDIAN],$Elo + LDW *${CTXA}[10^.LITTLE_ENDIAN],$Fhi +|| LDW *${CTXB}[11^.LITTLE_ENDIAN],$Flo + LDW *${CTXA}[12^.LITTLE_ENDIAN],$Ghi +|| LDW *${CTXB}[13^.LITTLE_ENDIAN],$Glo + LDW *${CTXA}[14^.LITTLE_ENDIAN],$Hhi +|| LDW *${CTXB}[15^.LITTLE_ENDIAN],$Hlo + + LDNDW *$INP++,B11:B10 ; pre-fetch input + LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[0] +outerloop?: + MVK 15,B0 ; loop counters +|| MVK 64,B1 +|| SUB A0,1,A0 + MV $Ahi,$Actxhi +|| MV $Alo,$Actxlo +|| MV $Bhi,$Bctxhi +|| MV $Blo,$Bctxlo +|| MV $Chi,$Cctxhi +|| MV $Clo,$Cctxlo +|| MVD $Dhi,$Dctxhi +|| 
MVD $Dlo,$Dctxlo + MV $Ehi,$Ectxhi +|| MV $Elo,$Ectxlo +|| MV $Fhi,$Fctxhi +|| MV $Flo,$Fctxlo +|| MV $Ghi,$Gctxhi +|| MV $Glo,$Gctxlo +|| MVD $Hhi,$Hctxhi +|| MVD $Hlo,$Hctxlo +loop0_15?: + .if .BIG_ENDIAN + MV B11,$T1hi +|| MV B10,$T1lo + .else + SWAP4 B10,$T1hi +|| SWAP4 B11,$T1lo + SWAP2 $T1hi,$T1hi +|| SWAP2 $T1lo,$T1lo + .endif +loop16_79?: + STW $T1hi,*$Xihi++[2] +|| STW $T1lo,*$Xilo++[2] ; X[i] = T1 +|| ADD $Hhi,$T1hi,$T1hi +|| ADDU $Hlo,$T1lo,$T1carry:$T1lo ; T1 += h +|| SHRU $Ehi,14,$S1hi +|| SHL $Ehi,32-14,$S1lo + XOR $Fhi,$Ghi,$CHhi +|| XOR $Flo,$Glo,$CHlo +|| ADD KHI,$T1hi,$T1hi +|| ADDU KLO,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += K512[i] +|| SHRU $Elo,14,$t0lo +|| SHL $Elo,32-14,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| AND $Ehi,$CHhi,$CHhi +|| AND $Elo,$CHlo,$CHlo +|| ROTL $Ghi,0,$Hhi +|| ROTL $Glo,0,$Hlo ; h = g +|| SHRU $Ehi,18,$t0hi +|| SHL $Ehi,32-18,$t0lo + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| XOR $Ghi,$CHhi,$CHhi +|| XOR $Glo,$CHlo,$CHlo ; Ch(e,f,g) = ((f^g)&e)^g +|| ROTL $Fhi,0,$Ghi +|| ROTL $Flo,0,$Glo ; g = f +|| SHRU $Elo,18,$t0lo +|| SHL $Elo,32-18,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| OR $Ahi,$Bhi,$MAJhi +|| OR $Alo,$Blo,$MAJlo +|| ROTL $Ehi,0,$Fhi +|| ROTL $Elo,0,$Flo ; f = e +|| SHRU $Ehi,41-32,$t0lo +|| SHL $Ehi,64-41,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| AND $Chi,$MAJhi,$MAJhi +|| AND $Clo,$MAJlo,$MAJlo +|| ROTL $Dhi,0,$Ehi +|| ROTL $Dlo,0,$Elo ; e = d +|| SHRU $Elo,41-32,$t0hi +|| SHL $Elo,64-41,$t0lo + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo ; Sigma1(e) +|| AND $Ahi,$Bhi,$t1hi +|| AND $Alo,$Blo,$t1lo +|| ROTL $Chi,0,$Dhi +|| ROTL $Clo,0,$Dlo ; d = c +|| SHRU $Ahi,28,$S0hi +|| SHL $Ahi,32-28,$S0lo + OR $t1hi,$MAJhi,$MAJhi +|| OR $t1lo,$MAJlo,$MAJlo ; Maj(a,b,c) = ((a|b)&c)|(a&b) +|| ADD $CHhi,$T1hi,$T1hi +|| ADDU $CHlo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += Ch(e,f,g) +|| ROTL $Bhi,0,$Chi +|| ROTL $Blo,0,$Clo ; c = b +|| SHRU $Alo,28,$t0lo +|| SHL 
$Alo,32-28,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $S1hi,$T1hi,$T1hi +|| ADDU $S1lo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += Sigma1(e) +|| ROTL $Ahi,0,$Bhi +|| ROTL $Alo,0,$Blo ; b = a +|| SHRU $Ahi,34-32,$t0lo +|| SHL $Ahi,64-34,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $MAJhi,$T1hi,$T2hi +|| ADDU $MAJlo,$T1carry:$T1lo,$T2carry:$T2lo ; T2 = T1+Maj(a,b,c) +|| SHRU $Alo,34-32,$t0hi +|| SHL $Alo,64-34,$t0lo + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $Ehi,$T1hi,$T1hi +|| ADDU $Elo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += e +|| [B0] BNOP loop0_15? +|| SHRU $Ahi,39-32,$t0lo +|| SHL $Ahi,64-39,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| [B0] LDNDW *$INP++,B11:B10 ; pre-fetch input +||[!B1] BNOP break? +|| SHRU $Alo,39-32,$t0hi +|| SHL $Alo,64-39,$t0lo + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo ; Sigma0(a) +|| ADD $T1carry,$T1hi,$Ehi +|| MV $T1lo,$Elo ; e = T1 +||[!B0] LDW *${Xihi}[28],$T1hi +||[!B0] LDW *${Xilo}[28],$T1lo ; X[i+14] + ADD $S0hi,$T2hi,$T2hi +|| ADDU $S0lo,$T2carry:$T2lo,$T2carry:$T2lo ; T2 += Sigma0(a) +|| [B1] LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[i] + NOP ; avoid cross-path stall + ADD $T2carry,$T2hi,$Ahi +|| MV $T2lo,$Alo ; a = T2 +|| [B0] SUB B0,1,B0 +;;===== branch to loop00_15? is taken here + NOP +;;===== branch to break? 
is taken here + LDW *${Xihi}[2],$T2hi +|| LDW *${Xilo}[2],$T2lo ; X[i+1] +|| SHRU $T1hi,19,$S1hi +|| SHL $T1hi,32-19,$S1lo + SHRU $T1lo,19,$t0lo +|| SHL $T1lo,32-19,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| SHRU $T1hi,61-32,$t0lo +|| SHL $T1hi,64-61,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| SHRU $T1lo,61-32,$t0hi +|| SHL $T1lo,64-61,$t0lo + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| SHRU $T1hi,6,$t0hi +|| SHL $T1hi,32-6,$t0lo + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| SHRU $T1lo,6,$t0lo +|| LDW *${Xihi}[18],$T1hi +|| LDW *${Xilo}[18],$T1lo ; X[i+9] + XOR $t0lo,$S1lo,$S1lo ; sigma1(Xi[i+14]) + +|| LDW *${Xihi}[0],$CHhi +|| LDW *${Xilo}[0],$CHlo ; X[i] +|| SHRU $T2hi,1,$S0hi +|| SHL $T2hi,32-1,$S0lo + SHRU $T2lo,1,$t0lo +|| SHL $T2lo,32-1,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| SHRU $T2hi,8,$t0hi +|| SHL $T2hi,32-8,$t0lo + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| SHRU $T2lo,8,$t0lo +|| SHL $T2lo,32-8,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $S1hi,$T1hi,$T1hi +|| ADDU $S1lo,$T1lo,$T1carry:$T1lo ; T1 = X[i+9]+sigma1() +|| [B1] BNOP loop16_79? +|| SHRU $T2hi,7,$t0hi +|| SHL $T2hi,32-7,$t0lo + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $CHhi,$T1hi,$T1hi +|| ADDU $CHlo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += X[i] +|| SHRU $T2lo,7,$t0lo + XOR $t0lo,$S0lo,$S0lo ; sigma0(Xi[i+1] + + ADD $S0hi,$T1hi,$T1hi +|| ADDU $S0lo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += sigma0() +|| [B1] SUB B1,1,B1 + NOP ; avoid cross-path stall + ADD $T1carry,$T1hi,$T1hi +;;===== branch to loop16_79? 
is taken here
+
+break?:						; end of block: working variables += ctx copies
+	ADD	$Ahi,$Actxhi,$Ahi		; accumulate ctx
+||	ADDU	$Alo,$Actxlo,$Actxlo:$Alo	; low-word sum in $Alo, carry lands in $Actxlo
+|| [A0]	LDNDW	*$INP++,B11:B10			; pre-fetch input
+|| [A0]	ADDK	-640,$K512			; rewind pointer to K512
+	ADD	$Bhi,$Bctxhi,$Bhi
+||	ADDU	$Blo,$Bctxlo,$Bctxlo:$Blo
+|| [A0]	LDDW	*$K512++,$Khi:$Klo		; pre-fetch K512[0]
+	ADD	$Chi,$Cctxhi,$Chi
+||	ADDU	$Clo,$Cctxlo,$Cctxlo:$Clo
+||	ADD	$Actxlo,$Ahi,$Ahi		; add the carry left in $Actxlo by ADDU
+||[!A0]	MV	$CTXA,$CTXB			; second base register for the paired stores
+	ADD	$Dhi,$Dctxhi,$Dhi
+||	ADDU	$Dlo,$Dctxlo,$Dctxlo:$Dlo
+||	ADD	$Bctxlo,$Bhi,$Bhi
+||[!A0]	STW	$Ahi,*${CTXA}[0^.LITTLE_ENDIAN]	; save ctx
+||[!A0]	STW	$Alo,*${CTXB}[1^.LITTLE_ENDIAN]	; index^.LITTLE_ENDIAN: presumably swaps hi/lo slot on little-endian - confirm
+	ADD	$Ehi,$Ectxhi,$Ehi
+||	ADDU	$Elo,$Ectxlo,$Ectxlo:$Elo
+||	ADD	$Cctxlo,$Chi,$Chi
+|| [A0]	BNOP	outerloop?			; A0 != 0: loop back (A0 is set outside this view; presumably blocks left)
+||[!A0]	STW	$Bhi,*${CTXA}[2^.LITTLE_ENDIAN]
+||[!A0]	STW	$Blo,*${CTXB}[3^.LITTLE_ENDIAN]
+	ADD	$Fhi,$Fctxhi,$Fhi
+||	ADDU	$Flo,$Fctxlo,$Fctxlo:$Flo
+||	ADD	$Dctxlo,$Dhi,$Dhi
+||[!A0]	STW	$Chi,*${CTXA}[4^.LITTLE_ENDIAN]
+||[!A0]	STW	$Clo,*${CTXB}[5^.LITTLE_ENDIAN]
+	ADD	$Ghi,$Gctxhi,$Ghi
+||	ADDU	$Glo,$Gctxlo,$Gctxlo:$Glo
+||	ADD	$Ectxlo,$Ehi,$Ehi
+||[!A0]	STW	$Dhi,*${CTXA}[6^.LITTLE_ENDIAN]
+||[!A0]	STW	$Dlo,*${CTXB}[7^.LITTLE_ENDIAN]
+	ADD	$Hhi,$Hctxhi,$Hhi
+||	ADDU	$Hlo,$Hctxlo,$Hctxlo:$Hlo
+||	ADD	$Fctxlo,$Fhi,$Fhi
+||[!A0]	STW	$Ehi,*${CTXA}[8^.LITTLE_ENDIAN]
+||[!A0]	STW	$Elo,*${CTXB}[9^.LITTLE_ENDIAN]
+	ADD	$Gctxlo,$Ghi,$Ghi
+||[!A0]	STW	$Fhi,*${CTXA}[10^.LITTLE_ENDIAN]
+||[!A0]	STW	$Flo,*${CTXB}[11^.LITTLE_ENDIAN]
+	ADD	$Hctxlo,$Hhi,$Hhi
+||[!A0]	STW	$Ghi,*${CTXA}[12^.LITTLE_ENDIAN]
+||[!A0]	STW	$Glo,*${CTXB}[13^.LITTLE_ENDIAN]
+;;===== branch to outerloop? 
is taken here
+
+	STW	$Hhi,*${CTXA}[14^.LITTLE_ENDIAN]	; unpredicated: reached only when the branch above is not taken
+||	STW	$Hlo,*${CTXB}[15^.LITTLE_ENDIAN]
+||	MVK	-40,B0
+	ADD	FP,B0,SP			; destroy circular buffer
+||	LDDW	*FP[-4],A11:A10			; reload A10-A13/B10-B13 (presumably saved by the prologue, outside this view)
+	LDDW	*SP[2],A13:A12
+||	LDDW	*FP[-2],B11:B10
+	LDDW	*SP[4],B13:B12
+||	BNOP	RA				; return; the packets below still execute before the branch lands
+	LDW	*++SP(40),FP			; restore frame pointer
+	MVK	0,B0
+	MVC	B0,AMR				; clear AMR
+	NOP	2				; wait till FP is committed
+	.endasmfunc
+
+	.sect	".const:sha_asm"
+	.align	128
+K512:						; 80 SHA-512 round constants, each stored as high word, low word
+	.uword	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
+	.uword	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
+	.uword	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
+	.uword	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
+	.uword	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
+	.uword	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
+	.uword	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
+	.uword	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
+	.uword	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
+	.uword	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
+	.uword	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
+	.uword	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
+	.uword	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
+	.uword	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
+	.uword	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
+	.uword	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
+	.uword	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
+	.uword	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
+	.uword	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
+	.uword	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
+	.uword	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
+	.uword	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
+	.uword	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
+	.uword	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
+	.uword	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
+	.uword	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
+	.uword	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
+	.uword	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
+	.uword	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
+	.uword	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
+	.uword	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
+	.uword	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
+	.uword	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
+	.uword	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
+	.uword	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
+	.uword	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
+	.uword	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
+	.uword	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
+	.uword	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
+	.uword	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
+	.cstring "SHA512 block transform for C64x+, CRYPTOGAMS by "
+	.align	4
+___
+
+print $code;	# emit the generated assembly on STDOUT
+close STDOUT or die "error closing STDOUT: $!";	# buffered write errors only surface at close; fail instead of leaving a truncated file
-- 
2.40.0