From 149ca7128cfefebcbf8f043e998e85affb07abc2 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Tue, 4 Oct 2011 11:05:16 +0000 Subject: [PATCH] e_padlock-x86*.pl: Nano-related update. --- engines/asm/e_padlock-x86.pl | 45 +++++++++++++++++++++++---------- engines/asm/e_padlock-x86_64.pl | 25 ++++++++++++------ 2 files changed, 49 insertions(+), 21 deletions(-) diff --git a/engines/asm/e_padlock-x86.pl b/engines/asm/e_padlock-x86.pl index 672d91257a..df8f56b521 100644 --- a/engines/asm/e_padlock-x86.pl +++ b/engines/asm/e_padlock-x86.pl @@ -15,14 +15,21 @@ # mode and ~75% in CBC mode. For aligned data improvement can be # observed for short inputs only, e.g. 45% for 64-byte messages in # ECB mode, 20% in CBC. Difference in performance for aligned vs. -# misaligned data depends on misalignment and is either ~1.8x or -# ~2.9x. These are approximately same factors as for hardware support, -# so there is little reason to rely on the latter. It might actually -# hurt performance in mixture of aligned and misaligned buffers, -# because a) if you choose to flip 'align' flag on per-buffer basis, -# then you'd have to reload key context; b) if you choose to set -# 'align' flag permanently, it limits performance for aligned data -# to ~1/2. All results were collected on 1.5GHz C7. +# misaligned data depends on misalignment and is either ~1.8x or ~2.9x. +# These are approximately the same factors as for hardware support, so +# there is little reason to rely on the latter. On the contrary, it +# might actually hurt performance in a mixture of aligned and misaligned +# buffers, because a) if you choose to flip 'align' flag in control +# word on per-buffer basis, then you'd have to reload key context, +# which incurs a penalty; b) if you choose to set 'align' flag +# permanently, it limits performance even for aligned data to ~1/2. +# All above-mentioned results were collected on 1.5GHz C7. Nano on the +# other hand handles unaligned data more gracefully. 
Depending on +# algorithm and how unaligned data is, hardware can be up to 70% more +# efficient than the software alignment procedures below, nor does 'align' +# flag have any effect on aligned performance [if it has any meaning at all]. +# Therefore the suggestion is to unconditionally set 'align' flag on Nano +# for optimal performance. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../crypto/perlasm"); @@ -362,7 +369,7 @@ my ($mode,$opcode) = @_; &ret (); &function_end_B("padlock_sha1_oneshot"); -&function_begin_B("padlock_sha1"); +&function_begin_B("padlock_sha1_blocks"); &push ("edi"); &push ("esi"); &mov ("eax",-1); @@ -373,7 +380,7 @@ my ($mode,$opcode) = @_; &pop ("esi"); &pop ("edi"); &ret (); -&function_end_B("padlock_sha1"); +&function_end_B("padlock_sha1_blocks"); &function_begin_B("padlock_sha256_oneshot"); &push ("edi"); @@ -397,7 +404,7 @@ my ($mode,$opcode) = @_; &ret (); &function_end_B("padlock_sha256_oneshot"); -&function_begin_B("padlock_sha256"); +&function_begin_B("padlock_sha256_blocks"); &push ("edi"); &push ("esi"); &mov ("eax",-1); @@ -408,7 +415,19 @@ my ($mode,$opcode) = @_; &pop ("esi"); &pop ("edi"); &ret (); -&function_end_B("padlock_sha256"); +&function_end_B("padlock_sha256_blocks"); + +&function_begin_B("padlock_sha512_blocks"); + &push ("edi"); + &push ("esi"); + &mov ("edi",&wparam(0)); + &mov ("esi",&wparam(1)); + &mov ("ecx",&wparam(2)); + &data_byte(0xf3,0x0f,0xa6,0xe0); # rep xsha512 + &pop ("esi"); + &pop ("edi"); + &ret (); +&function_end_B("padlock_sha512_blocks"); &asciz ("VIA Padlock x86 module, CRYPTOGAMS by "); &align (16); @@ -417,7 +436,7 @@ my ($mode,$opcode) = @_; # Essentially this variable belongs in thread local storage. # Having this variable global on the other hand can only cause # few bogus key reloads [if any at all on signle-CPU system], -# so we accept the panalty... +# so we accept the penalty... 
&set_label("padlock_saved_context",4); &data_word(0); diff --git a/engines/asm/e_padlock-x86_64.pl b/engines/asm/e_padlock-x86_64.pl index f1c040e7f3..30e17c129c 100644 --- a/engines/asm/e_padlock-x86_64.pl +++ b/engines/asm/e_padlock-x86_64.pl @@ -151,15 +151,15 @@ padlock_sha1_oneshot: ret .size padlock_sha1_oneshot,.-padlock_sha1_oneshot -.globl padlock_sha1 -.type padlock_sha1,\@function,3 +.globl padlock_sha1_blocks +.type padlock_sha1_blocks,\@function,3 .align 16 -padlock_sha1: +padlock_sha1_blocks: mov \$-1,%rax mov %rdx,%rcx .byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1 ret -.size padlock_sha1,.-padlock_sha1 +.size padlock_sha1_blocks,.-padlock_sha1_blocks .globl padlock_sha256_oneshot .type padlock_sha256_oneshot,\@function,3 @@ -171,15 +171,23 @@ padlock_sha256_oneshot: ret .size padlock_sha256_oneshot,.-padlock_sha256_oneshot -.globl padlock_sha256 -.type padlock_sha256,\@function,3 +.globl padlock_sha256_blocks +.type padlock_sha256_blocks,\@function,3 .align 16 -padlock_sha256: +padlock_sha256_blocks: mov \$-1,%rax mov %rdx,%rcx .byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256 ret -.size padlock_sha256,.-padlock_sha256 +.size padlock_sha256_blocks,.-padlock_sha256_blocks + +.globl padlock_sha512_blocks,\@function,3 +.align 16 +padlock_sha512_blocks: + mov %rdx,%rcx + .byte 0xf3,0x0f,0xa6,0xe0 # rep xsha512 + ret +.size padlock_sha512_blocks,.-padlock_sha512_blocks ___ sub generate_mode { @@ -207,6 +215,7 @@ padlock_${mode}_encrypt: xor %eax,%eax xor %ebx,%ebx testl \$`1<<5`,($ctx) # align bit in control word + jnz .L${mode}_aligned test \$0x0f,$out setz %al # !out_misaligned test \$0x0f,$inp -- 2.40.0