From d17b59e49f32ec47be8e2418b439c239a5cd9618 Mon Sep 17 00:00:00 2001
From: Andy Polyakov
Date: Sun, 14 Oct 2012 16:51:27 +0000
Subject: [PATCH] md5-sparcv9.pl: avoid %asi modifications, improve short input performance by 30-20%.

---
 crypto/md5/asm/md5-sparcv9.pl | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

# Review notes (this region, between the diffstat and "diff --git", is
# ignored when the patch is applied):
#
# * Purpose: the hardware-MD5 path no longer touches the %asi register.
#   The removed lines saved %asi in %g5 ("rd %asi, %g5"), set it to 0x88
#   (ASI_PRIMARY_LITTLE per the removed comment), addressed the context
#   as [%o0 + imm] %asi, and restored %asi in the slot after "retl".
#
# * Replacement: the added lda/sta lines name ASI 0x88 directly in the
#   instruction and address the context as register+register,
#   [%o0 + %g0] and [%o0 + %g1] with %g1 = 4 ("mov 4, %g1").  Words 2 and
#   3 are reached by "add %o0, 8, %o0"; on the load side %o0 is put back
#   by "sub %o0, 8, %o0", which takes the place of the old fourth lda
#   directly after "bne,pn %icc, .Lhwunaligned".
#
# * Store side (.Lhwfinish): %o0 is advanced by 8 and is not restored
#   before "retl"; a "nop" replaces the former %asi restore.
#   NOTE(review): presumably fine because callers do not rely on %o0
#   after return -- confirm against the calling convention in use.
#
# * NOTE(review): .Lhwfinish uses %g1 as the constant 4 set at entry.
#   The loop body and .Lhwunaligned are outside these hunks -- confirm
#   that neither writes %g1.
#
# * NOTE(review): this copy of the patch had lost its line breaks.  Line
#   structure was restored from the diff syntax, the hunk line counts and
#   the diffstat; intra-line whitespace (tabs) and the exact position of
#   blank context lines are a best-effort reconstruction.  Regenerate the
#   patch from commit d17b59e49f32 before applying it.

diff --git a/crypto/md5/asm/md5-sparcv9.pl b/crypto/md5/asm/md5-sparcv9.pl
index 062a0738c4..ef16666cc3 100644
--- a/crypto/md5/asm/md5-sparcv9.pl
+++ b/crypto/md5/asm/md5-sparcv9.pl
@@ -12,7 +12,7 @@
 # MD5 for SPARCv9, 6.9 cycles per byte on UltraSPARC, >40% faster than
 # code generated by Sun C 5.2.
 
-# SPARC T4 MD5 hardware achieves 3.24 cycles per byte, which is 2.1x
+# SPARC T4 MD5 hardware achieves 3.20 cycles per byte, which is 2.1x
 # faster than software. Multi-process benchmark saturates at 12x
 # single-process result on 8-core processor, or ~11GBps per 2.85GHz
 # socket.
@@ -221,15 +221,15 @@ md5_block_asm_data_order:
 	be	.Lsoftware
 	nop
 
-	rd	%asi, %g5
-	wr	%g0, 0x88, %asi		! ASI_PRIMARY_LITTLE
-
-	lda	[%o0 + 0x00] %asi, %f0	! load context
-	lda	[%o0 + 0x04] %asi, %f1
+	mov	4, %g1
 	andcc	%o1, 0x7, %g0
-	lda	[%o0 + 0x08] %asi, %f2
+	lda	[%o0 + %g0]0x88, %f0		! load context
+	lda	[%o0 + %g1]0x88, %f1
+	add	%o0, 8, %o0
+	lda	[%o0 + %g0]0x88, %f2
+	lda	[%o0 + %g1]0x88, %f3
 	bne,pn	%icc, .Lhwunaligned
-	lda	[%o0 + 0x0c] %asi, %f3
+	sub	%o0, 8, %o0
 
 .Lhw_loop:
 	ldd	[%o1 + 0x00], %f8
@@ -250,12 +250,13 @@ md5_block_asm_data_order:
 	nop
 
 .Lhwfinish:
-	sta	%f0, [%o0 + 0x00] %asi	! store context
-	sta	%f1, [%o0 + 0x04] %asi
-	sta	%f2, [%o0 + 0x08] %asi
-	sta	%f3, [%o0 + 0x0c] %asi
+	sta	%f0, [%o0 + %g0]0x88	! store context
+	sta	%f1, [%o0 + %g1]0x88
+	add	%o0, 8, %o0
+	sta	%f2, [%o0 + %g0]0x88
+	sta	%f3, [%o0 + %g1]0x88
 	retl
-	wr	%g5, 0x0, %asi		! restore %asi
+	nop
 
 .align	8
 .Lhwunaligned:
-- 
2.40.0