From 629fd3aa913f547f6228740d5068193f283abe94 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Thu, 13 May 2010 21:01:24 +0000 Subject: [PATCH] rc4-x86_64.pl: "Westmere" optimization. --- crypto/rc4/asm/rc4-x86_64.pl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/crypto/rc4/asm/rc4-x86_64.pl b/crypto/rc4/asm/rc4-x86_64.pl index 677be5fe25..23fe4d9996 100755 --- a/crypto/rc4/asm/rc4-x86_64.pl +++ b/crypto/rc4/asm/rc4-x86_64.pl @@ -58,6 +58,10 @@ # fit for Core2 and therefore the code was modified to skip cloop8 on # this CPU. +# Intel Westmere was observed to perform suboptimally. Adding yet +# another movzb to cloop1 improved performance by almost 50%! Core2 +# performance is improved too, but nominally... + $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } @@ -229,6 +233,7 @@ $code.=<<___; .align 16 .Lcloop1: add $TX[0]#b,$YY#b + movzb $YY#b,$YY#d movzb ($dat,$YY),$TY#d movb $TX[0]#b,($dat,$YY) movb $TY#b,($dat,$XX[0]) -- 2.40.0