rc4-x86_64.pl: "Westmere" optimization.

author Andy Polyakov <appro@openssl.org>

Thu, 13 May 2010 21:01:24 +0000 (21:01 +0000)

committer Andy Polyakov <appro@openssl.org>

Thu, 13 May 2010 21:01:24 +0000 (21:01 +0000)
author Andy Polyakov <appro@openssl.org>
Thu, 13 May 2010 21:01:24 +0000 (21:01 +0000)
committer Andy Polyakov <appro@openssl.org>
Thu, 13 May 2010 21:01:24 +0000 (21:01 +0000)
diff --git a/crypto/rc4/asm/rc4-x86_64.pl b/crypto/rc4/asm/rc4-x86_64.pl

index 677be5fe25badfe44a315855a827c65d71543e72..23fe4d99963b73a8c4f0016b680471b82f30999a 100755 (executable)
--- a/crypto/rc4/asm/rc4-x86_64.pl
+++ b/crypto/rc4/asm/rc4-x86_64.pl
@@ -58,6 +58,10 @@
  # fit for Core2 and therefore the code was modified to skip cloop8 on
  # this CPU.
  
+# Intel Westmere was observed to perform suboptimally. Adding yet
+# another movzb to cloop1 improved performance by almost 50%! Core2
+# performance improved too, but only marginally...
+
  $flavour = shift;
  $output  = shift;
  if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -229,6 +233,7 @@ $code.=<<___;
  .align 16
  .Lcloop1:
         add     $TX[0]#b,$YY#b
+       movzb   $YY#b,$YY#d
         movzb   ($dat,$YY),$TY#d
         movb    $TX[0]#b,($dat,$YY)
         movb    $TY#b,($dat,$XX[0])
author	Andy Polyakov <appro@openssl.org>
	Thu, 13 May 2010 21:01:24 +0000 (21:01 +0000)
committer	Andy Polyakov <appro@openssl.org>
	Thu, 13 May 2010 21:01:24 +0000 (21:01 +0000)