x86[_64]cpuid.pl: handle new extensions.

author Andy Polyakov <appro@openssl.org>

Mon, 16 May 2011 20:35:11 +0000 (20:35 +0000)

committer Andy Polyakov <appro@openssl.org>

Mon, 16 May 2011 20:35:11 +0000 (20:35 +0000)
author Andy Polyakov <appro@openssl.org>
Mon, 16 May 2011 20:35:11 +0000 (20:35 +0000)
committer Andy Polyakov <appro@openssl.org>
Mon, 16 May 2011 20:35:11 +0000 (20:35 +0000)
diff --git a/crypto/x86_64cpuid.pl b/crypto/x86_64cpuid.pl

index ecfcfc763c7ac68febbc2a4a419ffcb13514feb6..ba95f0b2298d4f3959d7e5c7e9f8abbe6a43bb3f 100644 (file)
--- a/crypto/x86_64cpuid.pl
+++ b/crypto/x86_64cpuid.pl
@@ -47,7 +47,7 @@ OPENSSL_rdtsc:
  .type  OPENSSL_ia32_cpuid,\@abi-omnipotent
  .align 16
  OPENSSL_ia32_cpuid:
-       mov     %rbx,%r8
+       mov     %rbx,%r8                # save %rbx
  
         xor     %eax,%eax
         cpuid
@@ -79,7 +79,15 @@ OPENSSL_ia32_cpuid:
         # AMD specific
         mov     \$0x80000000,%eax
         cpuid
-       cmp     \$0x80000008,%eax
+       cmp     \$0x80000001,%eax
+       jb      .Lintel
+       mov     %eax,%r10d
+       mov     \$0x80000001,%eax
+       cpuid
+       or      %ecx,%r9d
+       and     \$0x00000801,%r9d       # isolate AMD XOP bit, 1<<11
+
+       cmp     \$0x80000008,%r10d
         jb      .Lintel
  
         mov     \$0x80000008,%eax
@@ -90,12 +98,12 @@ OPENSSL_ia32_cpuid:
         mov     \$1,%eax
         cpuid
         bt      \$28,%edx               # test hyper-threading bit
-       jnc     .Ldone
+       jnc     .Lgeneric
         shr     \$16,%ebx               # number of logical processors
         cmp     %r10b,%bl
-       ja      .Ldone
+       ja      .Lgeneric
         and     \$0xefffffff,%edx       # ~(1<<28)
-       jmp     .Ldone
+       jmp     .Lgeneric
  
  .Lintel:
         cmp     \$4,%r11d
@@ -121,21 +129,38 @@ OPENSSL_ia32_cpuid:
         or      \$0x40000000,%edx       # use reserved bit to skip unrolled loop
  .Lnotintel:
         bt      \$28,%edx               # test hyper-threading bit
-       jnc     .Ldone
+       jnc     .Lgeneric
         and     \$0xefffffff,%edx       # ~(1<<28)
         cmp     \$0,%r10d
-       je      .Ldone
+       je      .Lgeneric
  
         or      \$0x10000000,%edx       # 1<<28
         shr     \$16,%ebx
         cmp     \$1,%bl                 # see if cache is shared
-       ja      .Ldone
+       ja      .Lgeneric
         and     \$0xefffffff,%edx       # ~(1<<28)
-.Ldone:
+.Lgeneric:
+       and     \$0x00000800,%r9d       # isolate AMD XOP flag
+       and     \$0xfffff7ff,%ecx
+       or      %r9d,%ecx               # merge AMD XOP flag
+
         shl     \$32,%rcx
-       mov     %edx,%eax
-       mov     %r8,%rbx
-       or      %rcx,%rax
+       mov     %edx,%ebx
+       or      %rcx,%rbx               # compose capability vector in %rbx
+       bt      \$27+32,%rcx            # check OSXSAVE bit
+       jnc     .Lclear_avx
+       xor     %ecx,%ecx               # XCR0
+       .byte   0x0f,0x01,0xd0          # xgetbv
+       and     \$6,%eax                # isolate XMM and YMM state support
+       cmp     \$6,%eax
+       je      .Ldone
+.Lclear_avx:
+       mov     \$0xefffe7ff,%eax       # ~(1<<28|1<<12|1<<11)
+       shl     \$32,%rax
+       and     %rax,%rbx               # clear AVX, FMA and AMD XOP bits
+.Ldone:
+       mov     %rbx,%rax
+       mov     %r8,%rbx                # restore %rbx
         ret
  .size  OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
  
@@ -250,7 +275,7 @@ OPENSSL_instrument_bus:
         mov     %eax,$lasttick  # lasttick = tick
         mov     \$0,$lastdiff   # lastdiff = 0
         clflush ($out)
-       lock
+       .byte   0xf0            # lock
         add     $lastdiff,($out)
         jmp     .Loop
  .align 16
@@ -260,7 +285,7 @@ OPENSSL_instrument_bus:
         mov     %edx,$lasttick
         mov     %eax,$lastdiff
         clflush ($out)
-       lock
+       .byte   0xf0            # lock
         add     %eax,($out)
         lea     4($out),$out
         sub     \$1,$cnt
@@ -284,7 +309,7 @@ OPENSSL_instrument_bus2:
         mov     \$0,$lastdiff   # lastdiff = 0
  
         clflush ($out)
-       lock
+       .byte   0xf0            # lock
         add     $lastdiff,($out)
  
         rdtsc                   # collect 1st diff
@@ -294,7 +319,7 @@ OPENSSL_instrument_bus2:
         mov     %eax,$lastdiff  # lastdiff = diff
  .Loop2:
         clflush ($out)
-       lock
+       .byte   0xf0            # lock
         add     %eax,($out)     # accumulate diff
  
         sub     \$1,$max
diff --git a/crypto/x86cpuid.pl b/crypto/x86cpuid.pl

index 0513398739f0dc04a2f8e2c2fa514bee31fe62ee..f424c2debeed80da5038f4ec0740e918322a9a8a 100644 (file)
--- a/crypto/x86cpuid.pl
+++ b/crypto/x86cpuid.pl
@@ -20,7 +20,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
         &pop    ("eax");
         &xor    ("ecx","eax");
         &bt     ("ecx",21);
-       &jnc    (&label("done"));
+       &jnc    (&label("generic"));
         &xor    ("eax","eax");
         &cpuid  ();
         &mov    ("edi","eax");          # max value for standard query level
@@ -51,7 +51,14 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
         # AMD specific
         &mov    ("eax",0x80000000);
         &cpuid  ();
-       &cmp    ("eax",0x80000008);
+       &cmp    ("eax",0x80000001);
+       &jb     (&label("intel"));
+       &mov    ("esi","eax");
+       &mov    ("eax",0x80000001);
+       &cpuid  ();
+       &or     ("ebp","ecx");
+       &and    ("ebp",1<<11|1);        # isolate XOP bit
+       &cmp    ("esi",0x80000008);
         &jb     (&label("intel"));
  
         &mov    ("eax",0x80000008);
@@ -62,13 +69,13 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
         &mov    ("eax",1);
         &cpuid  ();
         &bt     ("edx",28);
-       &jnc    (&label("done"));
+       &jnc    (&label("generic"));
         &shr    ("ebx",16);
         &and    ("ebx",0xff);
         &cmp    ("ebx","esi");
-       &ja     (&label("done"));
+       &ja     (&label("generic"));
         &and    ("edx",0xefffffff);     # clear hyper-threading bit
-       &jmp    (&label("done"));
+       &jmp    (&label("generic"));
         
  &set_label("intel");
         &cmp    ("edi",4);
@@ -93,19 +100,42 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
         &or     ("edx",1<<20);          # use reserved bit to engage RC4_CHAR
  &set_label("notP4");
         &bt     ("edx",28);             # test hyper-threading bit
-       &jnc    (&label("done"));
+       &jnc    (&label("generic"));
         &and    ("edx",0xefffffff);
         &cmp    ("edi",0);
-       &je     (&label("done"));
+       &je     (&label("generic"));
  
         &or     ("edx",0x10000000);
         &shr    ("ebx",16);
         &cmp    (&LB("ebx"),1);
-       &ja     (&label("done"));
+       &ja     (&label("generic"));
         &and    ("edx",0xefffffff);     # clear hyper-threading bit if not
+
+&set_label("generic");
+       &and    ("ebp",1<<11);          # isolate AMD XOP flag
+       &and    ("ecx",~(1<<11));
+       &mov    ("esi","edx");
+       &or     ("ebp","ecx");          # merge AMD XOP flag
+
+       &bt     ("ecx",26);             # check XSAVE bit
+       &jnc    (&label("done"));
+       &bt     ("ecx",27);             # check OSXSAVE bit
+       &jnc    (&label("clear_xmm"));
+       &xor    ("ecx","ecx");
+       &data_byte(0x0f,0x01,0xd0);     # xgetbv
+       &and    ("eax",6);
+       &cmp    ("eax",6);
+       &je     (&label("done"));
+       &cmp    ("eax",2);
+       &je     (&label("clear_avx"));
+&set_label("clear_xmm");
+       &and    ("ebp",~(1<<25|1<<1));  # clear AESNI and PCLMULQDQ bits
+       &and    ("esi",~(1<<24));       # clear FXSR
+&set_label("clear_avx");
+       &and    ("ebp",~(1<<28|1<<12|1<<11));# clear AVX, FMA and AMD XOP bits
  &set_label("done");
-       &mov    ("eax","edx");
-       &mov    ("edx","ecx");
+       &mov    ("eax","esi");
+       &mov    ("edx","ebp");
  &function_end("OPENSSL_ia32_cpuid");
  
  &external_label("OPENSSL_ia32cap_P");
@@ -199,8 +229,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
         &bt     (&DWP(0,"ecx"),1);
         &jnc    (&label("no_x87"));
         if ($sse2) {
-               &bt     (&DWP(0,"ecx"),26);
-               &jnc    (&label("no_sse2"));
+               &and    ("ecx",1<<26|1<<24);    # check SSE2 and FXSR bits
+               &cmp    ("ecx",1<<26|1<<24);
+               &jne    (&label("no_sse2"));
                 &pxor   ("xmm0","xmm0");
                 &pxor   ("xmm1","xmm1");
                 &pxor   ("xmm2","xmm2");
@@ -331,7 +362,7 @@ my $max = "ebp";
         &mov    ($lasttick,"eax");      # lasttick = tick
         &mov    ($lastdiff,0);          # lastdiff = 0
         &clflush(&DWP(0,$out));
-       &lock   ();
+       &data_byte(0xf0);               # lock
         &add    (&DWP(0,$out),$lastdiff);
         &jmp    (&label("loop"));
  
@@ -342,7 +373,7 @@ my $max = "ebp";
         &mov    ($lasttick,"edx");      # lasttick = tick
         &mov    ($lastdiff,"eax");      # lastdiff = diff
         &clflush(&DWP(0,$out));
-       &lock   ();
+       &data_byte(0xf0);               # lock
         &add    (&DWP(0,$out),"eax");   # accumulate diff
         &lea    ($out,&DWP(4,$out));    # ++$out
         &sub    ($cnt,1);               # --$cnt
@@ -371,7 +402,7 @@ my $max = "ebp";
         &mov    ($lastdiff,0);          # lastdiff = 0
  
         &clflush(&DWP(0,$out));
-       &lock   ();
+       &data_byte(0xf0);               # lock
         &add    (&DWP(0,$out),$lastdiff);
  
         &rdtsc  ();                     # collect 1st diff
@@ -383,7 +414,7 @@ my $max = "ebp";
  
  &set_label("loop2",16);
         &clflush(&DWP(0,$out));
-       &lock   ();
+       &data_byte(0xf0);               # lock
         &add    (&DWP(0,$out),"eax");   # accumulate diff
  
         &sub    ($max,1);
diff --git a/doc/crypto/OPENSSL_ia32cap.pod b/doc/crypto/OPENSSL_ia32cap.pod

index dca2e20aced6af9b526b7eb1eefb7abd9af84251..af6b4f3a4d1d87fa4a16ef76c84c4cd8abb0b8e9 100644 (file)
--- a/doc/crypto/OPENSSL_ia32cap.pod
+++ b/doc/crypto/OPENSSL_ia32cap.pod
@@ -2,7 +2,7 @@
  
  =head1 NAME
  
-OPENSSL_ia32cap - finding the IA-32 processor capabilities
+OPENSSL_ia32cap - the IA-32 processor capabilities vector
  
  =head1 SYNOPSIS
  
@@ -18,30 +18,52 @@ input value (see Intel Application Note #241618). Naturally it's
  meaningful on x86 and x86_64 platforms only. The variable is normally
  set up automatically upon toolkit initialization, but can be
  manipulated afterwards to modify crypto library behaviour. For the
-moment of this writing seven bits are significant, namely:
-
-1. bit #4 denoting presence of Time-Stamp Counter.
-2. bit #20, reserved by Intel, is used to choose among RC4 code
-   paths;
-3. bit #23 denoting MMX support;
-4. bit #25 denoting SSE support;
-5. bit #26 denoting SSE2 support;
-6. bit #28 denoting Hyperthreading, which is used to distiguish
-   cores with shared cache;
-7. bit #30, reserved by Intel, is used to choose among RC4 code
-   paths;
-8. bit #57 denoting Intel AES instruction set extension;
+moment of this writing the following bits are significant:
+
+=item bit #4 denoting presence of Time-Stamp Counter;
+
+=item bit #19 denoting availability of CLFLUSH instruction;
+
+=item bit #20, reserved by Intel, is used to choose among RC4 code paths;
+
+=item bit #23 denoting MMX support;
+
+=item bit #24, FXSR bit, denoting availability of XMM registers;
+
+=item bit #25 denoting SSE support;
+
+=item bit #26 denoting SSE2 support;
+
+=item bit #28 denoting Hyperthreading, which is used to distinguish
+      cores with shared cache;
+
+=item bit #30, reserved by Intel, is used to choose among RC4 code
+      paths;
+
+=item bit #33 denoting availability of PCLMULQDQ instruction;
+
+=item bit #41 denoting SSSE3, Supplemental SSE3, support;
+
+=item bit #43 denoting AMD XOP support (forced to zero on Intel);
+
+=item bit #57 denoting AES-NI instruction set extension;
+
+=item bit #59, OSXSAVE bit, denoting availability of YMM registers;
+
+=item bit #60 denoting AVX extension;
  
  For example, clearing bit #26 at run-time disables high-performance
-SSE2 code present in the crypto library. You might have to do this if
-target OpenSSL application is executed on SSE2 capable CPU, but under
-control of OS which does not support SSE2 extentions. Even though you
-can manipulate the value programmatically, you most likely will find it
-more appropriate to set up an environment variable with the same name
-prior starting target application, e.g. on Intel P4 processor 'env
-OPENSSL_ia32cap=0x12900010 apps/openssl', to achieve same effect
-without modifying the application source code. Alternatively you can
-reconfigure the toolkit with no-sse2 option and recompile.
+SSE2 code present in the crypto library, while clearing bit #24
+disables SSE2 code operating on the 128-bit XMM register bank. You might
+have to do the latter if the target OpenSSL application is executed on
+an SSE2 capable CPU, but under control of an OS that does not enable XMM
+registers. Even though you can manipulate the value programmatically,
+you most likely will find it more appropriate to set up an environment
+variable with the same name prior to starting the target application,
+e.g. on Intel P4 processor 'env OPENSSL_ia32cap=0x16980010 apps/openssl',
+to achieve the same effect without modifying the application source code.
+Alternatively you can reconfigure the toolkit with the no-sse2 option and
+recompile.
  
  Less intuituve is clearing bit #28. The truth is that it's not copied
  from CPUID output verbatim, but is adjusted to reflect whether or not
@@ -49,4 +71,3 @@ the data cache is actually shared between logical cores. This in turn
  affects the decision on whether or not expensive countermeasures
  against cache-timing attacks are applied, most notably in AES assembler
  module.
-=cut
author	Andy Polyakov <appro@openssl.org>
	Mon, 16 May 2011 20:35:11 +0000 (20:35 +0000)
committer	Andy Polyakov <appro@openssl.org>
	Mon, 16 May 2011 20:35:11 +0000 (20:35 +0000)
crypto/x86_64cpuid.pl		patch \| blob \| history
crypto/x86cpuid.pl		patch \| blob \| history
doc/crypto/OPENSSL_ia32cap.pod		patch \| blob \| history