.type OPENSSL_ia32_cpuid,\@abi-omnipotent
.align 16
OPENSSL_ia32_cpuid:
- mov %rbx,%r8
+ mov %rbx,%r8 # save %rbx
xor %eax,%eax
cpuid
# AMD specific
mov \$0x80000000,%eax
cpuid
- cmp \$0x80000008,%eax
+ cmp \$0x80000001,%eax
+ jb .Lintel
+ mov %eax,%r10d
+ mov \$0x80000001,%eax
+ cpuid
+ or %ecx,%r9d
+ and \$0x00000801,%r9d # isolate AMD XOP bit, 1<<11
+
+ cmp \$0x80000008,%r10d
jb .Lintel
mov \$0x80000008,%eax
mov \$1,%eax
cpuid
bt \$28,%edx # test hyper-threading bit
- jnc .Ldone
+ jnc .Lgeneric
shr \$16,%ebx # number of logical processors
cmp %r10b,%bl
- ja .Ldone
+ ja .Lgeneric
and \$0xefffffff,%edx # ~(1<<28)
- jmp .Ldone
+ jmp .Lgeneric
.Lintel:
cmp \$4,%r11d
or \$0x40000000,%edx # use reserved bit to skip unrolled loop
.Lnotintel:
bt \$28,%edx # test hyper-threading bit
- jnc .Ldone
+ jnc .Lgeneric
and \$0xefffffff,%edx # ~(1<<28)
cmp \$0,%r10d
- je .Ldone
+ je .Lgeneric
or \$0x10000000,%edx # 1<<28
shr \$16,%ebx
cmp \$1,%bl # see if cache is shared
- ja .Ldone
+ ja .Lgeneric
and \$0xefffffff,%edx # ~(1<<28)
-.Ldone:
+.Lgeneric:
+ and \$0x00000800,%r9d # isolate AMD XOP flag
+ and \$0xfffff7ff,%ecx
+ or %r9d,%ecx # merge AMD XOP flag
+
shl \$32,%rcx
- mov %edx,%eax
- mov %r8,%rbx
- or %rcx,%rax
+ mov %edx,%ebx
+ or %rcx,%rbx # compose capability vector in %rbx
+ bt \$27+32,%rcx # check OSXSAVE bit
+ jnc .Lclear_avx
+ xor %ecx,%ecx # XCR0
+ .byte 0x0f,0x01,0xd0 # xgetbv
+ and \$6,%eax # isolate XMM and YMM state support
+ cmp \$6,%eax
+ je .Ldone
+.Lclear_avx:
+ mov \$0xefffe7ff,%eax # ~(1<<28|1<<12|1<<11)
+ shl \$32,%rax
+ and %rax,%rbx # clear AVX, FMA and AMD XOP bits
+.Ldone:
+ mov %rbx,%rax
+ mov %r8,%rbx # restore %rbx
ret
.size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
mov %eax,$lasttick # lasttick = tick
mov \$0,$lastdiff # lastdiff = 0
clflush ($out)
- lock
+ .byte 0xf0 # lock
add $lastdiff,($out)
jmp .Loop
.align 16
mov %edx,$lasttick
mov %eax,$lastdiff
clflush ($out)
- lock
+ .byte 0xf0 # lock
add %eax,($out)
lea 4($out),$out
sub \$1,$cnt
mov \$0,$lastdiff # lastdiff = 0
clflush ($out)
- lock
+ .byte 0xf0 # lock
add $lastdiff,($out)
rdtsc # collect 1st diff
mov %eax,$lastdiff # lastdiff = diff
.Loop2:
clflush ($out)
- lock
+ .byte 0xf0 # lock
add %eax,($out) # accumulate diff
sub \$1,$max
&pop ("eax");
&xor ("ecx","eax");
&bt ("ecx",21);
- &jnc (&label("done"));
+ &jnc (&label("generic"));
&xor ("eax","eax");
&cpuid ();
&mov ("edi","eax"); # max value for standard query level
# AMD specific
&mov ("eax",0x80000000);
&cpuid ();
- &cmp ("eax",0x80000008);
+ &cmp ("eax",0x80000001);
+ &jb (&label("intel"));
+ &mov ("esi","eax");
+ &mov ("eax",0x80000001);
+ &cpuid ();
+ &or ("ebp","ecx");
+ &and ("ebp",1<<11|1); # isolate XOP bit
+ &cmp ("esi",0x80000008);
&jb (&label("intel"));
&mov ("eax",0x80000008);
&mov ("eax",1);
&cpuid ();
&bt ("edx",28);
- &jnc (&label("done"));
+ &jnc (&label("generic"));
&shr ("ebx",16);
&and ("ebx",0xff);
&cmp ("ebx","esi");
- &ja (&label("done"));
+ &ja (&label("generic"));
&and ("edx",0xefffffff); # clear hyper-threading bit
- &jmp (&label("done"));
+ &jmp (&label("generic"));
&set_label("intel");
&cmp ("edi",4);
&or ("edx",1<<20); # use reserved bit to engage RC4_CHAR
&set_label("notP4");
&bt ("edx",28); # test hyper-threading bit
- &jnc (&label("done"));
+ &jnc (&label("generic"));
&and ("edx",0xefffffff);
&cmp ("edi",0);
- &je (&label("done"));
+ &je (&label("generic"));
&or ("edx",0x10000000);
&shr ("ebx",16);
&cmp (&LB("ebx"),1);
- &ja (&label("done"));
+ &ja (&label("generic"));
&and ("edx",0xefffffff); # clear hyper-threading bit if not
+
+&set_label("generic");
+ &and ("ebp",1<<11); # isolate AMD XOP flag
+ &and ("ecx",~(1<<11));
+ &mov ("esi","edx");
+ &or ("ebp","ecx"); # merge AMD XOP flag
+
+ &bt ("ecx",26); # check XSAVE bit
+ &jnc (&label("done"));
+ &bt ("ecx",27); # check OSXSAVE bit
+ &jnc (&label("clear_xmm"));
+ &xor ("ecx","ecx");
+ &data_byte(0x0f,0x01,0xd0); # xgetbv
+ &and ("eax",6);
+ &cmp ("eax",6);
+ &je (&label("done"));
+ &cmp ("eax",2);
+ &je (&label("clear_avx"));
+&set_label("clear_xmm");
+ &and ("ebp",~(1<<25|1<<1)); # clear AESNI and PCLMULQDQ bits
+ &and ("esi",~(1<<24)); # clear FXSR
+&set_label("clear_avx");
+ &and ("ebp",~(1<<28|1<<12|1<<11));# clear AVX, FMA and AMD XOP bits
&set_label("done");
- &mov ("eax","edx");
- &mov ("edx","ecx");
+ &mov ("eax","esi");
+ &mov ("edx","ebp");
&function_end("OPENSSL_ia32_cpuid");
&external_label("OPENSSL_ia32cap_P");
&bt (&DWP(0,"ecx"),1);
&jnc (&label("no_x87"));
if ($sse2) {
- &bt (&DWP(0,"ecx"),26);
- &jnc (&label("no_sse2"));
+ &and ("ecx",1<<26|1<<24); # check SSE2 and FXSR bits
+ &cmp ("ecx",1<<26|1<<24);
+ &jne (&label("no_sse2"));
&pxor ("xmm0","xmm0");
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&mov ($lasttick,"eax"); # lasttick = tick
&mov ($lastdiff,0); # lastdiff = 0
&clflush(&DWP(0,$out));
- &lock ();
+ &data_byte(0xf0); # lock
&add (&DWP(0,$out),$lastdiff);
&jmp (&label("loop"));
&mov ($lasttick,"edx"); # lasttick = tick
&mov ($lastdiff,"eax"); # lastdiff = diff
&clflush(&DWP(0,$out));
- &lock ();
+ &data_byte(0xf0); # lock
&add (&DWP(0,$out),"eax"); # accumulate diff
&lea ($out,&DWP(4,$out)); # ++$out
&sub ($cnt,1); # --$cnt
&mov ($lastdiff,0); # lastdiff = 0
&clflush(&DWP(0,$out));
- &lock ();
+ &data_byte(0xf0); # lock
&add (&DWP(0,$out),$lastdiff);
&rdtsc (); # collect 1st diff
&set_label("loop2",16);
&clflush(&DWP(0,$out));
- &lock ();
+ &data_byte(0xf0); # lock
&add (&DWP(0,$out),"eax"); # accumulate diff
&sub ($max,1);
=head1 NAME
-OPENSSL_ia32cap - finding the IA-32 processor capabilities
+OPENSSL_ia32cap - the IA-32 processor capabilities vector
=head1 SYNOPSIS
meaningful on x86 and x86_64 platforms only. The variable is normally
set up automatically upon toolkit initialization, but can be
manipulated afterwards to modify crypto library behaviour. For the
-moment of this writing seven bits are significant, namely:
-
-1. bit #4 denoting presence of Time-Stamp Counter.
-2. bit #20, reserved by Intel, is used to choose among RC4 code
- paths;
-3. bit #23 denoting MMX support;
-4. bit #25 denoting SSE support;
-5. bit #26 denoting SSE2 support;
-6. bit #28 denoting Hyperthreading, which is used to distiguish
- cores with shared cache;
-7. bit #30, reserved by Intel, is used to choose among RC4 code
- paths;
-8. bit #57 denoting Intel AES instruction set extension;
+moment of this writing following bits are significant:
+
+=item bit #4 denoting presence of Time-Stamp Counter.
+
+=item bit #19 denoting availability of CLFLUSH instruction;
+
+=item bit #20, reserved by Intel, is used to choose among RC4 code paths;
+
+=item bit #23 denoting MMX support;
+
+=item bit #24, FXSR bit, denoting availability of XMM registers;
+
+=item bit #25 denoting SSE support;
+
+=item bit #26 denoting SSE2 support;
+
+=item bit #28 denoting Hyperthreading, which is used to distiguish
+ cores with shared cache;
+
+=item bit #30, reserved by Intel, is used to choose among RC4 code
+ paths;
+
+=item bit #33 denoting availability of PCLMULQDQ instruction;
+
+=item bit #41 denoting SSSE3, Supplemental SSE3, support;
+
+=item bit #43 denoting AMD XOP support (forced to zero on Intel);
+
+=item bit #57 denoting AES-NI instruction set extension;
+
+=item bit #59, OSXSAVE bit, denoting availability of YMM registers;
+
+=item bit #60 denoting AVX extension;
For example, clearing bit #26 at run-time disables high-performance
-SSE2 code present in the crypto library. You might have to do this if
-target OpenSSL application is executed on SSE2 capable CPU, but under
-control of OS which does not support SSE2 extentions. Even though you
-can manipulate the value programmatically, you most likely will find it
-more appropriate to set up an environment variable with the same name
-prior starting target application, e.g. on Intel P4 processor 'env
-OPENSSL_ia32cap=0x12900010 apps/openssl', to achieve same effect
-without modifying the application source code. Alternatively you can
-reconfigure the toolkit with no-sse2 option and recompile.
+SSE2 code present in the crypto library, while clearing bit #24
+disables SSE2 code operating on 128-bit XMM register bank. You might
+have to do the latter if target OpenSSL application is executed on SSE2
+capable CPU, but under control of OS that does not enable XMM
+registers. Even though you can manipulate the value programmatically,
+you most likely will find it more appropriate to set up an environment
+variable with the same name prior starting target application, e.g. on
+Intel P4 processor 'env OPENSSL_ia32cap=0x16980010 apps/openssl', to
+achieve same effect without modifying the application source code.
+Alternatively you can reconfigure the toolkit with no-sse2 option and
+recompile.
Less intuituve is clearing bit #28. The truth is that it's not copied
from CPUID output verbatim, but is adjusted to reflect whether or not
affects the decision on whether or not expensive countermeasures
against cache-timing attacks are applied, most notably in AES assembler
module.
-=cut