From: Andy Polyakov Date: Sat, 17 Nov 2012 19:04:15 +0000 (+0000) Subject: Extend OPENSSL_ia32cap_P with extra word to accomodate AVX2 capability. X-Git-Tag: master-pre-reformat~1559 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=c5cd28bd64fa2b02f29e74486539e4b2f6741114;p=openssl Extend OPENSSL_ia32cap_P with extra word to accomodate AVX2 capability. --- diff --git a/crypto/cryptlib.c b/crypto/cryptlib.c index c85fe5aa3d..6defb7cc69 100644 --- a/crypto/cryptlib.c +++ b/crypto/cryptlib.c @@ -125,7 +125,7 @@ static double SSLeay_MSVC5_hack=0.0; /* and for VC1.5 */ defined(__INTEL__) || \ defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64) -extern unsigned int OPENSSL_ia32cap_P[2]; +extern unsigned int OPENSSL_ia32cap_P[4]; unsigned int *OPENSSL_ia32cap_loc(void) { return OPENSSL_ia32cap_P; } #if defined(OPENSSL_CPUID_OBJ) && !defined(OPENSSL_NO_ASM) && !defined(I386_ONLY) @@ -137,7 +137,7 @@ typedef unsigned long long IA32CAP; #endif void OPENSSL_cpuid_setup(void) { static int trigger=0; - IA32CAP OPENSSL_ia32_cpuid(void); + IA32CAP OPENSSL_ia32_cpuid(unsigned int *); IA32CAP vec; char *env; @@ -151,10 +151,18 @@ void OPENSSL_cpuid_setup(void) #else if (!sscanf(env+off,"%lli",(long long *)&vec)) vec = strtoul(env+off,NULL,0); #endif - if (off) vec = OPENSSL_ia32_cpuid()&~vec; + if (off) vec = OPENSSL_ia32_cpuid(OPENSSL_ia32cap_P)&~vec; + + OPENSSL_ia32cap_P[2] = 0; + if ((env=strchr(env,':'))) { + off = (env[1]=='~')?2:1; + vec = strtoul(env+off,NULL,0); + if (off>1) OPENSSL_ia32cap_P[2] &= ~vec; + else OPENSSL_ia32cap_P[2] = vec; + } } else - vec = OPENSSL_ia32_cpuid(); + vec = OPENSSL_ia32_cpuid(OPENSSL_ia32cap_P); /* * |(1<<10) sets a reserved bit to signal that variable @@ -165,7 +173,7 @@ void OPENSSL_cpuid_setup(void) OPENSSL_ia32cap_P[1] = (unsigned int)(vec>>32); } #else -unsigned int OPENSSL_ia32cap_P[2]; +unsigned int OPENSSL_ia32cap_P[4]; #endif #else @@ -173,7 +181,7 @@ unsigned int 
*OPENSSL_ia32cap_loc(void) { return NULL; } #endif int OPENSSL_NONPIC_relocated = 0; #if !defined(OPENSSL_CPUID_SETUP) && !defined(OPENSSL_CPUID_OBJ) -void OPENSSL_cpuid_setup(void) {} +void OPENSSL_cpuid_setup(unsigned int *) {} #endif #if (defined(_WIN32) || defined(__CYGWIN__)) && defined(_WINDLL) diff --git a/crypto/perlasm/x86asm.pl b/crypto/perlasm/x86asm.pl index 3f190ae590..17abf92297 100644 --- a/crypto/perlasm/x86asm.pl +++ b/crypto/perlasm/x86asm.pl @@ -131,6 +131,32 @@ sub ::rdrand { &::generic("rdrand",@_); } } +sub rxb { + local *opcode=shift; + my ($dst,$src1,$src2,$rxb)=@_; + + $rxb|=0x7<<5; + $rxb&=~(0x04<<5) if($dst>=8); + $rxb&=~(0x01<<5) if($src1>=8); + $rxb&=~(0x02<<5) if($src2>=8); + push @opcode,$rxb; +} + +sub ::vprotd +{ my $args=join(',',@_); + if ($args =~ /xmm([0-7]),xmm([0-7]),([x0-9a-f]+)/) + { my @opcode=(0x8f); + rxb(\@opcode,$1,$2,-1,0x08); + push @opcode,0x78,0xc2; + push @opcode,0xc0|($2&7)|(($1&7)<<3); # ModR/M + my $c=$3; + push @opcode,$c=~/^0/?oct($c):$c; + &::data_byte(@opcode); + } + else + { &::generic("vprotd",@_); } +} + # label management $lbdecor="L"; # local label decoration, set by package $label="000"; diff --git a/crypto/perlasm/x86gas.pl b/crypto/perlasm/x86gas.pl index e02ee84258..5c2498118f 100644 --- a/crypto/perlasm/x86gas.pl +++ b/crypto/perlasm/x86gas.pl @@ -70,6 +70,8 @@ sub ::DWP { my($addr,$reg1,$reg2,$idx)=@_; my $ret=""; + if (!defined($idx) && 1*$reg2) { $idx=$reg2; $reg2=$reg1; undef $reg1; } + $addr =~ s/^\s+//; # prepend global references with optional underscore $addr =~ s/^([^\+\-0-9][^\+\-]*)/&::islabel($1) or "$nmdecor$1"/ige; @@ -157,7 +159,7 @@ sub ::file_end } } if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out) { - my $tmp=".comm\t${nmdecor}OPENSSL_ia32cap_P,8"; + my $tmp=".comm\t${nmdecor}OPENSSL_ia32cap_P,16"; if ($::macosx) { push (@out,"$tmp,2\n"); } elsif ($::elf) { push (@out,"$tmp,4\n"); } else { push (@out,"$tmp\n"); } diff --git a/crypto/perlasm/x86masm.pl 
b/crypto/perlasm/x86masm.pl index f937d07c87..1741342c3a 100644 --- a/crypto/perlasm/x86masm.pl +++ b/crypto/perlasm/x86masm.pl @@ -39,6 +39,8 @@ sub get_mem { my($size,$addr,$reg1,$reg2,$idx)=@_; my($post,$ret); + if (!defined($idx) && 1*$reg2) { $idx=$reg2; $reg2=$reg1; undef $reg1; } + $ret .= "$size PTR " if ($size ne ""); $addr =~ s/^\s+//; @@ -133,7 +135,7 @@ ___ if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out) { my $comm=<<___; .bss SEGMENT 'BSS' -COMM ${nmdecor}OPENSSL_ia32cap_P:QWORD +COMM ${nmdecor}OPENSSL_ia32cap_P:DWORD:4 .bss ENDS ___ # comment out OPENSSL_ia32cap_P declarations diff --git a/crypto/perlasm/x86nasm.pl b/crypto/perlasm/x86nasm.pl index ca2511c9eb..5d92f6092a 100644 --- a/crypto/perlasm/x86nasm.pl +++ b/crypto/perlasm/x86nasm.pl @@ -36,6 +36,8 @@ sub get_mem { my($size,$addr,$reg1,$reg2,$idx)=@_; my($post,$ret); + if (!defined($idx) && 1*$reg2) { $idx=$reg2; $reg2=$reg1; undef $reg1; } + if ($size ne "") { $ret .= "$size"; $ret .= " PTR" if ($::mwerks); @@ -117,7 +119,7 @@ sub ::file_end { if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out) { my $comm=<<___; ${drdecor}segment .bss -${drdecor}common ${nmdecor}OPENSSL_ia32cap_P 8 +${drdecor}common ${nmdecor}OPENSSL_ia32cap_P 16 ___ # comment out OPENSSL_ia32cap_P declarations grep {s/(^extern\s+${nmdecor}OPENSSL_ia32cap_P)/\;$1/} @out; diff --git a/crypto/x86_64cpuid.pl b/crypto/x86_64cpuid.pl index 58c7bab1b6..3a1adeeccc 100644 --- a/crypto/x86_64cpuid.pl +++ b/crypto/x86_64cpuid.pl @@ -23,7 +23,7 @@ print<<___; call OPENSSL_cpuid_setup .hidden OPENSSL_ia32cap_P -.comm OPENSSL_ia32cap_P,8,4 +.comm OPENSSL_ia32cap_P,16,4 .text @@ -52,12 +52,13 @@ OPENSSL_rdtsc: .size OPENSSL_rdtsc,.-OPENSSL_rdtsc .globl OPENSSL_ia32_cpuid -.type OPENSSL_ia32_cpuid,\@abi-omnipotent +.type OPENSSL_ia32_cpuid,\@function,1 .align 16 OPENSSL_ia32_cpuid: mov %rbx,%r8 # save %rbx xor %eax,%eax + mov %eax,8(%rdi) # clear 3rd word cpuid mov %eax,%r11d # max value for standard query level @@ -125,6 +126,14 @@ 
OPENSSL_ia32_cpuid: shr \$14,%r10d and \$0xfff,%r10d # number of cores -1 per L1D + cmp \$7,%r11d + jb .Lnocacheinfo + + mov \$7,%eax + xor %ecx,%ecx + cpuid + mov %ebx,8(%rdi) + .Lnocacheinfo: mov \$1,%eax cpuid @@ -164,6 +173,7 @@ OPENSSL_ia32_cpuid: .Lclear_avx: mov \$0xefffe7ff,%eax # ~(1<<28|1<<12|1<<11) and %eax,%r9d # clear AVX, FMA and AMD XOP bits + andl \$0xffffffdf,8(%rdi) # cleax AVX2, ~(1<<5) .Ldone: shl \$32,%r9 mov %r10d,%eax diff --git a/crypto/x86cpuid.pl b/crypto/x86cpuid.pl index 597b7a51a7..3b6c469d08 100644 --- a/crypto/x86cpuid.pl +++ b/crypto/x86cpuid.pl @@ -22,6 +22,8 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } &xor ("eax","eax"); &bt ("ecx",21); &jnc (&label("nocpuid")); + &mov ("esi",&wparam(0)); + &mov (&DWP(8,"esi"),"eax"); # clear 3rd word &cpuid (); &mov ("edi","eax"); # max value for standard query level @@ -89,6 +91,15 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } &shr ("edi",14); &and ("edi",0xfff); # number of cores -1 per L1D + &cmp ("edi",7); + &jb (&label("nocacheinfo")); + + &mov ("esi",&wparam(0)); + &mov ("eax",7); + &xor ("ecx","ecx"); + &cpuid (); + &mov (&DWP(8,"esi"),"ebx"); + &set_label("nocacheinfo"); &mov ("eax",1); &cpuid (); @@ -133,6 +144,8 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } &and ("esi",0xfeffffff); # clear FXSR &set_label("clear_avx"); &and ("ebp",0xefffe7ff); # clear AVX, FMA and AMD XOP bits + &mov ("edi",&wparam(0)); + &and (&DWP(8,"edi"),0xffffffdf); # clear AVX2 &set_label("done"); &mov ("eax","esi"); &mov ("edx","ebp"); diff --git a/doc/crypto/OPENSSL_ia32cap.pod b/doc/crypto/OPENSSL_ia32cap.pod index 16f500fc76..4f0a94648e 100644 --- a/doc/crypto/OPENSSL_ia32cap.pod +++ b/doc/crypto/OPENSSL_ia32cap.pod @@ -72,3 +72,17 @@ the data cache is actually shared between logical cores. This in turn affects the decision on whether or not expensive countermeasures against cache-timing attacks are applied, most notably in AES assembler module. 
+ +The vector is further extended with EBX value returned by CPUID with +EAX=7 and ECX=0 as input. Following bits are significant: + +=item bit #64+3 denoting availability of BMI1 instructions, e.g. ANDN; + +=item bit #64+5 denoting availability of AVX2 instructions; + +=item bit #64+8 denoting availability of BMI2 instructions, e.g. MULX + and RORX; + +=item bit #64+18 denoting availability of RDSEED instruction; + +=item bit #64+19 denoting availability of ADCX and ADOX instructions;