Merge r227983 (along with r227815 and r227972 to make it apply cleanly)
author    Hans Wennborg <hans@hanshq.net>
          Thu, 5 Feb 2015 18:17:20 +0000 (18:17 +0000)
committer Hans Wennborg <hans@hanshq.net>
          Thu, 5 Feb 2015 18:17:20 +0000 (18:17 +0000)
------------------------------------------------------------------------
r227815 | spatel | 2015-02-02 09:47:30 -0800 (Mon, 02 Feb 2015) | 2 lines

fix typo

------------------------------------------------------------------------

------------------------------------------------------------------------
r227972 | spatel | 2015-02-03 07:37:18 -0800 (Tue, 03 Feb 2015) | 10 lines

Improve test to actually check for a folded load.

This test was checking for the absence of a "movaps" (an aligned load)
rather than for a "movups" (an unaligned load). It also included
a store, which complicated the checking.

Add specific CPU runs to prevent subtarget feature flag overrides
from inhibiting this optimization.
------------------------------------------------------------------------
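
For example, one of the CPU-specific runs added to the test (see the diff
below) pins the subtarget explicitly so no feature-flag override can apply:

  llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s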

------------------------------------------------------------------------
r227983 | spatel | 2015-02-03 09:13:04 -0800 (Tue, 03 Feb 2015) | 22 lines

Fix program crashes due to alignment exceptions generated for SSE memop instructions (PR22371).

r224330 introduced a bug by misinterpreting the "FeatureVectorUAMem" bit.
The commit log says that change did not affect anything, but that's not correct.
That change allowed SSE instructions to have unaligned memory operands folded
into math ops, which the default specification forbids for every SSE variant.
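
To illustrate (mirroring the updated fold-vex.ll test in the diff below), an
under-aligned vector load feeding a logic op:

  %in0 = load <4 x i32>* %p0, align 2
  %a = and <4 x i32> %in0, %in1

must be lowered on plain SSE to a 'movups' followed by a register-register
'andps'; folding the load into 'andps (%rdi), %xmm0' assumes 16-byte
alignment and faults at run time when the pointer is under-aligned.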

The bug is exposed when compiling for an AVX-capable CPU that has this feature
flag but with AVX codegen disabled. Another mistake in r224330 was not adding
the feature flag to all AVX CPUs; the AMD chips were excluded.
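
(In the new fold-vex.ll runs below, this scenario corresponds to
'-mcpu=corei7-avx -mattr=-avx'; the btver2 runs cover the previously
excluded AMD CPUs.)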

This is part of the fix for PR22371 ( http://llvm.org/bugs/show_bug.cgi?id=22371 ).

This feature bit is SSE-specific, so I've renamed it to "FeatureSSEUnalignedMem".
Changed the existing test case for the feature bit to reflect the new name,
renamed the test file itself to better reflect the feature, and added runs to
fold-vex.ll to check for the failing codegen.

Note that the feature bit is not set by default on any CPU because it may require a
configuration register setting to enable the enhanced unaligned behavior.
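
For example, the renamed feature test below enables the behavior explicitly:

  llc -mcpu=yonah -mattr=sse-unaligned-mem -march=x86 < %s | FileCheck %s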
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_36@228323 91177308-0d34-0410-b5e6-96231b3b80d8

lib/Target/X86/X86.td
lib/Target/X86/X86InstrFragmentsSIMD.td
lib/Target/X86/X86Subtarget.cpp
lib/Target/X86/X86Subtarget.h
test/CodeGen/X86/fold-vex.ll
test/CodeGen/X86/sse-unaligned-mem-feature.ll [moved from test/CodeGen/X86/2010-01-07-UAMemFeature.ll with 74% similarity]

diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index ab3319afe93f0a78fdb7686eafad1818fc2d264f..30b3b2876b8a9f86e34119dc4e4267a810076363 100644
@@ -132,9 +132,9 @@ def FeatureFMA4    : SubtargetFeature<"fma4", "HasFMA4", "true",
 def FeatureXOP     : SubtargetFeature<"xop", "HasXOP", "true",
                                       "Enable XOP instructions",
                                       [FeatureFMA4]>;
-def FeatureVectorUAMem : SubtargetFeature<"vector-unaligned-mem",
-                                          "HasVectorUAMem", "true",
-                 "Allow unaligned memory operands on vector/SIMD instructions">;
+def FeatureSSEUnalignedMem : SubtargetFeature<"sse-unaligned-mem",
+                                          "HasSSEUnalignedMem", "true",
+                      "Allow unaligned memory operands with SSE instructions">;
 def FeatureAES     : SubtargetFeature<"aes", "HasAES", "true",
                                       "Enable AES instructions",
                                       [FeatureSSE2]>;
@@ -309,7 +309,6 @@ class SandyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
                                        FeatureCMPXCHG16B,
                                        FeatureFastUAMem,
                                        FeatureSlowUAMem32,
-                                       FeatureVectorUAMem,
                                        FeaturePOPCNT,
                                        FeatureAES,
                                        FeaturePCLMUL
@@ -322,7 +321,6 @@ class IvyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
                                      FeatureCMPXCHG16B,
                                      FeatureFastUAMem,
                                      FeatureSlowUAMem32,
-                                     FeatureVectorUAMem,
                                      FeaturePOPCNT,
                                      FeatureAES,
                                      FeaturePCLMUL,
@@ -337,7 +335,6 @@ class HaswellProc<string Name> : ProcessorModel<Name, HaswellModel, [
                                    FeatureAVX2,
                                    FeatureCMPXCHG16B,
                                    FeatureFastUAMem,
-                                   FeatureVectorUAMem,
                                    FeaturePOPCNT,
                                    FeatureAES,
                                    FeaturePCLMUL,
@@ -360,7 +357,6 @@ class BroadwellProc<string Name> : ProcessorModel<Name, HaswellModel, [
                                      FeatureAVX2,
                                      FeatureCMPXCHG16B,
                                      FeatureFastUAMem,
-                                     FeatureVectorUAMem,
                                      FeaturePOPCNT,
                                      FeatureAES,
                                      FeaturePCLMUL,
@@ -388,7 +384,7 @@ class KnightsLandingProc<string Name> : ProcessorModel<Name, HaswellModel,
                       FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C,
                       FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI,
                       FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE,
-                      FeatureSlowIncDec, FeatureVectorUAMem]>;
+                      FeatureSlowIncDec]>;
 def : KnightsLandingProc<"knl">;
 
 // FIXME: define SKX model
@@ -399,7 +395,7 @@ class SkylakeProc<string Name> : ProcessorModel<Name, HaswellModel,
                       FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C,
                       FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI,
                       FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE,
-                      FeatureSlowIncDec, FeatureSGX, FeatureVectorUAMem]>;
+                      FeatureSlowIncDec, FeatureSGX]>;
 def : SkylakeProc<"skylake">;
 def : SkylakeProc<"skx">; // Legacy alias.
 
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 76e8fad78de37c939ac32ac8e507544406940e32..7069bd68e00e9ecc113109512d63031bfa761ba1 100644
@@ -424,7 +424,7 @@ def alignedloadv8i64  : PatFrag<(ops node:$ptr),
 // setting a feature bit in the processor (on startup, for example).
 // Opteron 10h and later implement such a feature.
 def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-  return    Subtarget->hasVectorUAMem()
+  return    Subtarget->hasSSEUnalignedMem()
          || cast<LoadSDNode>(N)->getAlignment() >= 16;
 }]>;
 
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index e59395c06a5c7fe63c44d2cf9da17adfa51b4f4e..983169886ad3e2996286f40b35dab79722e38218 100644
@@ -265,7 +265,7 @@ void X86Subtarget::initializeEnvironment() {
   IsSHLDSlow = false;
   IsUAMemFast = false;
   IsUAMem32Slow = false;
-  HasVectorUAMem = false;
+  HasSSEUnalignedMem = false;
   HasCmpxchg16b = false;
   UseLeaForSP = false;
   HasSlowDivide32 = false;
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 754b5b9247178bb7f81d61cbcc8fea63e9004041..e0263d66e928d204e7c1c83d242102af85a79a7f 100644
@@ -162,9 +162,9 @@ protected:
   /// True if unaligned 32-byte memory accesses are slow.
   bool IsUAMem32Slow;
 
-  /// HasVectorUAMem - True if SIMD operations can have unaligned memory
-  /// operands. This may require setting a feature bit in the processor.
-  bool HasVectorUAMem;
+  /// True if SSE operations can have unaligned memory operands.
+  /// This may require setting a configuration bit in the processor.
+  bool HasSSEUnalignedMem;
 
   /// HasCmpxchg16b - True if this processor has the CMPXCHG16B instruction;
   /// this is true for most x86-64 chips, but not the first AMD chips.
@@ -378,7 +378,7 @@ public:
   bool isSHLDSlow() const { return IsSHLDSlow; }
   bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
   bool isUnalignedMem32Slow() const { return IsUAMem32Slow; }
-  bool hasVectorUAMem() const { return HasVectorUAMem; }
+  bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
   bool hasCmpxchg16b() const { return HasCmpxchg16b; }
   bool useLeaForSP() const { return UseLeaForSP; }
   bool hasSlowDivide32() const { return HasSlowDivide32; }
diff --git a/test/CodeGen/X86/fold-vex.ll b/test/CodeGen/X86/fold-vex.ll
index 2bb5b441c7c00d5e8307e7b0e8b08a4f81abc939..5a8b1d8cbfdf6f4d6c5d7c1e56f105efbd5aa8a8 100644
@@ -1,16 +1,31 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s
+; Use CPU parameters to ensure that a CPU-specific attribute is not overriding the AVX definition.
 
-;CHECK: @test
-; No need to load from memory. The operand will be loaded as part of th AND instr.
-;CHECK-NOT: vmovaps
-;CHECK: vandps
-;CHECK: ret
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown                  -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx             | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2                 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown                  -mattr=-avx | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx -mattr=-avx | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2     -mattr=-avx | FileCheck %s --check-prefix=SSE
 
-define void @test1(<8 x i32>* %p0, <8 x i32> %in1) nounwind {
-entry:
-  %in0 = load <8 x i32>* %p0, align 2
-  %a = and <8 x i32> %in0, %in1
-  store <8 x i32> %a, <8 x i32>* undef
-  ret void
+; No need to load unaligned operand from memory using an explicit instruction with AVX.
+; The operand should be folded into the AND instr.
+
+; With SSE, folding memory operands into math/logic ops requires 16-byte alignment
+; unless specially configured on some CPUs such as AMD Family 10H.
+
+define <4 x i32> @test1(<4 x i32>* %p0, <4 x i32> %in1) nounwind {
+  %in0 = load <4 x i32>* %p0, align 2
+  %a = and <4 x i32> %in0, %in1
+  ret <4 x i32> %a
+
+; CHECK-LABEL: @test1
+; CHECK-NOT:   vmovups
+; CHECK:       vandps (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:  ret
+
+; SSE-LABEL: @test1
+; SSE:       movups (%rdi), %xmm1
+; SSE-NEXT:  andps %xmm1, %xmm0
+; SSE-NEXT:  ret
 }
 
diff --git a/test/CodeGen/X86/2010-01-07-UAMemFeature.ll b/test/CodeGen/X86/sse-unaligned-mem-feature.ll
similarity index 74%
rename from test/CodeGen/X86/2010-01-07-UAMemFeature.ll
rename to test/CodeGen/X86/sse-unaligned-mem-feature.ll
index bb24adb41817d4bb7d47a4d496cbb984b2509e0d..15f91ee04eafec6ecb3f5233a371d7336a08520c 100644
@@ -1,5 +1,4 @@
-; RUN: llc -mcpu=yonah -mattr=vector-unaligned-mem -march=x86 < %s | FileCheck %s
-; CHECK: addps (
+; RUN: llc -mcpu=yonah -mattr=sse-unaligned-mem -march=x86 < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -8,4 +7,7 @@ define <4 x float> @foo(<4 x float>* %P, <4 x float> %In) nounwind {
        %A = load <4 x float>* %P, align 4
        %B = fadd <4 x float> %A, %In
        ret <4 x float> %B
+
+; CHECK-LABEL: @foo
+; CHECK:       addps (
 }