// (and (srl x, (sub c1, c2)), MASK)
// Only fold this if the inner shift has no other uses -- if it does, folding
// this will increase the total number of instructions.
+ // TODO - drop hasOneUse requirement if c1 == c2?
+ // TODO - support non-uniform vector shift amounts.
if (N1C && N0.getOpcode() == ISD::SRL && N0.hasOneUse() &&
TLI.shouldFoldConstantShiftPairToMask(N, Level)) {
if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
}
// fold (srl (shl x, c), c) -> (and x, cst2)
+ // TODO - (srl (shl x, c1), c2).
if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 &&
isConstantOrConstantVector(N1, /* NoOpaques */ true)) {
SDLoc DL(N);
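
To sanity-check the two folds above, here is a small standalone sketch (mine, not part of the patch) verifying the underlying scalar identities with illustrative amounts c1 = 2, c2 = 5 and c = 5:

// Standalone check of the shift-pair identities the combiner relies on.
// All constants are illustrative; nothing below is LLVM API.
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X : {0u, 0xFFFFFFFFu, 0x12345678u, 0x80000001u}) {
    // (shl (srl x, c1), c2) with c2 > c1 (here c1 = 2, c2 = 5) equals
    // (and (shl x, c2 - c1), MASK), where MASK keeps the bits that
    // survive both shifts.
    const uint32_t Mask = (UINT32_MAX >> 2) << 5; // 0xFFFFFFE0
    assert(((X >> 2) << 5) == ((X << 3) & Mask));

    // (srl (shl x, c), c) with c = 5 equals (and x, cst2), where cst2
    // clears the c bits the shl pushes out the top.
    assert(((X << 5) >> 5) == (X & (UINT32_MAX >> 5))); // cst2 = 0x07FFFFFF
  }
}
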
"Prefer horizontal vector math instructions (haddp, phsub, etc.) over "
"normal vector instructions with shuffles", [FeatureSSE3]>;
+def FeatureFastVectorShiftMasks
+ : SubtargetFeature<
+ "fast-vector-shift-masks", "HasFastVectorShiftMasks", "true",
+ "Prefer a left/right vector logical shift pair over a shift+and pair">;
+
// Merge branches using three-way conditional code.
def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch",
"ThreewayBranchProfitable", "true",
FeaturePOPCNT,
FeatureSlowSHLD,
FeatureLAHFSAHF,
- FeatureFast15ByteNOP];
+ FeatureFast15ByteNOP,
+ FeatureFastVectorShiftMasks];
list<SubtargetFeature> BtVer1Features = BtVer1InheritableFeatures;
// Jaguar
bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
const SDNode *N, CombineLevel Level) const {
- // TODO - some targets prefer immediate vector shifts to shift+mask.
+ assert(((N->getOpcode() == ISD::SHL &&
+ N->getOperand(0).getOpcode() == ISD::SRL) ||
+ (N->getOpcode() == ISD::SRL &&
+ N->getOperand(0).getOpcode() == ISD::SHL)) &&
+ "Expected shift-shift mask");
+
+ if (Subtarget.hasFastVectorShiftMasks() && N->getValueType(0).isVector()) {
+ // Only fold if the shift amounts are equal, so the pair folds to an AND.
+ // TODO - we should also fold non-uniform vector shift amounts, but the
+ // combiner only handles splat amounts so far.
+ return N->getOperand(1) == N->getOperand(0).getOperand(1);
+ }
return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
}
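
The net effect of the override: with the feature set, only the degenerate pair with equal amounts is folded (it collapses to a single AND), while unequal amounts keep the cheaper vector shift pair. A hypothetical standalone emulation of that decision (the helper name, inputs, and the base-class behavior noted in the comment are my assumptions, not the LLVM interface):

#include <cstdint>
#include <iostream>

// Emulates the X86 hook above for a vector node: fold to a mask only when
// the two shift amounts match. Illustrative helper, not LLVM API.
static bool shouldFoldToMask(bool FastVectorShiftMasks, bool IsVector,
                             uint32_t InnerAmt, uint32_t OuterAmt) {
  if (FastVectorShiftMasks && IsVector)
    return InnerAmt == OuterAmt;
  // The TargetLoweringBase default is understood to be "always fold".
  return true;
}

int main() {
  // btver1-style target, v4i32 shifts:
  std::cout << shouldFoldToMask(true, true, 2, 5) << '\n'; // 0: keep psrld+pslld
  std::cout << shouldFoldToMask(true, true, 5, 5) << '\n'; // 1: fold to pand
}
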
/// Try harder to combine to horizontal vector ops if they are fast.
bool HasFastHorizontalOps = false;
+ /// Prefer a left/right vector logical shift pair over a shift+and pair.
+ bool HasFastVectorShiftMasks = false;
+
/// Use a retpoline thunk rather than indirect calls to block speculative
/// execution.
bool UseRetpolineIndirectCalls = false;
bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
bool hasFastBEXTR() const { return HasFastBEXTR; }
bool hasFastHorizontalOps() const { return HasFastHorizontalOps; }
+ bool hasFastVectorShiftMasks() const { return HasFastVectorShiftMasks; }
bool hasMacroFusion() const { return HasMacroFusion; }
bool hasBranchFusion() const { return HasBranchFusion; }
bool hasERMSB() const { return HasERMSB; }
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,MASK
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2,+fast-vector-shift-masks | FileCheck %s --check-prefixes=CHECK,SHIFT
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=btver1 | FileCheck %s --check-prefixes=CHECK,SHIFT
; SSE2 Logical Shift Left
}
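
For contrast with the mixed-amount test below, a hypothetical companion with equal shift amounts (not in the original file; the expected codegen is my prediction, not verified llc output) should collapse to a single pand under both RUN configurations, since the hook returns true when the amounts match:

; Hypothetical companion case: equal shift amounts are expected to fold to
; a single mask (pand with a 0x07FFFFFF splat) on both MASK and SHIFT runs.
define <4 x i32> @srl_shl_v4i32(<4 x i32> %x) nounwind {
  %shl = shl <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %srl = lshr <4 x i32> %shl, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %srl
}
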
define <4 x i32> @shl_srl_v4i32(<4 x i32> %x) nounwind {
-; CHECK-LABEL: shl_srl_v4i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pslld $3, %xmm0
-; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT: retq
+; MASK-LABEL: shl_srl_v4i32:
+; MASK: # %bb.0:
+; MASK-NEXT: pslld $3, %xmm0
+; MASK-NEXT: pand {{.*}}(%rip), %xmm0
+; MASK-NEXT: retq
+;
+; SHIFT-LABEL: shl_srl_v4i32:
+; SHIFT: # %bb.0:
+; SHIFT-NEXT: psrld $2, %xmm0
+; SHIFT-NEXT: pslld $5, %xmm0
+; SHIFT-NEXT: retq
%srl = lshr <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
%shl = shl <4 x i32> %srl, <i32 5, i32 5, i32 5, i32 5>
ret <4 x i32> %shl