[X86][AMDGPU][DAGCombiner] Move call to allowsMemoryAccess into isLoadBitCastBeneficial/isStoreBitCastBeneficial to allow X86 to bypass it

author Craig Topper <craig.topper@intel.com>

Tue, 9 Jul 2019 19:55:28 +0000 (19:55 +0000)

committer Craig Topper <craig.topper@intel.com>

Tue, 9 Jul 2019 19:55:28 +0000 (19:55 +0000)
author Craig Topper <craig.topper@intel.com>
Tue, 9 Jul 2019 19:55:28 +0000 (19:55 +0000)
committer Craig Topper <craig.topper@intel.com>
Tue, 9 Jul 2019 19:55:28 +0000 (19:55 +0000)
diff --git a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h

index 8b70075d3aea01201e9f73f81bf3e2d75e96dac0..d5cca60bb1b27843b5b38c734d3ca983aeda1883 100644 (file)
--- a/include/llvm/CodeGen/TargetLowering.h
+++ b/include/llvm/CodeGen/TargetLowering.h
@@ -401,8 +401,9 @@ public:
    /// efficiently, casting the load to a smaller vector of larger types and
    /// loading is more efficient, however, this can be undone by optimizations in
    /// dag combiner.
-  virtual bool isLoadBitCastBeneficial(EVT LoadVT,
-                                       EVT BitcastVT) const {
+  virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
+                                       const SelectionDAG &DAG,
+                                       const MachineMemOperand &MMO) const {
      // Don't do if we could do an indexed load on the original type, but not on
      // the new one.
      if (!LoadVT.isSimple() || !BitcastVT.isSimple())
@@ -416,14 +417,18 @@ public:
          getTypeToPromoteTo(ISD::LOAD, LoadMVT) == BitcastVT.getSimpleVT())
        return false;
  
-    return true;
+    bool Fast = false;
+    return allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), BitcastVT,
+                              MMO, &Fast) && Fast;
    }
  
    /// Return true if the following transform is beneficial:
    /// (store (y (conv x)), y*)) -> (store x, (x*))
-  virtual bool isStoreBitCastBeneficial(EVT StoreVT, EVT BitcastVT) const {
+  virtual bool isStoreBitCastBeneficial(EVT StoreVT, EVT BitcastVT,
+                                        const SelectionDAG &DAG,
+                                        const MachineMemOperand &MMO) const {
      // Default to the same logic as loads.
-    return isLoadBitCastBeneficial(StoreVT, BitcastVT);
+    return isLoadBitCastBeneficial(StoreVT, BitcastVT, DAG, MMO);
    }
  
    /// Return true if it is expected to be cheaper to do a store of a non-zero
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

index cd5de4c1400411e0c27c8dc8c2a5bbf9cfdeedb8..09e1195e1fd2b31bdb2ef2d211238fce7fce135d 100644 (file)
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11040,14 +11040,11 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
        // as we assume software couldn't rely on the number of accesses of an
        // illegal type.
        ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
-       TLI.isOperationLegal(ISD::LOAD, VT)) &&
-      TLI.isLoadBitCastBeneficial(N0.getValueType(), VT)) {
+       TLI.isOperationLegal(ISD::LOAD, VT))) {
      LoadSDNode *LN0 = cast<LoadSDNode>(N0);
  
-    bool Fast = false;
-    if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
-                               *LN0->getMemOperand(), &Fast) &&
-        Fast) {
+    if (TLI.isLoadBitCastBeneficial(N0.getValueType(), VT, DAG,
+                                    *LN0->getMemOperand())) {
        SDValue Load =
            DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
                        LN0->getPointerInfo(), LN0->getAlignment(),
@@ -16174,15 +16171,11 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
      // illegal type.
      if (((!LegalOperations && !ST->isVolatile()) ||
           TLI.isOperationLegal(ISD::STORE, SVT)) &&
-        TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT)) {
-      bool Fast = false;
-      if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), SVT,
-                                 *ST->getMemOperand(), &Fast) &&
-          Fast) {
-        return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
-                            ST->getPointerInfo(), ST->getAlignment(),
-                            ST->getMemOperand()->getFlags(), ST->getAAInfo());
-      }
+        TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT,
+                                     DAG, *ST->getMemOperand())) {
+      return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
+                          ST->getPointerInfo(), ST->getAlignment(),
+                          ST->getMemOperand()->getFlags(), ST->getAAInfo());
      }
    }
  
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

index 766294dee2357cc38b6bef13b440c1fa7c7606d8..0ccd58d44aaf0653b6cfe6198c6decccd2262f69 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -719,8 +719,9 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
    return (OldSize < 32);
  }
  
-bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
-                                                   EVT CastTy) const {
+bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
+                                                   const SelectionDAG &DAG,
+                                                   const MachineMemOperand &MMO) const {
  
    assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
  
@@ -730,8 +731,12 @@ bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
    unsigned LScalarSize = LoadTy.getScalarSizeInBits();
    unsigned CastScalarSize = CastTy.getScalarSizeInBits();
  
-  return (LScalarSize < CastScalarSize) ||
-         (CastScalarSize >= 32);
+  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
+    return false;
+
+  bool Fast = false;
+  return allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), CastTy,
+                            MMO, &Fast) && Fast;
  }
  
  // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h

index 9723fc3ec660decda353b1da4aa1ae3f7993c663..40ff24f07547436aafee4a5ef637b2afeee2616e 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -182,7 +182,8 @@ public:
                               ISD::LoadExtType ExtType,
                               EVT ExtVT) const override;
  
-  bool isLoadBitCastBeneficial(EVT, EVT) const final;
+  bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG,
+                               const MachineMemOperand &MMO) const final;
  
    bool storeOfVectorConstantIsCheap(EVT MemVT,
                                      unsigned NumElem,
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 370ecefd273aa744ced2c978847d2caa0abd40c9..3cab44b0ac1394beda54197f1c837148e66e3322 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -4941,8 +4941,9 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const {
    return Subtarget.hasLZCNT();
  }
  
-bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
-                                                EVT BitcastVT) const {
+bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
+                                                const SelectionDAG &DAG,
+                                                const MachineMemOperand &MMO) const {
    if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
        BitcastVT.getVectorElementType() == MVT::i1)
      return false;
@@ -4950,7 +4951,12 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
    if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
      return false;
  
-  return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT);
+  // If both types are legal vectors, it's always ok to convert them.
+  if (LoadVT.isVector() && BitcastVT.isVector() &&
+      isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
+    return true;
+
+  return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
  }
  
  bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h

index 0631fb7dfe82293c28a61ad78add3835eba0154c..e0be03bc3f9d558a0f8407e655cca8408e01cabf 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -1127,7 +1127,9 @@ namespace llvm {
        return NumElem > 2;
      }
  
-    bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT) const override;
+    bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
+                                 const SelectionDAG &DAG,
+                                 const MachineMemOperand &MMO) const override;
  
      /// Intel processors have a unified instruction and data cache
      const char * getClearCacheBuiltinName() const override {
diff --git a/test/CodeGen/X86/merge-consecutive-stores-nt.ll b/test/CodeGen/X86/merge-consecutive-stores-nt.ll

index b742cab5448928843a9f20993938e2bbeaa69d6e..8df92e76def1228f0844aa53bc1dd8df6c3e3a71 100644 (file)
--- a/test/CodeGen/X86/merge-consecutive-stores-nt.ll
+++ b/test/CodeGen/X86/merge-consecutive-stores-nt.ll
@@ -306,27 +306,25 @@ define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) no
  ; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
  ; X86-SSE2-NEXT:    movd %xmm0, %ecx
  ; X86-SSE2-NEXT:    movntil %ecx, (%eax)
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
  ; X86-SSE2-NEXT:    movd %xmm2, %ecx
  ; X86-SSE2-NEXT:    movntil %ecx, 12(%eax)
  ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
  ; X86-SSE2-NEXT:    movd %xmm2, %ecx
  ; X86-SSE2-NEXT:    movntil %ecx, 8(%eax)
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
  ; X86-SSE2-NEXT:    movd %xmm0, %ecx
  ; X86-SSE2-NEXT:    movntil %ecx, 4(%eax)
  ; X86-SSE2-NEXT:    movd %xmm1, %ecx
  ; X86-SSE2-NEXT:    movntil %ecx, 16(%eax)
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
  ; X86-SSE2-NEXT:    movd %xmm0, %ecx
  ; X86-SSE2-NEXT:    movntil %ecx, 28(%eax)
  ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
  ; X86-SSE2-NEXT:    movd %xmm0, %ecx
  ; X86-SSE2-NEXT:    movntil %ecx, 24(%eax)
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; X86-SSE2-NEXT:    movd %xmm1, %ecx
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    movd %xmm0, %ecx
  ; X86-SSE2-NEXT:    movntil %ecx, 20(%eax)
  ; X86-SSE2-NEXT:    retl
  ;
@@ -421,27 +419,25 @@ define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind {
  ; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
  ; X86-SSE2-NEXT:    movd %xmm0, %ecx
  ; X86-SSE2-NEXT:    movntil %ecx, (%eax)
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
  ; X86-SSE2-NEXT:    movd %xmm2, %ecx
  ; X86-SSE2-NEXT:    movntil %ecx, 12(%eax)
  ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
  ; X86-SSE2-NEXT:    movd %xmm2, %ecx
  ; X86-SSE2-NEXT:    movntil %ecx, 8(%eax)
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
  ; X86-SSE2-NEXT:    movd %xmm0, %ecx
  ; X86-SSE2-NEXT:    movntil %ecx, 4(%eax)
  ; X86-SSE2-NEXT:    movd %xmm1, %ecx
  ; X86-SSE2-NEXT:    movntil %ecx, 16(%eax)
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
  ; X86-SSE2-NEXT:    movd %xmm0, %ecx
  ; X86-SSE2-NEXT:    movntil %ecx, 28(%eax)
  ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
  ; X86-SSE2-NEXT:    movd %xmm0, %ecx
  ; X86-SSE2-NEXT:    movntil %ecx, 24(%eax)
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; X86-SSE2-NEXT:    movd %xmm1, %ecx
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    movd %xmm0, %ecx
  ; X86-SSE2-NEXT:    movntil %ecx, 20(%eax)
  ; X86-SSE2-NEXT:    retl
  ;
diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll

index f37cd88101bed9cfd6a6b1dbcfe5e0589d741fa9..f4a4bb5e6b4fee3053ad5fefda3efd4eca9b999c 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -1,12 +1,12 @@
  ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX1
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL
  
  define <4 x i32> @shuffle_v4i32_0001(<4 x i32> %a, <4 x i32> %b) {
  ; SSE-LABEL: shuffle_v4i32_0001:
author	Craig Topper <craig.topper@intel.com>
	Tue, 9 Jul 2019 19:55:28 +0000 (19:55 +0000)
committer	Craig Topper <craig.topper@intel.com>
	Tue, 9 Jul 2019 19:55:28 +0000 (19:55 +0000)
include/llvm/CodeGen/TargetLowering.h		patch \| blob \| history
lib/CodeGen/SelectionDAG/DAGCombiner.cpp		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUISelLowering.cpp		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUISelLowering.h		patch \| blob \| history
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
lib/Target/X86/X86ISelLowering.h		patch \| blob \| history
test/CodeGen/X86/merge-consecutive-stores-nt.ll		patch \| blob \| history
test/CodeGen/X86/vector-shuffle-128-v4.ll		patch \| blob \| history