From e9359f427ecda293d8b0d751b4b852d96446d674 Mon Sep 17 00:00:00 2001 From: James Y Knight Date: Fri, 8 May 2015 13:47:01 +0000 Subject: [PATCH] Fix alignment checks in MergeConsecutiveStores. 1) check whether the alignment of the memory is sufficient for the *merged* store or load to be efficient. Not doing so can result in some ridiculously poor code generation, if merging creates a vector operation which must be aligned but isn't. 2) DON'T check that the alignment of each load/store is equal. If you're merging 2 4-byte stores, the first *might* have 8-byte alignment, but the second certainly will have 4-byte alignment. We do want to allow those to be merged. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@236850 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 88 +++++++++++-------- .../CodeGen/PowerPC/MergeConsecutiveStores.ll | 68 ++++++++++++++ test/CodeGen/X86/MergeConsecutiveStores.ll | 11 +-- 3 files changed, 123 insertions(+), 44 deletions(-) create mode 100644 test/CodeGen/PowerPC/MergeConsecutiveStores.ll diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index bf42aabeab2..bd61f0cbd53 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -10653,6 +10653,17 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( return true; } +static bool allowableAlignment(const SelectionDAG &DAG, + const TargetLowering &TLI, EVT EVTTy, + unsigned AS, unsigned Align) { + if (TLI.allowsMisalignedMemoryAccesses(EVTTy, AS, Align)) + return true; + + Type *Ty = EVTTy.getTypeForEVT(*DAG.getContext()); + unsigned ABIAlignment = TLI.getDataLayout()->getPrefTypeAlignment(Ty); + return (Align >= ABIAlignment); +} + bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { if (OptLevel == CodeGenOpt::None) return false; @@ -10719,10 +10730,6 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { if (!Ptr.equalBaseIndex(BasePtr)) break; - // Check that the alignment is the same. - if (Index->getAlignment() != St->getAlignment()) - break; - // The memory operands must not be volatile. if (Index->isVolatile() || Index->isIndexed()) break; @@ -10736,11 +10743,6 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { if (Index->getMemoryVT() != MemVT) break; - // We do not allow unaligned stores because we want to prevent overriding - // stores. - if (Index->getAlignment()*8 != MemVT.getSizeInBits()) - break; - // We found a potential memory operand to merge. StoreNodes.push_back(MemOpLink(Index, Ptr.Offset, Seq++)); @@ -10814,6 +10816,8 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { // The node with the lowest store address. LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; + unsigned FirstStoreAS = FirstInChain->getAddressSpace(); + unsigned FirstStoreAlign = FirstInChain->getAlignment(); // Store the constants into memory as one consecutive store. if (IsConstantSrc) { @@ -10836,21 +10840,28 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { // Find a legal type for the constant store. unsigned StoreBW = (i+1) * ElementSizeBytes * 8; EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW); - if (TLI.isTypeLegal(StoreTy)) + if (TLI.isTypeLegal(StoreTy) && + allowableAlignment(DAG, TLI, StoreTy, FirstStoreAS, + FirstStoreAlign)) { LastLegalType = i+1; // Or check whether a truncstore is legal. - else if (TLI.getTypeAction(*DAG.getContext(), StoreTy) == - TargetLowering::TypePromoteInteger) { + } else if (TLI.getTypeAction(*DAG.getContext(), StoreTy) == + TargetLowering::TypePromoteInteger) { EVT LegalizedStoredValueTy = TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType()); - if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy)) - LastLegalType = i+1; + if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && + allowableAlignment(DAG, TLI, LegalizedStoredValueTy, FirstStoreAS, + FirstStoreAlign)) { + LastLegalType = i + 1; + } } // Find a legal type for the vector store. EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1); - if (TLI.isTypeLegal(Ty)) + if (TLI.isTypeLegal(Ty) && + allowableAlignment(DAG, TLI, Ty, FirstStoreAS, FirstStoreAlign)) { LastLegalVectorType = i + 1; + } } // We only use vectors if the constant is known to be zero and the @@ -10886,7 +10897,8 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { // Find a legal type for the vector store. EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1); - if (TLI.isTypeLegal(Ty)) + if (TLI.isTypeLegal(Ty) && + allowableAlignment(DAG, TLI, Ty, FirstStoreAS, FirstStoreAlign)) NumElem = i + 1; } @@ -10913,10 +10925,6 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { if (!Ld->hasNUsesOfValue(1, 0)) break; - // Check that the alignment is the same as the stores. - if (Ld->getAlignment() != St->getAlignment()) - break; - // The memory operands must not be volatile. if (Ld->isVolatile() || Ld->isIndexed()) break; @@ -10954,6 +10962,10 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { St->getAlignment() >= RequiredAlignment) return false; + LoadSDNode *FirstLoad = cast(LoadNodes[0].MemNode); + unsigned FirstLoadAS = FirstLoad->getAddressSpace(); + unsigned FirstLoadAlign = FirstLoad->getAlignment(); + // Scan the memory operations on the chain and find the first non-consecutive // load memory address. These variables hold the index in the store node // array. @@ -10962,7 +10974,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { unsigned LastLegalVectorType = 0; unsigned LastLegalIntegerType = 0; StartAddress = LoadNodes[0].OffsetFromBase; - SDValue FirstChain = LoadNodes[0].MemNode->getChain(); + SDValue FirstChain = FirstLoad->getChain(); for (unsigned i = 1; i < LoadNodes.size(); ++i) { // All loads much share the same chain. if (LoadNodes[i].MemNode->getChain() != FirstChain) @@ -10975,13 +10987,18 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { // Find a legal type for the vector store. EVT StoreTy = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1); - if (TLI.isTypeLegal(StoreTy)) + if (TLI.isTypeLegal(StoreTy) && + allowableAlignment(DAG, TLI, StoreTy, FirstStoreAS, FirstStoreAlign) && + allowableAlignment(DAG, TLI, StoreTy, FirstLoadAS, FirstLoadAlign)) { LastLegalVectorType = i + 1; + } // Find a legal type for the integer store. unsigned StoreBW = (i+1) * ElementSizeBytes * 8; StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW); - if (TLI.isTypeLegal(StoreTy)) + if (TLI.isTypeLegal(StoreTy) && + allowableAlignment(DAG, TLI, StoreTy, FirstStoreAS, FirstStoreAlign) && + allowableAlignment(DAG, TLI, StoreTy, FirstLoadAS, FirstLoadAlign)) LastLegalIntegerType = i + 1; // Or check whether a truncstore and extload is legal. else if (TLI.getTypeAction(*DAG.getContext(), StoreTy) == @@ -10991,7 +11008,11 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValueTy, StoreTy) && TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy, StoreTy) && - TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy)) + TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy) && + allowableAlignment(DAG, TLI, LegalizedStoredValueTy, FirstStoreAS, + FirstStoreAlign) && + allowableAlignment(DAG, TLI, LegalizedStoredValueTy, FirstLoadAS, + FirstLoadAlign)) LastLegalIntegerType = i+1; } } @@ -11035,18 +11056,13 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { SDLoc LoadDL(LoadNodes[0].MemNode); SDLoc StoreDL(StoreNodes[0].MemNode); - LoadSDNode *FirstLoad = cast(LoadNodes[0].MemNode); - SDValue NewLoad = DAG.getLoad(JointMemOpVT, LoadDL, - FirstLoad->getChain(), - FirstLoad->getBasePtr(), - FirstLoad->getPointerInfo(), - false, false, false, - FirstLoad->getAlignment()); - - SDValue NewStore = DAG.getStore(LatestOp->getChain(), StoreDL, NewLoad, - FirstInChain->getBasePtr(), - FirstInChain->getPointerInfo(), false, false, - FirstInChain->getAlignment()); + SDValue NewLoad = DAG.getLoad( + JointMemOpVT, LoadDL, FirstLoad->getChain(), FirstLoad->getBasePtr(), + FirstLoad->getPointerInfo(), false, false, false, FirstLoadAlign); + + SDValue NewStore = DAG.getStore( + LatestOp->getChain(), StoreDL, NewLoad, FirstInChain->getBasePtr(), + FirstInChain->getPointerInfo(), false, false, FirstStoreAlign); // Replace one of the loads with the new load. LoadSDNode *Ld = cast(LoadNodes[0].MemNode); diff --git a/test/CodeGen/PowerPC/MergeConsecutiveStores.ll b/test/CodeGen/PowerPC/MergeConsecutiveStores.ll new file mode 100644 index 00000000000..9a5d33f8967 --- /dev/null +++ b/test/CodeGen/PowerPC/MergeConsecutiveStores.ll @@ -0,0 +1,68 @@ +; RUN: llc -march=ppc32 -mattr=+altivec < %s | FileCheck %s + +;; This test ensures that MergeConsecutiveStores does not attempt to +;; merge stores or loads when doing so would result in unaligned +;; memory operations (unless the target supports those, e.g. X86). + +;; This issue happen in other situations for other targets, but PPC +;; with Altivec extensions was chosen for the test because it does not +;; support unaligned access with AltiVec instructions. If the 4 +;; load/stores get merged to an v4i32 vector type severely bad code +;; gets generated: it painstakingly copies the values to a temporary +;; location on the stack, with vector ops, in order to then use +;; integer ops to load from the temporary stack location and store to +;; the final location. Yuck! + +%struct.X = type { i32, i32, i32, i32 } + +@fx = common global %struct.X zeroinitializer, align 4 +@fy = common global %struct.X zeroinitializer, align 4 + +;; In this test case, lvx and stvx instructions should NOT be +;; generated, as the alignment is not sufficient for it to be +;; worthwhile. + +;; CHECK-LABEL: f: +;; CHECK: lwzu +;; CHECK-NEXT: lwz +;; CHECK-NEXT: lwz +;; CHECK-NEXT: lwz +;; CHECK-NEXT: stwu +;; CHECK-NEXT: stw +;; CHECK-NEXT: stw +;; CHECK-NEXT: stw +;; CHECK-NEXT: blr +define void @f() { +entry: + %0 = load i32, i32* getelementptr inbounds (%struct.X, %struct.X* @fx, i32 0, i32 0), align 4 + %1 = load i32, i32* getelementptr inbounds (%struct.X, %struct.X* @fx, i32 0, i32 1), align 4 + %2 = load i32, i32* getelementptr inbounds (%struct.X, %struct.X* @fx, i32 0, i32 2), align 4 + %3 = load i32, i32* getelementptr inbounds (%struct.X, %struct.X* @fx, i32 0, i32 3), align 4 + store i32 %0, i32* getelementptr inbounds (%struct.X, %struct.X* @fy, i32 0, i32 0), align 4 + store i32 %1, i32* getelementptr inbounds (%struct.X, %struct.X* @fy, i32 0, i32 1), align 4 + store i32 %2, i32* getelementptr inbounds (%struct.X, %struct.X* @fy, i32 0, i32 2), align 4 + store i32 %3, i32* getelementptr inbounds (%struct.X, %struct.X* @fy, i32 0, i32 3), align 4 + ret void +} + +@gx = common global %struct.X zeroinitializer, align 16 +@gy = common global %struct.X zeroinitializer, align 16 + +;; In this test, lvx and stvx instructions SHOULD be generated, as +;; the 16-byte alignment of the new load/store is acceptable. +;; CHECK-LABEL: g: +;; CHECK: lvx +;; CHECK: stvx +;; CHECK: blr +define void @g() { +entry: + %0 = load i32, i32* getelementptr inbounds (%struct.X, %struct.X* @fx, i32 0, i32 0), align 16 + %1 = load i32, i32* getelementptr inbounds (%struct.X, %struct.X* @fx, i32 0, i32 1), align 4 + %2 = load i32, i32* getelementptr inbounds (%struct.X, %struct.X* @fx, i32 0, i32 2), align 4 + %3 = load i32, i32* getelementptr inbounds (%struct.X, %struct.X* @fx, i32 0, i32 3), align 4 + store i32 %0, i32* getelementptr inbounds (%struct.X, %struct.X* @fy, i32 0, i32 0), align 16 + store i32 %1, i32* getelementptr inbounds (%struct.X, %struct.X* @fy, i32 0, i32 1), align 4 + store i32 %2, i32* getelementptr inbounds (%struct.X, %struct.X* @fy, i32 0, i32 2), align 4 + store i32 %3, i32* getelementptr inbounds (%struct.X, %struct.X* @fy, i32 0, i32 3), align 4 + ret void +} diff --git a/test/CodeGen/X86/MergeConsecutiveStores.ll b/test/CodeGen/X86/MergeConsecutiveStores.ll index aff6fbc254b..275d4213bd2 100644 --- a/test/CodeGen/X86/MergeConsecutiveStores.ll +++ b/test/CodeGen/X86/MergeConsecutiveStores.ll @@ -291,17 +291,12 @@ block4: ; preds = %4, %.lr.ph ret void } +;; On x86, even unaligned copies can be merged to vector ops. ; CHECK-LABEL: merge_loads_no_align: ; load: -; CHECK: movl -; CHECK: movl -; CHECK: movl -; CHECK: movl +; CHECK: vmovups ; store: -; CHECK: movl -; CHECK: movl -; CHECK: movl -; CHECK: movl +; CHECK: vmovups ; CHECK: ret define void @merge_loads_no_align(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp { %a1 = icmp sgt i32 %count, 0 -- 2.40.0