From 4e1eef18555afaa48cf1c1aecaeffe2aba3edbd5 Mon Sep 17 00:00:00 2001 From: Piotr Padlewski Date: Thu, 12 Jan 2017 11:33:58 +0000 Subject: [PATCH] [Devirtualization] MemDep returns non-local !invariant.group dependencies Summary: Memory Dependence Analysis was limited to return only local dependencies for invariant.group handling. Now it returns NonLocal when it finds it and then by asking getNonLocalPointerDependency we get found dep. Thanks to this we are able to devirtualize loops! void indirect(A &a, int n) { for (int i = 0 ; i < n; i++) a.foo(); } void test(int n) { A a; indirect(a); } After inlining a.foo() will be changed to direct call, even if foo and A::A() is external (but only if vtable definition is be available). Reviewers: nlewycky, dberlin, chandlerc, rsmith Subscribers: mehdi_amini, davide, llvm-commits Differential Revision: https://reviews.llvm.org/D28137 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@291762 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../llvm/Analysis/MemoryDependenceAnalysis.h | 10 ++- lib/Analysis/MemoryDependenceAnalysis.cpp | 63 ++++++++++++++++--- test/Transforms/GVN/assume-equal.ll | 8 +-- test/Transforms/GVN/invariant.group.ll | 38 +++++++++++ test/Transforms/NewGVN/assume-equal.ll | 8 +-- test/Transforms/NewGVN/invariant.group.ll | 39 ++++++++++++ 6 files changed, 145 insertions(+), 21 deletions(-) diff --git a/include/llvm/Analysis/MemoryDependenceAnalysis.h b/include/llvm/Analysis/MemoryDependenceAnalysis.h index 33dbd22f7a2..a401887016c 100644 --- a/include/llvm/Analysis/MemoryDependenceAnalysis.h +++ b/include/llvm/Analysis/MemoryDependenceAnalysis.h @@ -302,6 +302,10 @@ private: NonLocalPointerInfo() : Size(MemoryLocation::UnknownSize) {} }; + /// Cache storing single nonlocal def for the instruction. + /// It is set when nonlocal def would be found in function returning only + /// local dependencies. + DenseMap NonLocalDefsCache; /// This map stores the cached results of doing a pointer lookup at the /// bottom of a block. /// @@ -441,9 +445,9 @@ public: /// This analysis looks for other loads and stores with invariant.group /// metadata and the same pointer operand. Returns Unknown if it does not /// find anything, and Def if it can be assumed that 2 instructions load or - /// store the same value. - /// FIXME: This analysis works only on single block because of restrictions - /// at the call site. + /// store the same value and NonLocal which indicate that non-local Def was + /// found, which can be retrieved by calling getNonLocalPointerDependency + /// with the same queried instruction. MemDepResult getInvariantGroupPointerDependency(LoadInst *LI, BasicBlock *BB); /// Looks at a memory location for a load (specified by MemLocBase, Offs, and diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp index acdf14733b0..66a0d145dcd 100644 --- a/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -323,17 +323,28 @@ MemDepResult MemoryDependenceResults::getPointerDependencyFrom( const MemoryLocation &MemLoc, bool isLoad, BasicBlock::iterator ScanIt, BasicBlock *BB, Instruction *QueryInst, unsigned *Limit) { + MemDepResult InvariantGroupDependency = MemDepResult::getUnknown(); if (QueryInst != nullptr) { if (auto *LI = dyn_cast(QueryInst)) { - MemDepResult InvariantGroupDependency = - getInvariantGroupPointerDependency(LI, BB); + InvariantGroupDependency = getInvariantGroupPointerDependency(LI, BB); if (InvariantGroupDependency.isDef()) return InvariantGroupDependency; } } - return getSimplePointerDependencyFrom(MemLoc, isLoad, ScanIt, BB, QueryInst, - Limit); + MemDepResult SimpleDep = getSimplePointerDependencyFrom( + MemLoc, isLoad, ScanIt, BB, QueryInst, Limit); + if (SimpleDep.isDef()) + return SimpleDep; + // Non-local invariant group dependency indicates there is non local Def + // (it only returns nonLocal if it finds nonLocal def), which is better than + // local clobber and everything else. + if (InvariantGroupDependency.isNonLocal()) + return InvariantGroupDependency; + + assert(InvariantGroupDependency.isUnknown() && + "InvariantGroupDependency should be only unknown at this point"); + return SimpleDep; } MemDepResult @@ -358,6 +369,20 @@ MemoryDependenceResults::getInvariantGroupPointerDependency(LoadInst *LI, // Queue to process all pointers that are equivalent to load operand. SmallVector LoadOperandsQueue; LoadOperandsQueue.push_back(LoadOperand); + + Instruction *ClosestDependency = nullptr; + // Order of instructions in uses list is unpredictible. In order to always + // get the same result, we will look for the closest dominance. + auto GetClosestDependency = [this](Instruction *Best, Instruction *Other) { + assert(Other && "Must call it with not null instruction"); + if (Best == nullptr || DT.dominates(Best, Other)) + return Other; + return Best; + }; + + + // FIXME: This loop is O(N^2) because dominates can be O(n) and in worst case + // we will see all the instructions. This should be fixed in MSSA. while (!LoadOperandsQueue.empty()) { const Value *Ptr = LoadOperandsQueue.pop_back_val(); assert(Ptr && !isa(Ptr) && @@ -388,12 +413,24 @@ MemoryDependenceResults::getInvariantGroupPointerDependency(LoadInst *LI, // If we hit load/store with the same invariant.group metadata (and the // same pointer operand) we can assume that value pointed by pointer // operand didn't change. - if ((isa(U) || isa(U)) && U->getParent() == BB && + if ((isa(U) || isa(U)) && U->getMetadata(LLVMContext::MD_invariant_group) == InvariantGroupMD) - return MemDepResult::getDef(U); + ClosestDependency = GetClosestDependency(ClosestDependency, U); } } - return MemDepResult::getUnknown(); + + if (!ClosestDependency) + return MemDepResult::getUnknown(); + if (ClosestDependency->getParent() == BB) + return MemDepResult::getDef(ClosestDependency); + // Def(U) can't be returned here because it is non-local. If local + // dependency won't be found then return nonLocal counting that the + // user will call getNonLocalPointerDependency, which will return cached + // result. + NonLocalDefsCache.try_emplace( + LI, NonLocalDepResult(ClosestDependency->getParent(), + MemDepResult::getDef(ClosestDependency), nullptr)); + return MemDepResult::getNonLocal(); } MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( @@ -877,7 +914,17 @@ void MemoryDependenceResults::getNonLocalPointerDependency( assert(Loc.Ptr->getType()->isPointerTy() && "Can't get pointer deps of a non-pointer!"); Result.clear(); - + { + // Check if there is cached Def with invariant.group. FIXME: cache might be + // invalid if cached instruction would be removed between call to + // getPointerDependencyFrom and this function. + auto NonLocalDefIt = NonLocalDefsCache.find(QueryInst); + if (NonLocalDefIt != NonLocalDefsCache.end()) { + Result.push_back(std::move(NonLocalDefIt->second)); + NonLocalDefsCache.erase(NonLocalDefIt); + return; + } + } // This routine does not expect to deal with volatile instructions. // Doing so would require piping through the QueryInst all the way through. // TODO: volatiles can't be elided, but they can be reordered with other diff --git a/test/Transforms/GVN/assume-equal.ll b/test/Transforms/GVN/assume-equal.ll index d423c1685e1..941f14ce402 100644 --- a/test/Transforms/GVN/assume-equal.ll +++ b/test/Transforms/GVN/assume-equal.ll @@ -65,22 +65,20 @@ if.then: ; preds = %entry %vtable1 = load i8**, i8*** %1, align 8, !invariant.group !0 %vtable2.cast = bitcast i8** %vtable1 to i32 (%struct.A*)** %call1 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable2.cast, align 8 -; FIXME: those loads could be also direct, but right now the invariant.group -; analysis works only on single block -; CHECK-NOT: call i32 @_ZN1A3fooEv( +; CHECK: call i32 @_ZN1A3fooEv( %callx = tail call i32 %call1(%struct.A* %0) #1 %vtable2 = load i8**, i8*** %1, align 8, !invariant.group !0 %vtable3.cast = bitcast i8** %vtable2 to i32 (%struct.A*)** %call4 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable3.cast, align 8 -; CHECK-NOT: call i32 @_ZN1A3fooEv( +; CHECK: call i32 @_ZN1A3fooEv( %cally = tail call i32 %call4(%struct.A* %0) #1 %b = bitcast i8* %call to %struct.A** %vtable3 = load %struct.A*, %struct.A** %b, align 8, !invariant.group !0 %vtable4.cast = bitcast %struct.A* %vtable3 to i32 (%struct.A*)** %vfun = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable4.cast, align 8 -; CHECK-NOT: call i32 @_ZN1A3fooEv( +; CHECK: call i32 @_ZN1A3fooEv( %unknown = tail call i32 %vfun(%struct.A* %0) #1 br label %if.end diff --git a/test/Transforms/GVN/invariant.group.ll b/test/Transforms/GVN/invariant.group.ll index d0b32d7f3dd..6f1f357cad6 100644 --- a/test/Transforms/GVN/invariant.group.ll +++ b/test/Transforms/GVN/invariant.group.ll @@ -392,6 +392,44 @@ define void @testNotGlobal() { ret void } +; CHECK-LABEL: define void @handling_loops() +define void @handling_loops() { + %a = alloca %struct.A, align 8 + %1 = bitcast %struct.A* %a to i8* + %2 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1A, i64 0, i64 2) to i32 (...)**), i32 (...)*** %2, align 8, !invariant.group !0 + %3 = load i8, i8* @unknownPtr, align 4 + %4 = icmp sgt i8 %3, 0 + br i1 %4, label %.lr.ph.i, label %_Z2g2R1A.exit + +.lr.ph.i: ; preds = %0 + %5 = bitcast %struct.A* %a to void (%struct.A*)*** + %6 = load i8, i8* @unknownPtr, align 4 + %7 = icmp sgt i8 %6, 1 + br i1 %7, label %._crit_edge.preheader, label %_Z2g2R1A.exit + +._crit_edge.preheader: ; preds = %.lr.ph.i + br label %._crit_edge + +._crit_edge: ; preds = %._crit_edge.preheader, %._crit_edge + %8 = phi i8 [ %10, %._crit_edge ], [ 1, %._crit_edge.preheader ] + %.pre = load void (%struct.A*)**, void (%struct.A*)*** %5, align 8, !invariant.group !0 + %9 = load void (%struct.A*)*, void (%struct.A*)** %.pre, align 8 + ; CHECK: call void @_ZN1A3fooEv(%struct.A* nonnull %a) + call void %9(%struct.A* nonnull %a) #3 + ; CHECK-NOT: call void % + %10 = add nuw nsw i8 %8, 1 + %11 = load i8, i8* @unknownPtr, align 4 + %12 = icmp slt i8 %10, %11 + br i1 %12, label %._crit_edge, label %_Z2g2R1A.exit.loopexit + +_Z2g2R1A.exit.loopexit: ; preds = %._crit_edge + br label %_Z2g2R1A.exit + +_Z2g2R1A.exit: ; preds = %_Z2g2R1A.exit.loopexit, %.lr.ph.i, %0 + ret void +} + declare void @foo(i8*) declare void @foo2(i8*, i8) diff --git a/test/Transforms/NewGVN/assume-equal.ll b/test/Transforms/NewGVN/assume-equal.ll index b6c2a7afb29..7e009192064 100644 --- a/test/Transforms/NewGVN/assume-equal.ll +++ b/test/Transforms/NewGVN/assume-equal.ll @@ -66,22 +66,20 @@ if.then: ; preds = %entry %vtable1 = load i8**, i8*** %1, align 8, !invariant.group !0 %vtable2.cast = bitcast i8** %vtable1 to i32 (%struct.A*)** %call1 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable2.cast, align 8 -; FIXME: those loads could be also direct, but right now the invariant.group -; analysis works only on single block -; CHECK-NOT: call i32 @_ZN1A3fooEv( +; CHECK: call i32 @_ZN1A3fooEv( %callx = tail call i32 %call1(%struct.A* %0) #1 %vtable2 = load i8**, i8*** %1, align 8, !invariant.group !0 %vtable3.cast = bitcast i8** %vtable2 to i32 (%struct.A*)** %call4 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable3.cast, align 8 -; CHECK-NOT: call i32 @_ZN1A3fooEv( +; CHECK: call i32 @_ZN1A3fooEv( %cally = tail call i32 %call4(%struct.A* %0) #1 %b = bitcast i8* %call to %struct.A** %vtable3 = load %struct.A*, %struct.A** %b, align 8, !invariant.group !0 %vtable4.cast = bitcast %struct.A* %vtable3 to i32 (%struct.A*)** %vfun = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable4.cast, align 8 -; CHECK-NOT: call i32 @_ZN1A3fooEv( +; CHECK: call i32 @_ZN1A3fooEv( %unknown = tail call i32 %vfun(%struct.A* %0) #1 br label %if.end diff --git a/test/Transforms/NewGVN/invariant.group.ll b/test/Transforms/NewGVN/invariant.group.ll index 80c6e05a8e2..c421df6bd3b 100644 --- a/test/Transforms/NewGVN/invariant.group.ll +++ b/test/Transforms/NewGVN/invariant.group.ll @@ -393,6 +393,45 @@ define void @testNotGlobal() { ret void } +; CHECK-LABEL: define void @handling_loops() +define void @handling_loops() { + %a = alloca %struct.A, align 8 + %1 = bitcast %struct.A* %a to i8* + %2 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1A, i64 0, i64 2) to i32 (...)**), i32 (...)*** %2, align 8, !invariant.group !0 + %3 = load i8, i8* @unknownPtr, align 4 + %4 = icmp sgt i8 %3, 0 + br i1 %4, label %.lr.ph.i, label %_Z2g2R1A.exit + +.lr.ph.i: ; preds = %0 + %5 = bitcast %struct.A* %a to void (%struct.A*)*** + %6 = load i8, i8* @unknownPtr, align 4 + %7 = icmp sgt i8 %6, 1 + br i1 %7, label %._crit_edge.preheader, label %_Z2g2R1A.exit + +._crit_edge.preheader: ; preds = %.lr.ph.i + br label %._crit_edge + +._crit_edge: ; preds = %._crit_edge.preheader, %._crit_edge + %8 = phi i8 [ %10, %._crit_edge ], [ 1, %._crit_edge.preheader ] + %.pre = load void (%struct.A*)**, void (%struct.A*)*** %5, align 8, !invariant.group !0 + %9 = load void (%struct.A*)*, void (%struct.A*)** %.pre, align 8 +; CHECK: call void @_ZN1A3fooEv(%struct.A* nonnull %a) + call void %9(%struct.A* nonnull %a) #3 + +; CHECK-NOT: call void % + %10 = add nuw nsw i8 %8, 1 + %11 = load i8, i8* @unknownPtr, align 4 + %12 = icmp slt i8 %10, %11 + br i1 %12, label %._crit_edge, label %_Z2g2R1A.exit.loopexit + +_Z2g2R1A.exit.loopexit: ; preds = %._crit_edge + br label %_Z2g2R1A.exit + +_Z2g2R1A.exit: ; preds = %_Z2g2R1A.exit.loopexit, %.lr.ph.i, %0 + ret void +} + declare void @foo(i8*) declare void @foo2(i8*, i8) -- 2.40.0