From: Krzysztof Parzyszek Date: Mon, 1 Jul 2019 13:50:47 +0000 (+0000) Subject: [Hexagon] Rework VLCR algorithm X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=b618f1590304ccc49fd629dd0249817a4278a4f6;p=llvm [Hexagon] Rework VLCR algorithm Add code to catch pattern for commutative instructions for VLCR. Patch by Suyog Sarda. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@364770 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp b/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp index 7236dd283b8..e5df1d456c1 100644 --- a/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp +++ b/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp @@ -238,10 +238,17 @@ namespace { // used over the backedge. This is teh value that gets reused from a // previous iteration. Instruction *BackedgeInst = nullptr; + std::map<Instruction *, DepChain *> DepChains; + int Iterations = -1; ReuseValue() = default; - void reset() { Inst2Replace = nullptr; BackedgeInst = nullptr; } + void reset() { + Inst2Replace = nullptr; + BackedgeInst = nullptr; + DepChains.clear(); + Iterations = -1; + } bool isDefined() { return Inst2Replace != nullptr; } }; @@ -288,10 +295,10 @@ namespace { void findDepChainFromPHI(Instruction *I, DepChain &D); void reuseValue(); Value *findValueInBlock(Value *Op, BasicBlock *BB); - bool isDepChainBtwn(Instruction *I1, Instruction *I2, int Iters); - DepChain *getDepChainBtwn(Instruction *I1, Instruction *I2); + DepChain *getDepChainBtwn(Instruction *I1, Instruction *I2, int Iters); bool isEquivalentOperation(Instruction *I1, Instruction *I2); bool canReplace(Instruction *I); + bool isCallInstCommutative(CallInst *C); }; } // end anonymous namespace @@ -326,6 +333,70 @@ bool HexagonVectorLoopCarriedReuse::runOnLoop(Loop *L, LPPassManager &LPM) { return doVLCR(); } +bool HexagonVectorLoopCarriedReuse::isCallInstCommutative(CallInst *C) { + switch (C->getCalledFunction()->getIntrinsicID()) { + case 
Intrinsic::hexagon_V6_vaddb: + case Intrinsic::hexagon_V6_vaddb_128B: + case Intrinsic::hexagon_V6_vaddh: + case Intrinsic::hexagon_V6_vaddh_128B: + case Intrinsic::hexagon_V6_vaddw: + case Intrinsic::hexagon_V6_vaddw_128B: + case Intrinsic::hexagon_V6_vaddubh: + case Intrinsic::hexagon_V6_vaddubh_128B: + case Intrinsic::hexagon_V6_vadduhw: + case Intrinsic::hexagon_V6_vadduhw_128B: + case Intrinsic::hexagon_V6_vaddhw: + case Intrinsic::hexagon_V6_vaddhw_128B: + case Intrinsic::hexagon_V6_vmaxb: + case Intrinsic::hexagon_V6_vmaxb_128B: + case Intrinsic::hexagon_V6_vmaxh: + case Intrinsic::hexagon_V6_vmaxh_128B: + case Intrinsic::hexagon_V6_vmaxw: + case Intrinsic::hexagon_V6_vmaxw_128B: + case Intrinsic::hexagon_V6_vmaxub: + case Intrinsic::hexagon_V6_vmaxub_128B: + case Intrinsic::hexagon_V6_vmaxuh: + case Intrinsic::hexagon_V6_vmaxuh_128B: + case Intrinsic::hexagon_V6_vminub: + case Intrinsic::hexagon_V6_vminub_128B: + case Intrinsic::hexagon_V6_vminuh: + case Intrinsic::hexagon_V6_vminuh_128B: + case Intrinsic::hexagon_V6_vminb: + case Intrinsic::hexagon_V6_vminb_128B: + case Intrinsic::hexagon_V6_vminh: + case Intrinsic::hexagon_V6_vminh_128B: + case Intrinsic::hexagon_V6_vminw: + case Intrinsic::hexagon_V6_vminw_128B: + case Intrinsic::hexagon_V6_vmpyub: + case Intrinsic::hexagon_V6_vmpyub_128B: + case Intrinsic::hexagon_V6_vmpyuh: + case Intrinsic::hexagon_V6_vmpyuh_128B: + case Intrinsic::hexagon_V6_vavgub: + case Intrinsic::hexagon_V6_vavgub_128B: + case Intrinsic::hexagon_V6_vavgh: + case Intrinsic::hexagon_V6_vavgh_128B: + case Intrinsic::hexagon_V6_vavguh: + case Intrinsic::hexagon_V6_vavguh_128B: + case Intrinsic::hexagon_V6_vavgw: + case Intrinsic::hexagon_V6_vavgw_128B: + case Intrinsic::hexagon_V6_vavgb: + case Intrinsic::hexagon_V6_vavgb_128B: + case Intrinsic::hexagon_V6_vavguw: + case Intrinsic::hexagon_V6_vavguw_128B: + case Intrinsic::hexagon_V6_vabsdiffh: + case Intrinsic::hexagon_V6_vabsdiffh_128B: + case Intrinsic::hexagon_V6_vabsdiffub: + 
case Intrinsic::hexagon_V6_vabsdiffub_128B: + case Intrinsic::hexagon_V6_vabsdiffuh: + case Intrinsic::hexagon_V6_vabsdiffuh_128B: + case Intrinsic::hexagon_V6_vabsdiffw: + case Intrinsic::hexagon_V6_vabsdiffw_128B: + return true; + default: + return false; + } +} + bool HexagonVectorLoopCarriedReuse::isEquivalentOperation(Instruction *I1, Instruction *I2) { if (!I1->isSameOperationAs(I2)) @@ -360,13 +431,19 @@ bool HexagonVectorLoopCarriedReuse::isEquivalentOperation(Instruction *I1, bool HexagonVectorLoopCarriedReuse::canReplace(Instruction *I) { const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I); - if (II && - (II->getIntrinsicID() == Intrinsic::hexagon_V6_hi || - II->getIntrinsicID() == Intrinsic::hexagon_V6_lo)) { + if (!II) + return true; + + switch (II->getIntrinsicID()) { + case Intrinsic::hexagon_V6_hi: + case Intrinsic::hexagon_V6_lo: + case Intrinsic::hexagon_V6_hi_128B: + case Intrinsic::hexagon_V6_lo_128B: LLVM_DEBUG(dbgs() << "Not considering for reuse: " << *II << "\n"); return false; + default: + return true; } - return true; } void HexagonVectorLoopCarriedReuse::findValueToReuse() { for (auto *D : Dependences) { @@ -427,34 +504,85 @@ void HexagonVectorLoopCarriedReuse::findValueToReuse() { int NumOperands = I->getNumOperands(); - for (int OpNo = 0; OpNo < NumOperands; ++OpNo) { - Value *Op = I->getOperand(OpNo); - Value *BEOp = BEUser->getOperand(OpNo); - - Instruction *OpInst = dyn_cast<Instruction>(Op); - if (!OpInst) { - if (Op == BEOp) - continue; - // Do not allow reuse to occur when the operands may be different - // values. - BEUser = nullptr; - break; + // Take operands of each PNUser one by one and try to find DepChain + // with every operand of the BEUser. If any of the operands of BEUser + // has DepChain with current operand of the PNUser, break the matcher + // loop. Keep doing this for Every PNUser operand. 
If PNUser operand + // does not have DepChain with any of the BEUser operand, break the + // outer matcher loop, mark the BEUser as null and reset the ReuseCandidate. + // This ensures that DepChain exist for all the PNUser operand with + // BEUser operand. This also ensures that DepChains are independent of + // the positions in PNUser and BEUser. + std::map<Instruction *, DepChain *> DepChains; + CallInst *C1 = dyn_cast<CallInst>(I); + if ((I && I->isCommutative()) || (C1 && isCallInstCommutative(C1))) { + bool Found = false; + for (int OpNo = 0; OpNo < NumOperands; ++OpNo) { + Value *Op = I->getOperand(OpNo); + Instruction *OpInst = dyn_cast<Instruction>(Op); + Found = false; + for (int T = 0; T < NumOperands; ++T) { + Value *BEOp = BEUser->getOperand(T); + Instruction *BEOpInst = dyn_cast<Instruction>(BEOp); + if (!OpInst && !BEOpInst) { + if (Op == BEOp) { + Found = true; + break; + } + } + + if ((OpInst && !BEOpInst) || (!OpInst && BEOpInst)) + continue; + + DepChain *D = getDepChainBtwn(OpInst, BEOpInst, Iters); + + if (D) { + Found = true; + DepChains[OpInst] = D; + break; + } + } + if (!Found) { + BEUser = nullptr; + break; + } } - - Instruction *BEOpInst = dyn_cast<Instruction>(BEOp); - - if (!isDepChainBtwn(OpInst, BEOpInst, Iters)) { - BEUser = nullptr; - break; + } else { + + for (int OpNo = 0; OpNo < NumOperands; ++OpNo) { + Value *Op = I->getOperand(OpNo); + Value *BEOp = BEUser->getOperand(OpNo); + + Instruction *OpInst = dyn_cast<Instruction>(Op); + if (!OpInst) { + if (Op == BEOp) + continue; + // Do not allow reuse to occur when the operands may be different + // values. 
+ BEUser = nullptr; + break; + } + + Instruction *BEOpInst = dyn_cast<Instruction>(BEOp); + DepChain *D = getDepChainBtwn(OpInst, BEOpInst, Iters); + + if (D) { + DepChains[OpInst] = D; + } else { + BEUser = nullptr; + break; + } } } if (BEUser) { LLVM_DEBUG(dbgs() << "Found Value for reuse.\n"); ReuseCandidate.Inst2Replace = I; ReuseCandidate.BackedgeInst = BEUser; + ReuseCandidate.DepChains = DepChains; + ReuseCandidate.Iterations = Iters; return; - } else - ReuseCandidate.reset(); + } + ReuseCandidate.reset(); } } } @@ -474,27 +602,10 @@ void HexagonVectorLoopCarriedReuse::reuseValue() { Instruction *Inst2Replace = ReuseCandidate.Inst2Replace; Instruction *BEInst = ReuseCandidate.BackedgeInst; int NumOperands = Inst2Replace->getNumOperands(); - std::map<Instruction *, DepChain *> DepChains; - int Iterations = -1; + std::map<Instruction *, DepChain *> &DepChains = ReuseCandidate.DepChains; + int Iterations = ReuseCandidate.Iterations; BasicBlock *LoopPH = CurLoop->getLoopPreheader(); - - for (int i = 0; i < NumOperands; ++i) { - Instruction *I = dyn_cast<Instruction>(Inst2Replace->getOperand(i)); - if(!I) - continue; - else { - Instruction *J = cast<Instruction>(BEInst->getOperand(i)); - DepChain *D = getDepChainBtwn(I, J); - - assert(D && - "No DepChain between corresponding operands in ReuseCandidate\n"); - if (Iterations == -1) - Iterations = D->iterations(); - assert(Iterations == D->iterations() && "Iterations mismatch"); - DepChains[I] = D; - } - } - + assert(!DepChains.empty() && "No DepChains"); LLVM_DEBUG(dbgs() << "reuseValue is making the following changes\n"); SmallVector<Instruction *, 4> InstsInPreheader; @@ -603,20 +714,11 @@ void HexagonVectorLoopCarriedReuse::findDepChainFromPHI(Instruction *I, } } -bool HexagonVectorLoopCarriedReuse::isDepChainBtwn(Instruction *I1, - Instruction *I2, - int Iters) { - for (auto *D : Dependences) { - if (D->front() == I1 && D->back() == I2 && D->iterations() == Iters) - return true; - } - return false; -} - DepChain *HexagonVectorLoopCarriedReuse::getDepChainBtwn(Instruction *I1, - Instruction *I2) { + Instruction *I2, + 
int Iters) { for (auto *D : Dependences) { - if (D->front() == I1 && D->back() == I2) + if (D->front() == I1 && D->back() == I2 && D->iterations() == Iters) return D; } return nullptr; diff --git a/test/CodeGen/Hexagon/hexagon_vector_loop_carried_reuse_commutative.ll b/test/CodeGen/Hexagon/hexagon_vector_loop_carried_reuse_commutative.ll new file mode 100644 index 00000000000..0ba9bb74f7b --- /dev/null +++ b/test/CodeGen/Hexagon/hexagon_vector_loop_carried_reuse_commutative.ll @@ -0,0 +1,82 @@ +; RUN: opt -march=hexagon < %s -hexagon-vlcr -adce -S | FileCheck %s + +; CHECK: %v32.hexagon.vlcr = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B + +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon" + +@g0 = external local_unnamed_addr global i32, align 4 + +; Function Attrs: nounwind +define void @f0(i8* noalias nocapture readonly %a0, i8* noalias nocapture %a1, i32 %a2) local_unnamed_addr #0 { +b0: + %v0 = getelementptr inbounds i8, i8* %a0, i32 %a2 + %v1 = mul nsw i32 %a2, 2 + %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1 + %v3 = load i32, i32* @g0, align 4, !tbaa !0 + %v4 = icmp sgt i32 %v3, 0 + br i1 %v4, label %b1, label %b4 + +b1: ; preds = %b0 + %v5 = bitcast i8* %v2 to <32 x i32>* + %v6 = load <32 x i32>, <32 x i32>* %v5, align 128, !tbaa !4 + %v7 = getelementptr inbounds i8, i8* %v2, i32 128 + %v8 = bitcast i8* %v7 to <32 x i32>* + %v9 = bitcast i8* %v0 to <32 x i32>* + %v10 = load <32 x i32>, <32 x i32>* %v9, align 128, !tbaa !4 + %v11 = getelementptr inbounds i8, i8* %v0, i32 128 + %v12 = bitcast i8* %v11 to <32 x i32>* + %v13 = bitcast i8* %a0 to <32 x i32>* + %v14 = load <32 x i32>, <32 x i32>* %v13, align 128, !tbaa !4 + %v15 = getelementptr inbounds i8, i8* %a0, i32 128 + %v16 = bitcast i8* %v15 to <32 x i32>* + %v17 = bitcast i8* %a1 to <32 x i32>* + br label %b2 + +b2: ; preds = %b2, %b1 + %v18 = phi 
<32 x i32>* [ %v17, %b1 ], [ %v37, %b2 ] + %v19 = phi <32 x i32>* [ %v8, %b1 ], [ %v30, %b2 ] + %v20 = phi <32 x i32>* [ %v12, %b1 ], [ %v28, %b2 ] + %v21 = phi <32 x i32>* [ %v16, %b1 ], [ %v26, %b2 ] + %v22 = phi i32 [ 0, %b1 ], [ %v38, %b2 ] + %v23 = phi <32 x i32> [ %v14, %b1 ], [ %v27, %b2 ] + %v24 = phi <32 x i32> [ %v10, %b1 ], [ %v29, %b2 ] + %v25 = phi <32 x i32> [ %v6, %b1 ], [ %v31, %b2 ] + %v26 = getelementptr inbounds <32 x i32>, <32 x i32>* %v21, i32 1 + %v27 = load <32 x i32>, <32 x i32>* %v21, align 128, !tbaa !4 + %v28 = getelementptr inbounds <32 x i32>, <32 x i32>* %v20, i32 1 + %v29 = load <32 x i32>, <32 x i32>* %v20, align 128, !tbaa !4 + %v30 = getelementptr inbounds <32 x i32>, <32 x i32>* %v19, i32 1 + %v31 = load <32 x i32>, <32 x i32>* %v19, align 128, !tbaa !4 + %v32 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v23, <32 x i32> %v24) + %v33 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v32, <32 x i32> %v25) + %v34 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v29, <32 x i32> %v27) + %v35 = tail call <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32> %v34, <32 x i32> %v31) + %v36 = tail call <32 x i32> @llvm.hexagon.V6.valignbi.128B(<32 x i32> %v35, <32 x i32> %v33, i32 1) + %v37 = getelementptr inbounds <32 x i32>, <32 x i32>* %v18, i32 1 + store <32 x i32> %v36, <32 x i32>* %v18, align 128, !tbaa !4 + %v38 = add nuw nsw i32 %v22, 128 + %v39 = icmp slt i32 %v38, %v3 + br i1 %v39, label %b2, label %b3 + +b3: ; preds = %b2 + br label %b4 + +b4: ; preds = %b3, %b0 + ret void +} + +; Function Attrs: nounwind readnone +declare <32 x i32> @llvm.hexagon.V6.vmaxub.128B(<32 x i32>, <32 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <32 x i32> @llvm.hexagon.V6.valignbi.128B(<32 x i32>, <32 x i32>, i32) #1 + +attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,+hvx-length128b,-long-calls" } +attributes #1 = { nounwind readnone } + +!0 = !{!1, !1, i64 0} +!1 = 
!{!"int", !2, i64 0} +!2 = !{!"omnipotent char", !3, i64 0} +!3 = !{!"Simple C/C++ TBAA"} +!4 = !{!2, !2, i64 0}