From: Jonas Paulsson Date: Tue, 31 Jul 2018 13:00:42 +0000 (+0000) Subject: [SystemZ] Improve decoding in case of instructions with four register operands. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=6108c027cf87799474ef0e83c1706ddc32b10cf5;p=llvm [SystemZ] Improve decoding in case of instructions with four register operands. Since z13, the max group size will be 2 if any μop has more than 3 register sources. This has been ignored sofar in the SystemZHazardRecognizer, but is now handled by recognizing those instructions and adjusting the tracking of decoding and the cost heuristic for grouping. Review: Ulrich Weigand https://reviews.llvm.org/D49847 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@338368 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/SystemZ/SystemZHazardRecognizer.cpp b/lib/Target/SystemZ/SystemZHazardRecognizer.cpp index d01dd9eaaaf..c7dd3581eae 100644 --- a/lib/Target/SystemZ/SystemZHazardRecognizer.cpp +++ b/lib/Target/SystemZ/SystemZHazardRecognizer.cpp @@ -81,6 +81,7 @@ getHazardType(SUnit *m, int Stalls) { void SystemZHazardRecognizer::Reset() { CurrGroupSize = 0; + CurrGroupHas4RegOps = false; clearProcResCounters(); GrpCount = 0; LastFPdOpCycleIdx = UINT_MAX; @@ -99,6 +100,12 @@ SystemZHazardRecognizer::fitsIntoCurrentGroup(SUnit *SU) const { if (SC->BeginGroup) return (CurrGroupSize == 0); + // An instruction with 4 register operands will not fit in last slot. + assert ((CurrGroupSize < 2 || !CurrGroupHas4RegOps) || + "Current decoder group is already full!"); + if (CurrGroupSize == 2 && has4RegOps(SU->getInstr())) + return false; + // Since a full group is handled immediately in EmitInstruction(), // SU should fit into current group. NumSlots should be 1 or 0, // since it is not a cracked or expanded instruction. @@ -108,6 +115,23 @@ SystemZHazardRecognizer::fitsIntoCurrentGroup(SUnit *SU) const { return true; } +bool SystemZHazardRecognizer::has4RegOps(const MachineInstr *MI) const { + const MachineFunction &MF = *MI->getParent()->getParent(); + const TargetRegisterInfo *TRI = &TII->getRegisterInfo(); + const MCInstrDesc &MID = MI->getDesc(); + unsigned Count = 0; + for (unsigned OpIdx = 0; OpIdx < MID.getNumOperands(); OpIdx++) { + const TargetRegisterClass *RC = TII->getRegClass(MID, OpIdx, TRI, MF); + if (RC == nullptr) + continue; + if (OpIdx >= MID.getNumDefs() && + MID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1) + continue; + Count++; + } + return Count >= 4; +} + void SystemZHazardRecognizer::nextGroup() { if (CurrGroupSize == 0) return; @@ -119,6 +143,7 @@ void SystemZHazardRecognizer::nextGroup() { // Reset counter for next group. CurrGroupSize = 0; + CurrGroupHas4RegOps = false; // Decrease counters for execution units by one. for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i) @@ -172,6 +197,8 @@ void SystemZHazardRecognizer::dumpSU(SUnit *SU, raw_ostream &OS) const { OS << "/EndsGroup"; if (SU->isUnbuffered) OS << "/Unbuffered"; + if (has4RegOps(SU->getInstr())) + OS << "/4RegOps"; } void SystemZHazardRecognizer::dumpCurrGroup(std::string Msg) const { @@ -184,6 +211,7 @@ void SystemZHazardRecognizer::dumpCurrGroup(std::string Msg) const { dbgs() << "{ " << CurGroupDbg << " }"; dbgs() << " (" << CurrGroupSize << " decoder slot" << (CurrGroupSize > 1 ? "s":"") + << (CurrGroupHas4RegOps ? ", 4RegOps" : "") << ")\n"; } } @@ -294,11 +322,14 @@ EmitInstruction(SUnit *SU) { // Insert SU into current group by increasing number of slots used // in current group. CurrGroupSize += getNumDecoderSlots(SU); - assert (CurrGroupSize <= 3); + CurrGroupHas4RegOps |= has4RegOps(SU->getInstr()); + unsigned GroupLim = + ((CurrGroupHas4RegOps && getNumDecoderSlots(SU) < 3) ? 2 : 3); + assert (CurrGroupSize <= GroupLim && "SU does not fit into decoder group!"); // Check if current group is now full/ended. If so, move on to next // group to be ready to evaluate more candidates. - if (CurrGroupSize == 3 || SC->EndGroup) + if (CurrGroupSize == GroupLim || SC->EndGroup) nextGroup(); } @@ -325,6 +356,10 @@ int SystemZHazardRecognizer::groupingCost(SUnit *SU) const { return -1; } + // An instruction with 4 register operands will not fit in last slot. + if (CurrGroupSize == 2 && has4RegOps(SU->getInstr())) + return 1; + // Most instructions can be placed in any decoder slot. return 0; } diff --git a/lib/Target/SystemZ/SystemZHazardRecognizer.h b/lib/Target/SystemZ/SystemZHazardRecognizer.h index ad06be978ad..6292feefbfe 100644 --- a/lib/Target/SystemZ/SystemZHazardRecognizer.h +++ b/lib/Target/SystemZ/SystemZHazardRecognizer.h @@ -45,15 +45,17 @@ namespace llvm { /// SystemZHazardRecognizer maintains the state for one MBB during scheduling. class SystemZHazardRecognizer : public ScheduleHazardRecognizer { -#ifndef NDEBUG const SystemZInstrInfo *TII; -#endif const TargetSchedModel *SchedModel; /// Keep track of the number of decoder slots used in the current /// decoder group. unsigned CurrGroupSize; + /// True if an instruction with four reg operands have been scheduled into + /// the current decoder group. + bool CurrGroupHas4RegOps; + /// The tracking of resources here are quite similar to the common /// code use of a critical resource. However, z13 differs in the way /// that it has two processor sides which may be interesting to @@ -73,6 +75,9 @@ class SystemZHazardRecognizer : public ScheduleHazardRecognizer { /// Return true if MI fits into current decoder group. bool fitsIntoCurrentGroup(SUnit *SU) const; + /// Return true if this instruction has four register operands. + bool has4RegOps(const MachineInstr *MI) const; + /// Two decoder groups per cycle are formed (for z13), meaning 2x3 /// instructions. This function returns a number between 0 and 5, /// representing the current decoder slot of the current cycle. If an SU @@ -105,11 +110,7 @@ class SystemZHazardRecognizer : public ScheduleHazardRecognizer { public: SystemZHazardRecognizer(const SystemZInstrInfo *tii, const TargetSchedModel *SM) - : -#ifndef NDEBUG - TII(tii), -#endif - SchedModel(SM) { + : TII(tii), SchedModel(SM) { Reset(); } diff --git a/lib/Target/SystemZ/SystemZMachineScheduler.cpp b/lib/Target/SystemZ/SystemZMachineScheduler.cpp index 11e60158524..98e761ef87f 100644 --- a/lib/Target/SystemZ/SystemZMachineScheduler.cpp +++ b/lib/Target/SystemZ/SystemZMachineScheduler.cpp @@ -169,8 +169,7 @@ SUnit *SystemZPostRASchedStrategy::pickNode(bool &IsTopNode) { return *Available.begin(); } - // All nodes that are possible to schedule are stored by in the - // Available set. + // All nodes that are possible to schedule are stored in the Available set. LLVM_DEBUG(dbgs() << "** Available: "; Available.dump(*HazardRec);); Candidate Best; diff --git a/test/CodeGen/SystemZ/vec-cmp-cmp-logic-select.ll b/test/CodeGen/SystemZ/vec-cmp-cmp-logic-select.ll index 8714a5a5703..9ed7ca21797 100644 --- a/test/CodeGen/SystemZ/vec-cmp-cmp-logic-select.ll +++ b/test/CodeGen/SystemZ/vec-cmp-cmp-logic-select.ll @@ -688,8 +688,8 @@ define <8 x float> @fun30(<8 x float> %val1, <8 x float> %val2, <8 x double> %va ; CHECK-NEXT: vpkg %v6, %v6, %v7 ; CHECK-NEXT: vpkg %v4, %v4, %v5 ; CHECK-NEXT: vn %v5, %v16, %v6 -; CHECK-NEXT: vsel %v24, %v3, %v2, %v5 -; CHECK-NEXT: vldeb %v17, %v17 +; CHECK-DAG: vsel %v24, %v3, %v2, %v5 +; CHECK-DAG: vldeb %v17, %v17 ; CHECK-NEXT: vldeb %v18, %v18 ; CHECK-NEXT: vfchdb %v17, %v18, %v17 ; CHECK-NEXT: vmrhf %v18, %v30, %v30