--- /dev/null
+//=- MachineLoopUtils.h - Helper functions for manipulating loops -*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_MACHINELOOPUTILS_H
+#define LLVM_LIB_CODEGEN_MACHINELOOPUTILS_H
+
+namespace llvm {
+class MachineBasicBlock;
+class MachineRegisterInfo;
+class TargetInstrInfo;
+
+enum LoopPeelDirection {
+ LPD_Front, ///< Peel the first iteration of the loop.
+ LPD_Back ///< Peel the last iteration of the loop.
+};
+
+/// Peels one iteration off a loop that consists of a single basic block and
+/// returns the newly created block. \p Loop must have exactly two successors
+/// and exactly two predecessors, one of each being \p Loop itself.
+///
+/// The loop block is cloned and the clone is placed in the CFG directly before
+/// the loop (LPD_Front) or directly after it (LPD_Back), so that the two copies
+/// execute one after the other.
+///
+/// PHIs in both copies are updated, and the clone ends in an unconditional
+/// branch so that it executes exactly one iteration.
+///
+/// The trip count of \p Loop is NOT adjusted; callers must do that themselves.
+MachineBasicBlock *PeelSingleBlockLoop(LoopPeelDirection Direction,
+ MachineBasicBlock *Loop,
+ MachineRegisterInfo &MRI,
+ const TargetInstrInfo *TII);
+
+} // namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_MACHINELOOPUTILS_H
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineLoopUtils.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include <deque>
#include <vector>
namespace llvm {
/// Return the rescheduled instructions in order.
ArrayRef<MachineInstr *> getInstructions() { return ScheduledInstrs; }
- void dump() {
- print(dbgs());
- }
+ void dump() { print(dbgs()); }
void print(raw_ostream &OS);
};
/// A reimplementation of ModuloScheduleExpander. It works by generating a
/// standalone kernel loop and peeling out the prologs and epilogs.
-///
-/// FIXME: This implementation cannot yet generate valid code. It can generate
-/// a correct kernel but cannot peel out prologs and epilogs.
class PeelingModuloScheduleExpander {
ModuloSchedule &Schedule;
MachineFunction &MF;
const TargetInstrInfo *TII;
LiveIntervals *LIS;
+ /// The original loop block that gets rewritten in-place.
MachineBasicBlock *BB;
+ /// The original loop preheader.
MachineBasicBlock *Preheader;
+ /// All prolog and epilog blocks.
+ SmallVector<MachineBasicBlock *, 4> Prologs, Epilogs;
+ /// For every block, the stages that are produced.
+ DenseMap<MachineBasicBlock *, BitVector> LiveStages;
+ /// For every block, the stages that are available. A stage can be available
+ /// but not produced (in the epilog) or produced but not available (in the
+ /// prolog).
+ DenseMap<MachineBasicBlock *, BitVector> AvailableStages;
+
+ /// CanonicalMIs and BlockMIs form a bidirectional map between any of the
+ /// loop kernel clones.
+ DenseMap<MachineInstr *, MachineInstr *> CanonicalMIs;
+ DenseMap<std::pair<MachineBasicBlock *, MachineInstr *>, MachineInstr *>
+ BlockMIs;
+
+ /// State passed from peelKernel to peelPrologAndEpilogs().
+ std::deque<MachineBasicBlock *> PeeledFront, PeeledBack;
+
public:
PeelingModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
LiveIntervals *LIS)
: Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
TII(ST.getInstrInfo()), LIS(LIS) {}
+ void expand();
+
/// Runs ModuloScheduleExpander and treats it as a golden input to validate
/// aspects of the code generated by PeelingModuloScheduleExpander.
void validateAgainstModuloScheduleExpander();
+
+protected:
+ /// Converts BB from the original loop body to the rewritten, pipelined
+ /// steady-state.
+ void rewriteKernel();
+
+private:
+ /// Peels one iteration of the rewritten kernel (BB) in the specified
+ /// direction.
+ MachineBasicBlock *peelKernel(LoopPeelDirection LPD);
+ /// Peel the kernel forwards and backwards to produce prologs and epilogs,
+ /// and stitch them together.
+ void peelPrologAndEpilogs();
+ /// All prolog and epilog blocks are clones of the kernel, so any produced
+ /// register in one block has a counterpart in all other blocks.
+ Register getEquivalentRegisterIn(Register Reg, MachineBasicBlock *BB);
+ /// Change all users of MI, if MI is predicated out
+ /// (LiveStages[MI->getParent()] == false).
+ void rewriteUsesOf(MachineInstr *MI);
+ /// Insert branches between prologs, kernel and epilogs.
+ void fixupBranches();
+ /// Create a poor-man's LCSSA by cloning only the PHIs from the kernel block
+ /// to a block dominated by all prologs and epilogs. This allows us to treat
+ /// the loop exiting block as any other kernel clone.
+ MachineBasicBlock *CreateLCSSAExitingBlock();
+ /// Helper to get the stage of an instruction in the schedule. If \p MI has an
+ /// entry in CanonicalMIs it is first replaced by the instruction it maps to;
+ /// otherwise it is looked up in the schedule as-is.
+ /// NOTE(review): callers store the result in an int and compare it with -1,
+ /// so Schedule.getStage() presumably returns a signed "not scheduled"
+ /// sentinel that round-trips through this unsigned return type -- confirm
+ /// before changing either side.
+ unsigned getStage(MachineInstr *MI) {
+   // A single find() avoids hashing MI twice (count() + operator[]).
+   auto Canonical = CanonicalMIs.find(MI);
+   if (Canonical != CanonicalMIs.end())
+     MI = Canonical->second;
+   return Schedule.getStage(MI);
+ }
};
/// Expander that simply annotates each scheduled instruction with a post-instr
MachineInstr.cpp
MachineLICM.cpp
MachineLoopInfo.cpp
+ MachineLoopUtils.cpp
MachineModuleInfo.cpp
MachineModuleInfoImpls.cpp
MachineOperand.cpp
--- /dev/null
+//=- MachineLoopUtils.cpp - Functions for manipulating loops ----------------=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineLoopUtils.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+using namespace llvm;
+
+namespace {
+// BB is an instruction-for-instruction clone of MI's parent block (or the other
+// way round). Return the instruction that sits at MI's position inside BB.
+MachineInstr &findEquivalentInstruction(MachineInstr &MI,
+                                        MachineBasicBlock *BB) {
+  MachineBasicBlock *Parent = MI.getParent();
+  // Position of MI counted from the start of its own block...
+  auto MIPos = MachineBasicBlock::instr_iterator(MI);
+  unsigned Index = std::distance(Parent->instr_begin(), MIPos);
+  // ...selects the instruction at the same position in the clone.
+  return *std::next(BB->instr_begin(), Index);
+}
+} // namespace
+
+/// Peels one iteration off the single-block loop \p Loop and returns the block
+/// that holds the peeled iteration.
+///
+/// \param Direction LPD_Front inserts the clone before \p Loop (it becomes the
+///                  first iteration); LPD_Back inserts it after \p Loop (it
+///                  becomes the last iteration).
+/// \param Loop      The loop block. It must have exactly two predecessors and
+///                  two successors, one of each being \p Loop itself.
+/// \param MRI       Used to create fresh virtual registers for every virtual
+///                  register defined in the clone.
+/// \param TII       Used to analyze, remove and insert branches.
+/// \returns the newly created block.
+MachineBasicBlock *llvm::PeelSingleBlockLoop(LoopPeelDirection Direction,
+                                             MachineBasicBlock *Loop,
+                                             MachineRegisterInfo &MRI,
+                                             const TargetInstrInfo *TII) {
+  // Picking "the other" predecessor/successor below is only meaningful for the
+  // documented loop shape, so check it instead of silently misbehaving.
+  assert(Loop->pred_size() == 2 && "Loop must have exactly two predecessors!");
+  assert(Loop->succ_size() == 2 && "Loop must have exactly two successors!");
+
+  MachineFunction &MF = *Loop->getParent();
+  MachineBasicBlock *Preheader = *Loop->pred_begin();
+  if (Preheader == Loop)
+    Preheader = *std::next(Loop->pred_begin());
+  MachineBasicBlock *Exit = *Loop->succ_begin();
+  if (Exit == Loop)
+    Exit = *std::next(Loop->succ_begin());
+
+  MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(Loop->getBasicBlock());
+  if (Direction == LPD_Front)
+    MF.insert(Loop->getIterator(), NewBB);
+  else
+    MF.insert(std::next(Loop->getIterator()), NewBB);
+
+  // Clone every instruction of Loop into NewBB, giving each virtual register
+  // defined by a clone a fresh name. Remaps goes from original to new register.
+  // FIXME: Add DenseMapInfo trait for Register so we can use it as a key.
+  DenseMap<unsigned, Register> Remaps;
+  auto InsertPt = NewBB->end();
+  for (MachineInstr &MI : *Loop) {
+    MachineInstr *NewMI = MF.CloneMachineInstr(&MI);
+    NewBB->insert(InsertPt, NewMI);
+    for (MachineOperand &MO : NewMI->defs()) {
+      Register OrigR = MO.getReg();
+      if (OrigR.isPhysical())
+        continue;
+      Register &R = Remaps[OrigR];
+      R = MRI.createVirtualRegister(MRI.getRegClass(OrigR));
+      MO.setReg(R);
+
+      if (Direction == LPD_Back) {
+        // Replace all uses outside the original loop with the new register.
+        // The operands are collected first so that setReg() cannot disturb
+        // the walk over OrigR's use list.
+        SmallVector<MachineOperand *, 4> Uses;
+        for (auto &Use : MRI.use_operands(OrigR))
+          if (Use.getParent()->getParent() != Loop)
+            Uses.push_back(&Use);
+        for (auto *Use : Uses) {
+          MRI.constrainRegClass(R, MRI.getRegClass(Use->getReg()));
+          Use->setReg(R);
+        }
+      }
+    }
+  }
+
+  // Rewrite the uses in the non-PHI part of the clone to the renamed
+  // registers. PHI operands are handled separately below.
+  for (auto I = NewBB->getFirstNonPHI(); I != NewBB->end(); ++I)
+    for (MachineOperand &MO : I->uses()) {
+      if (!MO.isReg())
+        continue;
+      auto Remapped = Remaps.find(MO.getReg());
+      if (Remapped != Remaps.end())
+        MO.setReg(Remapped->second);
+    }
+
+  // Fix up the PHIs. A loop PHI has the operands (def, v0, bb0, v1, bb1); work
+  // out which value/block pair comes from the preheader and which from Loop.
+  for (auto I = NewBB->begin(); I != NewBB->end() && I->isPHI(); ++I) {
+    MachineInstr &MI = *I;
+    unsigned LoopRegIdx = 3, InitRegIdx = 1;
+    if (MI.getOperand(2).getMBB() != Preheader)
+      std::swap(LoopRegIdx, InitRegIdx);
+    MachineInstr &OrigPhi = findEquivalentInstruction(MI, Loop);
+    assert(OrigPhi.isPHI());
+    if (Direction == LPD_Front) {
+      // When peeling front, we are only left with the initial value from the
+      // preheader. The original loop now starts from the value the peeled
+      // iteration computed.
+      Register R = MI.getOperand(LoopRegIdx).getReg();
+      auto Remapped = Remaps.find(R);
+      if (Remapped != Remaps.end())
+        R = Remapped->second;
+      OrigPhi.getOperand(InitRegIdx).setReg(R);
+      MI.RemoveOperand(LoopRegIdx + 1);
+      MI.RemoveOperand(LoopRegIdx + 0);
+    } else {
+      // When peeling back, the initial value is the loop-carried value from
+      // the original loop.
+      Register LoopReg = OrigPhi.getOperand(LoopRegIdx).getReg();
+      MI.getOperand(LoopRegIdx).setReg(LoopReg);
+      MI.RemoveOperand(InitRegIdx + 1);
+      MI.RemoveOperand(InitRegIdx + 0);
+    }
+  }
+
+  // Finally splice NewBB into the CFG and fix up the branch instructions.
+  DebugLoc DL;
+  if (Direction == LPD_Front) {
+    // Preheader -> NewBB -> Loop.
+    Preheader->replaceSuccessor(Loop, NewBB);
+    NewBB->addSuccessor(Loop);
+    Loop->replacePhiUsesWith(Preheader, NewBB);
+    if (TII->removeBranch(*Preheader) > 0)
+      TII->insertBranch(*Preheader, NewBB, nullptr, {}, DL);
+    TII->removeBranch(*NewBB);
+    TII->insertBranch(*NewBB, Loop, nullptr, {}, DL);
+  } else {
+    // Loop -> NewBB -> Exit.
+    Loop->replaceSuccessor(Exit, NewBB);
+    Exit->replacePhiUsesWith(Loop, NewBB);
+    NewBB->addSuccessor(Exit);
+
+    MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+    SmallVector<MachineOperand, 4> Cond;
+    bool CanAnalyzeBr = !TII->analyzeBranch(*Loop, TBB, FBB, Cond);
+    (void)CanAnalyzeBr;
+    assert(CanAnalyzeBr && "Must be able to analyze the loop branch!");
+    // Re-emit the loop branch with the exit edge retargeted to NewBB.
+    TII->removeBranch(*Loop);
+    TII->insertBranch(*Loop, TBB == Exit ? NewBB : TBB,
+                      FBB == Exit ? NewBB : FBB, Cond, DL);
+    if (TII->removeBranch(*NewBB) > 0)
+      TII->insertBranch(*NewBB, Exit, nullptr, {}, DL);
+  }
+
+  return NewBB;
+}
// The experimental code generator can't work if there are InstChanges.
if (ExperimentalCodeGen && NewInstrChanges.empty()) {
PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
- // Experimental code generation isn't complete yet, but it can partially
- // validate the code it generates against the original
- // ModuloScheduleExpander.
- MSE.validateAgainstModuloScheduleExpander();
+ MSE.expand();
} else {
ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
MSE.expand();
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopUtils.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/MC/MCContext.h"
};
} // namespace
+/// Peels one iteration off the kernel (BB) in direction \p LPD, records the new
+/// block in PeeledFront / PeeledBack, and registers every non-terminator
+/// instruction pair in CanonicalMIs and BlockMIs. Returns the new block.
+MachineBasicBlock *
+PeelingModuloScheduleExpander::peelKernel(LoopPeelDirection LPD) {
+  MachineBasicBlock *Peeled = PeelSingleBlockLoop(LPD, BB, MRI, TII);
+  if (LPD != LPD_Front)
+    PeeledBack.push_front(Peeled);
+  else
+    PeeledFront.push_back(Peeled);
+
+  // Walk the kernel and its clone in lock-step up to the kernel's first
+  // terminator; the kernel instruction is the canonical one of each pair.
+  auto KernelIt = BB->begin();
+  auto PeeledIt = Peeled->begin();
+  while (!KernelIt->isTerminator()) {
+    MachineInstr *KernelMI = &*KernelIt;
+    MachineInstr *PeeledMI = &*PeeledIt;
+    CanonicalMIs[KernelMI] = KernelMI;
+    CanonicalMIs[PeeledMI] = KernelMI;
+    BlockMIs[{Peeled, KernelMI}] = PeeledMI;
+    BlockMIs[{BB, KernelMI}] = KernelMI;
+    ++KernelIt;
+    ++PeeledIt;
+  }
+  return Peeled;
+}
+
+
+/// Peels the kernel (BB) forwards to create the prologs and backwards to create
+/// the epilogs, records which stages are live/available in each new block,
+/// adds the prolog -> epilog edges, rewrites the uses of every instruction
+/// whose stage is not live in its block, and finally removes dead PHIs.
+void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {
+  const auto NumStages = Schedule.getNumStages();
+  BitVector LS(NumStages, true);
+  BitVector AS(NumStages, true);
+  LiveStages[BB] = LS;
+  AvailableStages[BB] = AS;
+
+  // Peel out the prologs. Each one enables one more stage than the previous.
+  LS.reset();
+  for (int I = 0; I < NumStages - 1; ++I) {
+    LS[I] = 1;
+    Prologs.push_back(peelKernel(LPD_Front));
+    LiveStages[Prologs.back()] = LS;
+    AvailableStages[Prologs.back()] = LS;
+  }
+
+  // Create a block that will end up as the new loop exiting block (dominated by
+  // all prologs and epilogs). It will only contain PHIs, in the same order as
+  // BB's PHIs. This gives us a poor-man's LCSSA with the inductive property
+  // that the exiting block is a (sub) clone of BB. This in turn gives us the
+  // property that any value deffed in BB but used outside of BB is used by a
+  // PHI in the exiting block.
+  MachineBasicBlock *ExitingBB = CreateLCSSAExitingBlock();
+
+  // Push out the epilogs, again in reverse order.
+  // We can't assume anything about the minimum loop trip count at this point,
+  // so emit a fairly complex epilog:
+  //  K[0, 1, 2]     // Kernel runs stages 0, 1, 2
+  //  E0[2] <- P1    // Epilog runs stage 2 only, so the state after is [0].
+  //  E1[1, 2] <- P0 // Epilog 1 moves the last item from stage 0 to stage 2.
+  //
+  // This creates a single-successor single-predecessor sequence of blocks for
+  // each epilog, which are kept this way for simplicity at this stage and
+  // cleaned up by the optimizer later.
+  for (int I = 1; I <= NumStages - 1; ++I) {
+    Epilogs.push_back(nullptr);
+    for (int J = NumStages - 1; J >= I; --J) {
+      LS.reset();
+      LS[J] = 1;
+      // Only the last block peeled for this epilog is kept in Epilogs.
+      Epilogs.back() = peelKernel(LPD_Back);
+      LiveStages[Epilogs.back()] = LS;
+      AvailableStages[Epilogs.back()] = AS;
+    }
+  }
+
+  // Now we've defined all the prolog and epilog blocks as a fallthrough
+  // sequence, add the edges that will be followed if the loop trip count is
+  // lower than the number of stages (connecting prologs directly with epilogs).
+  assert(Prologs.size() == Epilogs.size());
+  auto PI = Prologs.begin();
+  auto EI = Epilogs.begin();
+  for (; PI != Prologs.end(); ++PI, ++EI) {
+    MachineBasicBlock *Pred = *(*EI)->pred_begin();
+    (*PI)->addSuccessor(*EI);
+    for (MachineInstr &MI : (*EI)->phis()) {
+      // Give each epilog PHI an incoming value for the new edge. If the
+      // existing incoming value is defined in the epilog's current
+      // predecessor, use that value's equivalent from the prolog instead.
+      Register Reg = MI.getOperand(1).getReg();
+      MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
+      if (Def && Def->getParent() == Pred)
+        Reg = getEquivalentRegisterIn(Reg, *PI);
+      MI.addOperand(MachineOperand::CreateReg(Reg, /*isDef=*/false));
+      MI.addOperand(MachineOperand::CreateMBB(*PI));
+    }
+  }
+
+  // Create a list of all blocks in order.
+  SmallVector<MachineBasicBlock *, 8> Blocks;
+  llvm::copy(PeeledFront, std::back_inserter(Blocks));
+  Blocks.push_back(BB);
+  llvm::copy(PeeledBack, std::back_inserter(Blocks));
+
+  // Iterate in reverse order over all instructions, remapping as we go.
+  for (MachineBasicBlock *B : reverse(Blocks)) {
+    for (auto I = B->getFirstInstrTerminator()->getReverseIterator();
+         I != std::next(B->getFirstNonPHI()->getReverseIterator());) {
+      // Advance before the call: rewriteUsesOf() may erase MI.
+      MachineInstr *MI = &*I++;
+      rewriteUsesOf(MI);
+    }
+  }
+  // Now all remapping has been done, we're free to optimize the generated code.
+  for (MachineBasicBlock *B : reverse(Blocks))
+    EliminateDeadPhis(B, MRI, LIS);
+  EliminateDeadPhis(ExitingBB, MRI, LIS);
+}
+
+
+/// Splits the kernel's exit edge with a new block that contains one PHI for
+/// each PHI of the kernel (BB), and returns that block. Uses outside BB of the
+/// value in operand 3 of each kernel PHI are redirected to the new PHI.
+/// NOTE(review): operand 3 is taken to be the loop-carried value without
+/// checking which incoming block it belongs to -- presumably the kernel
+/// rewriter guarantees this operand order; confirm.
+MachineBasicBlock *PeelingModuloScheduleExpander::CreateLCSSAExitingBlock() {
+  MachineFunction &Fn = *BB->getParent();
+
+  // One successor of BB is BB itself; the other one is the loop exit.
+  auto SuccIt = BB->succ_begin();
+  MachineBasicBlock *ExitBB = (*SuccIt == BB) ? *std::next(SuccIt) : *SuccIt;
+
+  MachineBasicBlock *LCSSABlock =
+      Fn.CreateMachineBasicBlock(BB->getBasicBlock());
+  Fn.insert(std::next(BB->getIterator()), LCSSABlock);
+
+  // Clone all phis in BB into the new block and rewrite.
+  for (MachineInstr &KernelPhi : BB->phis()) {
+    Register LoopValue = KernelPhi.getOperand(3).getReg();
+    Register Fresh = MRI.createVirtualRegister(
+        MRI.getRegClass(KernelPhi.getOperand(0).getReg()));
+
+    // Gather the users first, then rewrite them, so the use list is not
+    // modified while it is being walked.
+    SmallVector<MachineInstr *, 4> OutsideUsers;
+    for (MachineInstr &User : MRI.use_instructions(LoopValue)) {
+      if (User.getParent() == BB)
+        continue;
+      OutsideUsers.push_back(&User);
+    }
+    for (MachineInstr *User : OutsideUsers)
+      User->substituteRegister(LoopValue, Fresh, /*SubIdx=*/0,
+                               *MRI.getTargetRegisterInfo());
+
+    MachineInstr *ExitPhi =
+        BuildMI(LCSSABlock, DebugLoc(), TII->get(TargetOpcode::PHI), Fresh)
+            .addReg(LoopValue)
+            .addMBB(BB);
+    BlockMIs[{LCSSABlock, &KernelPhi}] = ExitPhi;
+    CanonicalMIs[ExitPhi] = &KernelPhi;
+  }
+
+  // BB -> LCSSABlock -> ExitBB.
+  BB->replaceSuccessor(ExitBB, LCSSABlock);
+  ExitBB->replacePhiUsesWith(BB, LCSSABlock);
+  LCSSABlock->addSuccessor(ExitBB);
+
+  MachineBasicBlock *TrueDest = nullptr, *FalseDest = nullptr;
+  SmallVector<MachineOperand, 4> BranchCond;
+  bool Analyzable = !TII->analyzeBranch(*BB, TrueDest, FalseDest, BranchCond);
+  (void)Analyzable;
+  assert(Analyzable && "Must be able to analyze the loop branch!");
+
+  // Re-emit the kernel branch with the exit edge pointing at the new block.
+  auto Retarget = [ExitBB, LCSSABlock](MachineBasicBlock *Dest) {
+    return Dest == ExitBB ? LCSSABlock : Dest;
+  };
+  TII->removeBranch(*BB);
+  TII->insertBranch(*BB, Retarget(TrueDest), Retarget(FalseDest), BranchCond,
+                    DebugLoc());
+  TII->insertUnconditionalBranch(*LCSSABlock, ExitBB, DebugLoc());
+  return LCSSABlock;
+}
+
+
+/// Returns the register in block \p BB that corresponds to \p Reg.
+///
+/// \p Reg must have a unique defining instruction that is registered in
+/// CanonicalMIs, and the canonical instruction must have a clone registered
+/// for \p BB in BlockMIs. The result is the register in the same operand slot
+/// of that clone.
+Register
+PeelingModuloScheduleExpander::getEquivalentRegisterIn(Register Reg,
+                                                       MachineBasicBlock *BB) {
+  MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
+  assert(Def && "Register must have a unique definition!");
+  // Kept signed so that a "not found" result can be detected.
+  int OpIdx = Def->findRegisterDefOperandIdx(Reg);
+  assert(OpIdx != -1 && "Defining instruction must define the register!");
+
+  // lookup() instead of operator[]: a missing entry must not be inserted into
+  // the maps as a null pointer before we fail.
+  MachineInstr *Canonical = CanonicalMIs.lookup(Def);
+  assert(Canonical && "Defining instruction has no canonical instruction!");
+  MachineInstr *Equivalent = BlockMIs.lookup({BB, Canonical});
+  assert(Equivalent && "No clone of the instruction is known for this block!");
+  return Equivalent->getOperand(OpIdx).getReg();
+}
+
+
+/// Removes \p MI if it is a PHI or if its stage is not live in its block,
+/// redirecting its users first. Instructions with no stage, in a block without
+/// a LiveStages entry, or whose stage is live are left untouched.
+/// NOTE(review): the erased instruction is not removed from CanonicalMIs or
+/// BlockMIs here -- confirm nothing dereferences those entries afterwards.
+void PeelingModuloScheduleExpander::rewriteUsesOf(MachineInstr *MI) {
+  MachineBasicBlock *Parent = MI->getParent();
+
+  // Shared tail of both removal paths below.
+  auto EraseMI = [this, MI] {
+    if (LIS)
+      LIS->RemoveMachineInstrFromMaps(*MI);
+    MI->eraseFromParent();
+  };
+
+  if (MI->isPHI()) {
+    // This is an illegal PHI. The loop-carried (desired) value is operand 3,
+    // and it is produced by this block. Fall back to operand 1 when the stage
+    // producing operand 3 is not available in this block.
+    const Register PhiDef = MI->getOperand(0).getReg();
+    Register Replacement = MI->getOperand(3).getReg();
+    const int ProducerStage = getStage(MRI.getUniqueVRegDef(Replacement));
+    const bool Unavailable =
+        ProducerStage != -1 && !AvailableStages[Parent].test(ProducerStage);
+    if (Unavailable)
+      Replacement = MI->getOperand(1).getReg();
+    MRI.setRegClass(Replacement, MRI.getRegClass(PhiDef));
+    MRI.replaceRegWith(PhiDef, Replacement);
+    EraseMI();
+    return;
+  }
+
+  const int Stage = getStage(MI);
+  if (Stage == -1)
+    return; // Not part of the schedule.
+  auto Live = LiveStages.find(Parent);
+  if (Live == LiveStages.end() || Live->second.test(Stage))
+    return; // Instruction is live, no rewriting to do.
+
+  for (MachineOperand &DefMO : MI->defs()) {
+    // Work out every substitution before applying any of them, so the use
+    // list is not modified while it is being walked.
+    SmallVector<std::pair<MachineInstr *, Register>, 4> Pending;
+    for (MachineInstr &User : MRI.use_instructions(DefMO.getReg())) {
+      // Only PHIs can use values from this block by construction.
+      // Match with the equivalent PHI in MI's block.
+      assert(User.isPHI());
+      Pending.emplace_back(
+          &User, getEquivalentRegisterIn(User.getOperand(0).getReg(), Parent));
+    }
+    for (const auto &Entry : Pending)
+      Entry.first->substituteRegister(DefMO.getReg(), Entry.second,
+                                      /*SubIdx=*/0,
+                                      *MRI.getTargetRegisterInfo());
+  }
+  EraseMI();
+}
+
+
+/// Inserts the branches between prologs, kernel and epilogs.
+///
+/// Walking outwards from the kernel, each prolog is paired with an epilog and
+/// the target is asked for a "trip count > TC" condition in that prolog:
+///  - unknown at compile time: branch conditionally to the epilog or fall
+///    through;
+///  - statically false: branch unconditionally to the epilog, dropping the
+///    fallthrough edge (the kernel is then considered disposed);
+///  - statically true: drop the edge to the epilog.
+/// Finally the loop info is told either to adjust the trip count and preheader
+/// or that the loop was disposed.
+void PeelingModuloScheduleExpander::fixupBranches() {
+  // Prologs.back() is read unconditionally when the kernel survives, and the
+  // loop below walks both lists in lock-step.
+  assert(!Prologs.empty() && "Expected at least one prolog!");
+  assert(Prologs.size() == Epilogs.size() &&
+         "Every prolog must be paired with an epilog!");
+
+  std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> Info =
+      TII->analyzeLoopForPipelining(BB);
+  assert(Info);
+
+  // Work outwards from the kernel.
+  bool KernelDisposed = false;
+  int TC = Schedule.getNumStages() - 1;
+  for (auto PI = Prologs.rbegin(), EI = Epilogs.rbegin(); PI != Prologs.rend();
+       ++PI, ++EI, --TC) {
+    MachineBasicBlock *Prolog = *PI;
+    MachineBasicBlock *Fallthrough = *Prolog->succ_begin();
+    MachineBasicBlock *Epilog = *EI;
+    SmallVector<MachineOperand, 4> Cond;
+    Optional<bool> StaticallyGreater =
+        Info->createTripCountGreaterCondition(TC, *Prolog, Cond);
+    if (!StaticallyGreater.hasValue()) {
+      LLVM_DEBUG(dbgs() << "Dynamic: TC > " << TC << "\n");
+      // Dynamically branch based on Cond.
+      TII->removeBranch(*Prolog);
+      TII->insertBranch(*Prolog, Epilog, Fallthrough, Cond, DebugLoc());
+    } else if (*StaticallyGreater == false) {
+      LLVM_DEBUG(dbgs() << "Static-false: TC > " << TC << "\n");
+      // Prolog never falls through; branch to epilog and orphan interior
+      // blocks. Leave it to unreachable-block-elim to clean up.
+      Prolog->removeSuccessor(Fallthrough);
+      for (MachineInstr &P : Fallthrough->phis()) {
+        // Drop the first incoming (value, block) pair; higher index first so
+        // the lower index stays valid.
+        P.RemoveOperand(2);
+        P.RemoveOperand(1);
+      }
+      TII->removeBranch(*Prolog);
+      TII->insertUnconditionalBranch(*Prolog, Epilog, DebugLoc());
+      KernelDisposed = true;
+    } else {
+      LLVM_DEBUG(dbgs() << "Static-true: TC > " << TC << "\n");
+      // Prolog always falls through; remove incoming values in epilog.
+      Prolog->removeSuccessor(Epilog);
+      for (MachineInstr &P : Epilog->phis()) {
+        // Drop the second incoming (value, block) pair.
+        P.RemoveOperand(4);
+        P.RemoveOperand(3);
+      }
+    }
+  }
+
+  if (!KernelDisposed) {
+    // The peeled prologs/epilogs account for NumStages - 1 iterations.
+    Info->adjustTripCount(-(Schedule.getNumStages() - 1));
+    Info->setPreheader(Prologs.back());
+  } else {
+    Info->disposed();
+  }
+}
+
+
+/// Runs a KernelRewriter over the scheduled loop.
+void PeelingModuloScheduleExpander::rewriteKernel() {
+  auto &ScheduledLoop = *Schedule.getLoop();
+  KernelRewriter Rewriter(ScheduledLoop, Schedule);
+  Rewriter.rewrite();
+}
+
+/// Entry point: caches the loop's top block and preheader, then rewrites the
+/// kernel, peels the prologs and epilogs, and fixes up the branches.
+void PeelingModuloScheduleExpander::expand() {
+  auto *ScheduledLoop = Schedule.getLoop();
+  BB = ScheduledLoop->getTopBlock();
+  Preheader = ScheduledLoop->getLoopPreheader();
+  LLVM_DEBUG(Schedule.dump());
+
+  // The three phases must run in this order.
+  rewriteKernel();
+  peelPrologAndEpilogs();
+  fixupBranches();
+}
+
void PeelingModuloScheduleExpander::validateAgainstModuloScheduleExpander() {
BB = Schedule.getLoop()->getTopBlock();
Preheader = Schedule.getLoop()->getLoopPreheader();
// Now run the new expansion algorithm.
KernelRewriter KR(*Schedule.getLoop(), Schedule);
KR.rewrite();
+ peelPrologAndEpilogs();
// Collect all illegal phis that the new algorithm created. We'll give these
// to KernelOperandInfo.
-# RUN: llc < %s -x mir -march=hexagon -run-pass=modulo-schedule-test | FileCheck %s
+# RUN: llc < %s -x mir -march=hexagon -run-pass=modulo-schedule-test -pipeliner-experimental-cg=true | FileCheck %s
# Simple check for this sanity test; ensure all instructions are in stage 0 in
# the prolog and stage 3 in the epilog.
; REQUIRES: asserts
; RUN: llc -march=hexagon -mcpu=hexagonv65 -O3 -debug-only=pipeliner \
-; RUN: < %s 2>&1 | FileCheck %s
+; RUN: < %s 2>&1 -pipeliner-experimental-cg=true | FileCheck %s
; Test that the artificial dependences are ignored while computing the
; circuits.
; REQUIRES: asserts
-; RUN: llc -march=hexagon -enable-pipeliner -enable-aa-sched-mi < %s | FileCheck %s
+; RUN: llc -march=hexagon -enable-pipeliner -enable-aa-sched-mi < %s -pipeliner-experimental-cg=true | FileCheck %s
; CHECK: loop0(
; CHECK: loop0(.LBB0_[[LOOP:.]],
-; RUN: llc -march=hexagon -rdf-opt=0 -disable-hexagon-misched -hexagon-initial-cfg-cleanup=0 -lsr-setupcost-depth-limit=1 < %s | FileCheck %s
+; RUN: llc -march=hexagon -rdf-opt=0 -disable-hexagon-misched -hexagon-initial-cfg-cleanup=0 -lsr-setupcost-depth-limit=1 < %s -pipeliner-experimental-cg=true | FileCheck %s
; Test that we generate the correct code when a loop carried value
; is scheduled one stage earlier than it's use. The code in
-# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 | FileCheck %s
+# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 -pipeliner-experimental-cg=true | FileCheck %s
# REQUIRES: asserts
# Test that the loop carried dependence check correctly identifies a recurrence.
-# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 | FileCheck %s
+# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 -pipeliner-experimental-cg=true | FileCheck %s
# REQUIRES: asserts
# Test that the loop carried dependence check correctly identifies a recurrence
; RUN: llc -march=hexagon -enable-pipeliner=true -stats -o /dev/null < %s \
-; RUN: 2>&1 | FileCheck %s --check-prefix=STATS
+; RUN: 2>&1 -pipeliner-experimental-cg=true | FileCheck %s --check-prefix=STATS
; REQUIRES: asserts
; Test that we do not schedule chained references too far apart,
-; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=1 < %s | FileCheck %s
+; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=1 < %s -pipeliner-experimental-cg=true | FileCheck %s
; Test that we update the offset correctly for loads that are
; moved past stores. In these cases, we change the dependences
-; RUN: llc -march=hexagon -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s
+; RUN: llc -march=hexagon -hexagon-initial-cfg-cleanup=0 < %s -pipeliner-experimental-cg=true | FileCheck %s
; Test that we generate the correct offsets for loads in the prolog
; after removing dependences on a post-increment instructions of the
-; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s
-; RUN: llc -march=hexagon -mcpu=hexagonv62 -enable-pipeliner < %s | FileCheck --check-prefix=CHECK-V62 %s
-; RUN: llc -march=hexagon -mcpu=hexagonv65 -enable-pipeliner < %s | FileCheck --check-prefix=CHECK-V65 %s
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv62 -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck --check-prefix=CHECK-V62 %s
+; RUN: llc -march=hexagon -mcpu=hexagonv65 -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck --check-prefix=CHECK-V65 %s
;
; Make sure we pipeline the loop and that we generate the correct
; RUN: llc -march=hexagon -enable-pipeliner -enable-pipeliner-opt-size \
; RUN: -verify-machineinstrs -hexagon-initial-cfg-cleanup=0 \
; RUN: -enable-aa-sched-mi=false -hexagon-expand-condsets=0 \
-; RUN: < %s | FileCheck %s
+; RUN: < %s -pipeliner-experimental-cg=true | FileCheck %s
; Disable expand-condsets because it will assert on undefined registers.
-; RUN: llc -march=hexagon -rdf-opt=0 < %s | FileCheck %s
+; RUN: llc -march=hexagon -rdf-opt=0 < %s -pipeliner-experimental-cg=true | FileCheck %s
; Test that we fixup a pipelined loop correctly when the number of
; stages is greater than the compile-time loop trip count. In this
-; RUN: llc -march=hexagon < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s
; Test that the pipeliner correctly fixes up the pipelined CFG when the loop
; has a constant trip count, and the trip count is less than the number of
-; RUN: llc -march=hexagon < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s
; XFAIL: *
; LSR changes required.
; REQUIRES: asserts
;
; RUN: llc -march=hexagon -enable-pipeliner=true -debug-only=pipeliner < %s \
-; RUN: 2>&1 | FileCheck %s
+; RUN: 2>&1 -pipeliner-experimental-cg=true | FileCheck %s
; Test that the artificial dependence is created as a result of
; CopyToPhi DAG mutation.
-; RUN: llc -march=hexagon -enable-pipeliner -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s
+; RUN: llc -march=hexagon -enable-pipeliner -hexagon-initial-cfg-cleanup=0 < %s -pipeliner-experimental-cg=true | FileCheck %s
; Test that the code that changes the dependences does not allow
; a load with a negative offset to be overlapped with the post
-; RUN: llc -march=hexagon < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s
; CHECK: loop0(.LBB0_{{[0-9]+}},#347)
target triple = "hexagon"
; XFAIL: *
; Needs some fixed in the pipeliner.
-; RUN: llc -march=hexagon < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s
; CHECK: endloop0
; CHECK: vmem
-; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=3 < %s | FileCheck %s
+; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=3 < %s -pipeliner-experimental-cg=true | FileCheck %s
%s.0 = type { i16, i8, i8, i16, i8, i8, i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i32, i16, i8, i8, %s.1, [2 x [16 x %s.2]], i32 (i8*, i8*, i8*, i8*, i8*)*, %s.3*, %s.3*, [120 x i8], i8, i8, %s.4*, [2 x [120 x [8 x i8]]], [56 x i8], [2 x [121 x %s.5]], [2 x %s.5], %s.5*, %s.5*, i32, i32, i16, i8, i8, %s.7, %s.9, %s.11, %s.8*, %s.8* }
%s.1 = type { i8, i8, i8, i8, i8, i8, i8, i8, i32, i8, [16 x i8], i8, [4 x i8], [32 x i16], [32 x i16], [2 x i8], [4 x i8], [2 x [4 x i8]], [2 x [4 x i8]], i32, i32, i16, i8 }
-; RUN: llc -march=hexagon -enable-pipeliner < %s | FileCheck %s
+; RUN: llc -march=hexagon -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck %s
; Test that we generate the correct value for a Phi in the epilog
; that is for a value defined two stages earlier. An extra copy in the
-; RUN: llc -march=hexagon < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s
; Test that we use the correct name in an epilog phi for a phi value
; that is defined for the last time in the kernel. Previously, we
-; RUN: llc -march=hexagon -mno-pairing -mno-compound -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s
+; RUN: llc -march=hexagon -mno-pairing -mno-compound -hexagon-initial-cfg-cleanup=0 < %s -pipeliner-experimental-cg=true | FileCheck %s
; XFAIL: *
; Test that we generate the correct phi names in the epilog when the pipeliner
-; RUN: llc -march=hexagon -enable-pipeliner-opt-size -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s
+; RUN: llc -march=hexagon -enable-pipeliner-opt-size -hexagon-initial-cfg-cleanup=0 < %s -pipeliner-experimental-cg=true | FileCheck %s
; Test that we generate the correct names for the phis in the kernel for the
; incoming values. In this case, the loop contains a phi and has another phi
; RUN: llc -march=hexagon -enable-pipeliner -stats \
; RUN: -pipeliner-prune-loop-carried=false -fp-contract=fast \
-; RUN: -o /dev/null < %s 2>&1 | FileCheck %s --check-prefix=STATS
+; RUN: -o /dev/null < %s 2>&1 -pipeliner-experimental-cg=true | FileCheck %s --check-prefix=STATS
; REQUIRES: asserts
; That that we do not pipeline this loop. The recurrence is too large. If
-; RUN: llc -march=hexagon -pipeliner-ignore-recmii -pipeliner-max-stages=2 -enable-pipeliner < %s | FileCheck %s
+; RUN: llc -march=hexagon -pipeliner-ignore-recmii -pipeliner-max-stages=2 -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck %s
; This is a loop we pipeline to three packets, though we could do bettter.
-; RUN: llc -march=hexagon -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s
+; RUN: llc -march=hexagon -hexagon-initial-cfg-cleanup=0 < %s -pipeliner-experimental-cg=true | FileCheck %s
; Test that the pipeliner schedules a store before the load in which there is a
; loop carried dependence. Previously, the loop carried dependence wasn't added
-; RUN: llc -march=hexagon -enable-pipeliner -stats -o /dev/null < %s 2>&1 | FileCheck %s --check-prefix=STATS
+; RUN: llc -march=hexagon -enable-pipeliner -stats -o /dev/null < %s 2>&1 -pipeliner-experimental-cg=true | FileCheck %s --check-prefix=STATS
; REQUIRES: asserts
; STATS: 1 pipeliner - Number of loops software pipelined
; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner \
-; RUN: -pipeliner-max-stages=2 < %s | FileCheck %s
+; RUN: -pipeliner-max-stages=2 < %s -pipeliner-experimental-cg=true | FileCheck %s
@A = global [8 x i32] [i32 4, i32 -3, i32 5, i32 -2, i32 -1, i32 2, i32 6, i32 -2], align 8
-; RUN: llc -march=hexagon -O3 < %s | FileCheck %s
+; RUN: llc -march=hexagon -O3 < %s -pipeliner-experimental-cg=true | FileCheck %s
; Test that the MinStart computation, which is based upon the length
; of the chain edges, is computed correctly. A bug in the code allowed
-; RUN: llc -march=hexagon -O2 -fp-contract=fast < %s | FileCheck %s
+; RUN: llc -march=hexagon -O2 -fp-contract=fast < %s -pipeliner-experimental-cg=true | FileCheck %s
; Test that the memoperands for instructions in the epilog are updated
; correctly. Previously, the pipeliner updated the offset for the memoperands
-; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck %s
; Make sure we attempt to pipeline all inner most loops.
-; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=2 < %s | FileCheck %s
+; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=2 < %s -pipeliner-experimental-cg=true | FileCheck %s
; Test that the generatePhi code doesn't rename a Phi instruction that's defined
; in the same block. The bug causes a Phi to incorrectly depend on another Phi.
-; RUN: llc -march=hexagon < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s
; Test that the instruction ordering code in the pipeliner fixes up dependences
; between post-increment register definitions and uses so that the register
-; RUN: llc -march=hexagon < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s
; Test that the pipeliner doesn't cause an assert and correctly pipelines the
; loop.
-; RUN: llc -march=hexagon < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s
; Test that when we order instructions in a packet we check for
; order dependences so that the source of an order dependence
-; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=2 < %s | FileCheck %s
+; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=2 < %s -pipeliner-experimental-cg=true | FileCheck %s
; Test that we generate the correct offsets after we removed unneeded
; chain dependences between Phis and generated a better pipeline.
-; RUN: llc -march=hexagon -debug-only=pipeliner < %s -o - 2>&1 | FileCheck %s
+; RUN: llc -march=hexagon -debug-only=pipeliner < %s -o - 2>&1 -pipeliner-experimental-cg=true | FileCheck %s
; REQUIRES: asserts
; Test that there is a chain edge between two dependent Phis.
-; RUN: llc -march=hexagon -hexagon-initial-cfg-cleanup=0 -enable-pipeliner -pipeliner-max-stages=2 < %s | FileCheck %s
+; RUN: llc -march=hexagon -hexagon-initial-cfg-cleanup=0 -enable-pipeliner -pipeliner-max-stages=2 < %s -pipeliner-experimental-cg=true | FileCheck %s
; Check that the pipelined code uses the proper address in the
; prolog and the kernel. The bug occurs when the address computation
-; RUN: llc -march=hexagon -enable-pipeliner -enable-bsb-sched=0 -join-liveintervals=false < %s | FileCheck %s
+; RUN: llc -march=hexagon -enable-pipeliner -enable-bsb-sched=0 -join-liveintervals=false < %s -pipeliner-experimental-cg=true | FileCheck %s
; XFAIL: *
; This test is failing after post-ra machine sinking.
; RUN: llc -disable-lsr -march=hexagon -enable-pipeliner \
-; RUN: -debug-only=pipeliner < %s 2>&1 > /dev/null | FileCheck %s
+; RUN: -debug-only=pipeliner < %s 2>&1 > /dev/null -pipeliner-experimental-cg=true | FileCheck %s
; REQUIRES: asserts
;
; Test that checks if the pipeliner is disabled by pragma
; RUN: llc -disable-lsr -march=hexagon -enable-pipeliner \
-; RUN: -debug-only=pipeliner < %s 2>&1 > /dev/null | FileCheck %s
+; RUN: -debug-only=pipeliner < %s 2>&1 > /dev/null -pipeliner-experimental-cg=true | FileCheck %s
; REQUIRES: asserts
;
; Test that checks if the II set by pragma was taken by pipeliner.
-; RUN: llc -march=hexagon -rdf-opt=0 < %s | FileCheck %s
+; RUN: llc -march=hexagon -rdf-opt=0 < %s -pipeliner-experimental-cg=true | FileCheck %s
; Test that we generate the correct name for a value in a prolog block. The
; pipeliner was using an incorrect value for an instruction in the 2nd prolog
-; RUN: llc -march=hexagon -enable-pipeliner < %s | FileCheck %s
+; RUN: llc -march=hexagon -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck %s
; A test that the Phi rewrite logic is correct.
-; RUN: llc -march=hexagon -enable-pipeliner -debug-only=pipeliner < %s -o - 2>&1 > /dev/null | FileCheck %s
+; RUN: llc -march=hexagon -enable-pipeliner -debug-only=pipeliner < %s -o - 2>&1 > /dev/null -pipeliner-experimental-cg=true | FileCheck %s
; REQUIRES: asserts
; Test that checks that we compute the correct ResMII for haar.
; RUN: llc -disable-lsr -march=hexagon -enable-pipeliner \
-; RUN: -debug-only=pipeliner < %s 2>&1 > /dev/null | FileCheck %s
+; RUN: -debug-only=pipeliner < %s 2>&1 > /dev/null -pipeliner-experimental-cg=true | FileCheck %s
; REQUIRES: asserts
;
; Test that checks if the ResMII is 1.
-; RUN: llc -march=hexagon < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s
; Test that the pipeliner generates correct code when attempting to reuse
; an existing phi. This test case contains a phi that references another
-; RUN: llc -march=hexagon -O2 < %s | FileCheck %s
+; RUN: llc -march=hexagon -O2 < %s -pipeliner-experimental-cg=true | FileCheck %s
; We do not pipeline sigma yet, but the non-pipelined version
; with good scheduling is pretty fast. The compiler generates
-; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner -pipeliner-max-stages=2 -disable-block-placement=0 -hexagon-bit=0 < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner -pipeliner-max-stages=2 -disable-block-placement=0 -hexagon-bit=0 < %s -pipeliner-experimental-cg=true | FileCheck %s
; Test that we rename registers correctly for multiple stages when there is a
; Phi that depends upon another Phi.
; CHECK: = and
; CHECK: = and
-; CHECK: = and
+; CHECK: r[[REGA:[0-9]+]] = memub(r{{[0-9]+}}+#1)
; CHECK: r[[REG0:[0-9]+]] = and(r[[REG1:[0-9]+]],#255)
; CHECK-NOT: r[[REG0]] = and(r[[REG1]],#255)
; CHECK: loop0(.LBB0_[[LOOP:.]],
-; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner -pipeliner-max-stages=2 -hexagon-bit=0 < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner -pipeliner-max-stages=2 -hexagon-bit=0 < %s -pipeliner-experimental-cg=true | FileCheck %s
; Very similar to swp-stages4.ll, but the pipelined schedule is a little
; different.
-; RUN: llc -march=hexagon -enable-pipeliner -stats -o /dev/null < %s 2>&1 | FileCheck %s --check-prefix=STATS
+; RUN: llc -march=hexagon -enable-pipeliner -stats -o /dev/null < %s 2>&1 -pipeliner-experimental-cg=true | FileCheck %s --check-prefix=STATS
; REQUIRES: asserts
; We're unable to pipeline a loop with a subreg as an operand of a Phi.
-; RUN: llc -march=hexagon -enable-pipeliner -stats -o /dev/null < %s 2>&1 | FileCheck %s --check-prefix=STATS
+; RUN: llc -march=hexagon -enable-pipeliner -stats -o /dev/null < %s 2>&1 -pipeliner-experimental-cg=true | FileCheck %s --check-prefix=STATS
; REQUIRES: asserts
; Test that we don't pipeline, incorrectly, the swap operation.
-; RUN: llc -march=hexagon -enable-pipeliner -hexagon-initial-cfg-cleanup=0 -stats -o /dev/null < %s 2>&1 | FileCheck %s --check-prefix=STATS
+; RUN: llc -march=hexagon -enable-pipeliner -hexagon-initial-cfg-cleanup=0 -stats -o /dev/null < %s 2>&1 -pipeliner-experimental-cg=true | FileCheck %s --check-prefix=STATS
; REQUIRES: asserts
; Check that we handle the case when a value is first defined in the loop.
-; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s
-; RUN: llc -march=hexagon -mcpu=hexagonv5 -O2 < %s | FileCheck %s
-; RUN: llc -march=hexagon -mcpu=hexagonv5 -O3 < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -O2 < %s -pipeliner-experimental-cg=true | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -O3 < %s -pipeliner-experimental-cg=true | FileCheck %s
;
; Check that we pipeline a vectorized dot product in a single packet.
;
; REQUIRES: to-be-fixed
-; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck %s
; Multiply and accumulate
; CHECK: mpyi([[REG0:r([0-9]+)]],[[REG1:r([0-9]+)]])
-; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s
-; RUN: llc -march=hexagon -mcpu=hexagonv60 -enable-pipeliner < %s | FileCheck %s --check-prefix=CHECKV60
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv60 -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck %s --check-prefix=CHECKV60
; Simple vector total.
; CHECK: loop0(.LBB0_[[LOOP:.]],