From f294f578dcc8865d6fbf41165050b474c3d250f6 Mon Sep 17 00:00:00 2001
From: Quentin Colombet
Date: Sat, 28 Jan 2017 01:05:27 +0000
Subject: [PATCH] [RegisterCoalescing] Recommit the patch "Remove partially redundant copy".

In r292621, the recommit fixed a bug related to the live interval update
after the partially redundant copy is moved.

This recommit solves an additional bug: the subranges were not being
updated.

The original patch solves the performance problem described in PR27827.
Register coalescing sometimes cannot remove a copy because of
interference. But if we can find a reverse copy in one of the predecessor
blocks of the copy, the copy is partially redundant and we may remove it
partially by moving it to the predecessor block that does not contain the
reverse copy.

Differential Revision: https://reviews.llvm.org/D28585

Re-apply r292621

Revert "Revert rL292621. Caused some internal build bot failures in apple."

This reverts commit r292984.

Original patch: Wei Mi
Subrange fix: Mostly Matthias Braun

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@293353 91177308-0d34-0410-b5e6-96231b3b80d8
---
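A rough way to picture the pattern this change targets is the hand-written C
sketch below. It is only an illustration: the function and variable names are
invented (they are not taken from PR27827 or from the tests added here), and
the mapping to the copies A = B and B = A described in the code comments is
simplified.

    int acc;                        /* global whose value, roughly, lives in vreg B */

    int sum_digits(const char *p) {
      int cur = acc;                /* reverse copy A = B in the loop's
                                       predecessor (the preheader)                  */
      while (*p) {
        cur = cur * 10 + (*p++ - '0');
        acc = cur;                  /* copy B = A inside the hot loop body          */
      }
      return cur;
    }

Interference between A and B keeps the coalescer from joining them, so the
B = A copy used to stay in the loop body; with this change it can be moved to
the colder predecessor that does not contain the reverse copy.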
 include/llvm/CodeGen/LiveIntervalAnalysis.h  |  10 +
 lib/CodeGen/RegisterCoalescer.cpp            | 188 +++++++++++++
 lib/Transforms/Scalar/LoopStrengthReduce.cpp |   4 +-
 test/CodeGen/X86/pre-coalesce-2.ll           | 281 +++++++++++++++++++
 test/CodeGen/X86/pre-coalesce.ll             |  51 ++++
 test/CodeGen/X86/pre-coalesce.mir            | 122 ++++++++
 6 files changed, 655 insertions(+), 1 deletion(-)
 create mode 100644 test/CodeGen/X86/pre-coalesce-2.ll
 create mode 100644 test/CodeGen/X86/pre-coalesce.ll
 create mode 100644 test/CodeGen/X86/pre-coalesce.mir

diff --git a/include/llvm/CodeGen/LiveIntervalAnalysis.h b/include/llvm/CodeGen/LiveIntervalAnalysis.h
index d08053a84d4..f5b1f87720a 100644
--- a/include/llvm/CodeGen/LiveIntervalAnalysis.h
+++ b/include/llvm/CodeGen/LiveIntervalAnalysis.h
@@ -189,6 +189,16 @@ extern cl::opt<bool> UseSegmentSetForPhysRegs;
     void pruneValue(LiveRange &LR, SlotIndex Kill,
                     SmallVectorImpl<SlotIndex> *EndPoints);
 
+    /// This function should not be used. Its intent is to tell you that
+    /// you are doing something wrong if you call pruneValue directly on a
+    /// LiveInterval. Indeed, you are supposed to call pruneValue on the main
+    /// LiveRange and on the LiveRanges of all the subranges, if any.
+    LLVM_ATTRIBUTE_UNUSED void pruneValue(LiveInterval &, SlotIndex,
+                                          SmallVectorImpl<SlotIndex> *) {
+      llvm_unreachable(
+          "Use pruneValue on the main LiveRange and on each subrange");
+    }
+
     SlotIndexes *getSlotIndexes() const {
       return Indexes;
     }
diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp
index 57d5b5013d8..778ea8eaca1 100644
--- a/lib/CodeGen/RegisterCoalescer.cpp
+++ b/lib/CodeGen/RegisterCoalescer.cpp
@@ -22,6 +22,7 @@
 #include "llvm/CodeGen/LiveRangeEdit.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
@@ -189,6 +190,9 @@ namespace {
     /// This returns true if an interval was modified.
     bool removeCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI);
 
+    /// We found a copy which can be moved to its less frequent predecessor.
+    bool removePartialRedundancy(const CoalescerPair &CP, MachineInstr &CopyMI);
+
     /// If the source of a copy is defined by a
     /// trivial computation, replace the copy by rematerialize the definition.
     bool reMaterializeTrivialDef(const CoalescerPair &CP, MachineInstr *CopyMI,
@@ -861,6 +865,184 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
   return true;
 }
 
+/// For copy B = A in BB2, if A is defined by A = B in BB0 which is a
+/// predecessor of BB2, and if B is not redefined on the way from A = B
+/// in BB0 to B = A in BB2, then B = A in BB2 is partially redundant if the
+/// execution goes through the path from BB0 to BB2. We may move B = A
+/// to the predecessor without such a reversed copy.
+/// So we will transform the program from:
+///   BB0:
+///      A = B;    BB1:
+///       ...         ...
+///     /     \      /
+///             BB2:
+///               ...
+///               B = A;
+///
+/// to:
+///
+///   BB0:          BB1:
+///      A = B;        ...
+///       ...          B = A;
+///     /     \       /
+///             BB2:
+///               ...
+///
+/// A special case is when BB0 and BB2 are the same BB which is the only
+/// BB in a loop:
+///   BB1:
+///        ...
+///   BB0/BB2:  ----
+///        B = A;   |
+///        ...      |
+///        A = B;   |
+///          |-------
+///          |
+/// We may hoist B = A from BB0/BB2 to BB1.
+///
+/// The major preconditions for correctness to remove such partial
+/// redundancy include:
+/// 1. A in B = A in BB2 is defined by a PHI in BB2, and one operand of
+///    the PHI is defined by the reversed copy A = B in BB0.
+/// 2. No B is referenced from the start of BB2 to B = A.
+/// 3. No B is defined from A = B to the end of BB0.
+/// 4. BB1 has only one successor.
+///
+/// 2 and 4 implicitly ensure B is not live at the end of BB1.
+/// 4 guarantees BB2 is hotter than BB1, so we can only move a copy to a
+/// colder place, which not only prevents an endless loop but also makes
+/// sure the movement of the copy is beneficial.
+bool RegisterCoalescer::removePartialRedundancy(const CoalescerPair &CP,
+                                                MachineInstr &CopyMI) {
+  assert(!CP.isPhys());
+  if (!CopyMI.isFullCopy())
+    return false;
+
+  MachineBasicBlock &MBB = *CopyMI.getParent();
+  if (MBB.isEHPad())
+    return false;
+
+  if (MBB.pred_size() != 2)
+    return false;
+
+  LiveInterval &IntA =
+      LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
+  LiveInterval &IntB =
+      LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
+
+  // A is defined by PHI at the entry of MBB.
+  SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot(true);
+  VNInfo *AValNo = IntA.getVNInfoAt(CopyIdx);
+  assert(AValNo && !AValNo->isUnused() && "COPY source not live");
+  if (!AValNo->isPHIDef())
+    return false;
+
+  // No B is referenced before CopyMI in MBB.
+  if (IntB.overlaps(LIS->getMBBStartIdx(&MBB), CopyIdx))
+    return false;
+
+  // MBB has two predecessors: one contains A = B so no copy will be inserted
+  // for it. The other one will have a copy moved from MBB.
+  bool FoundReverseCopy = false;
+  MachineBasicBlock *CopyLeftBB = nullptr;
+  for (MachineBasicBlock *Pred : MBB.predecessors()) {
+    VNInfo *PVal = IntA.getVNInfoBefore(LIS->getMBBEndIdx(Pred));
+    MachineInstr *DefMI = LIS->getInstructionFromIndex(PVal->def);
+    if (!DefMI || !DefMI->isFullCopy()) {
+      CopyLeftBB = Pred;
+      continue;
+    }
+    // Check DefMI is a reverse copy and it is in BB Pred.
+    if (DefMI->getOperand(0).getReg() != IntA.reg ||
+        DefMI->getOperand(1).getReg() != IntB.reg ||
+        DefMI->getParent() != Pred) {
+      CopyLeftBB = Pred;
+      continue;
+    }
+    // If there is any other def of B after DefMI and before the end of Pred,
+    // we need to keep the copy of B = A at the end of Pred if we remove
+    // B = A from MBB.
+    bool ValB_Changed = false;
+    for (auto VNI : IntB.valnos) {
+      if (VNI->isUnused())
+        continue;
+      if (PVal->def < VNI->def && VNI->def < LIS->getMBBEndIdx(Pred)) {
+        ValB_Changed = true;
+        break;
+      }
+    }
+    if (ValB_Changed) {
+      CopyLeftBB = Pred;
+      continue;
+    }
+    FoundReverseCopy = true;
+  }
+
+  // If no reverse copy is found in predecessors, nothing to do.
+  if (!FoundReverseCopy)
+    return false;
+
+  // If CopyLeftBB is nullptr, it means every predecessor of MBB contains a
+  // reverse copy, and CopyMI can be removed trivially if only IntA/IntB is
+  // updated. If CopyLeftBB is not nullptr, move CopyMI from MBB to CopyLeftBB
+  // and update IntA/IntB.
+  //
+  // If CopyLeftBB is not nullptr, ensure CopyLeftBB has a single succ so
+  // MBB is hotter than CopyLeftBB.
+  if (CopyLeftBB && CopyLeftBB->succ_size() > 1)
+    return false;
+
+  // Now ok to move copy.
+  if (CopyLeftBB) {
+    DEBUG(dbgs() << "\tremovePartialRedundancy: Move the copy to BB#"
+                 << CopyLeftBB->getNumber() << '\t' << CopyMI);
+
+    // Insert new copy to CopyLeftBB.
+    auto InsPos = CopyLeftBB->getFirstTerminator();
+    MachineInstr *NewCopyMI = BuildMI(*CopyLeftBB, InsPos, CopyMI.getDebugLoc(),
+                                      TII->get(TargetOpcode::COPY), IntB.reg)
+                                  .addReg(IntA.reg);
+    SlotIndex NewCopyIdx =
+        LIS->InsertMachineInstrInMaps(*NewCopyMI).getRegSlot();
+    IntB.createDeadDef(NewCopyIdx, LIS->getVNInfoAllocator());
+    for (LiveInterval::SubRange &SR : IntB.subranges())
+      SR.createDeadDef(NewCopyIdx, LIS->getVNInfoAllocator());
+  } else {
+    DEBUG(dbgs() << "\tremovePartialRedundancy: Remove the copy from BB#"
+                 << MBB.getNumber() << '\t' << CopyMI);
+  }
+
+  // Remove CopyMI.
+  // Note: it is fine to remove the copy before updating the live-ranges.
+  // While updating the live-ranges, we only look at slot indices and
+  // never go back to the instruction.
+  LIS->RemoveMachineInstrFromMaps(CopyMI);
+  CopyMI.eraseFromParent();
+
+  // Update the liveness.
+  SmallVector<SlotIndex, 8> EndPoints;
+  VNInfo *BValNo = IntB.Query(CopyIdx).valueOutOrDead();
+  LIS->pruneValue(*static_cast<LiveRange *>(&IntB), CopyIdx.getRegSlot(),
+                  &EndPoints);
+  BValNo->markUnused();
+  // Extend IntB to the EndPoints of its original live interval.
+  LIS->extendToIndices(IntB, EndPoints);
+
+  // Now, do the same for its subranges.
+  for (LiveInterval::SubRange &SR : IntB.subranges()) {
+    EndPoints.clear();
+    VNInfo *BValNo = SR.Query(CopyIdx).valueOutOrDead();
+    assert(BValNo && "All sublanes should be live");
+    LIS->pruneValue(SR, CopyIdx.getRegSlot(), &EndPoints);
+    BValNo->markUnused();
+    LIS->extendToIndices(SR, EndPoints);
+  }
+
+  // Finally, update the live-range of IntA.
+  shrinkToUses(&IntA);
+  return true;
+}
+
 /// Returns true if @p MI defines the full vreg @p Reg, as opposed to just
 /// defining a subregister.
 static bool definesFullReg(const MachineInstr &MI, unsigned Reg) {
@@ -1486,6 +1668,12 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
     }
   }
 
+  // Try and see if we can partially eliminate the copy by moving the copy to
+  // its predecessor.
+  if (!CP.isPartial() && !CP.isPhys())
+    if (removePartialRedundancy(CP, *CopyMI))
+      return true;
+
   // Otherwise, we are unable to join the intervals.
   DEBUG(dbgs() << "\tInterference!\n");
   Again = true;  // May be possible to coalesce later.
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 01728ae680d..0aa023a4d61 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -2986,8 +2986,10 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
     User::op_iterator UseI =
         find(UserInst->operands(), U.getOperandValToReplace());
     assert(UseI != UserInst->op_end() && "cannot find IV operand");
-    if (IVIncSet.count(UseI))
+    if (IVIncSet.count(UseI)) {
+      DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
       continue;
+    }
 
     LSRUse::KindType Kind = LSRUse::Basic;
     MemAccessTy AccessTy;
diff --git a/test/CodeGen/X86/pre-coalesce-2.ll b/test/CodeGen/X86/pre-coalesce-2.ll
new file mode 100644
index 00000000000..90fcd1875d4
--- /dev/null
+++ b/test/CodeGen/X86/pre-coalesce-2.ll
@@ -0,0 +1,281 @@
+; RUN: llc -regalloc=greedy -verify-coalescing -mtriple=x86_64-unknown-linux-gnu < %s
+; Check the live range is updated properly after register coalescing.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@.str = internal unnamed_addr constant { [17 x i8], [47 x i8] } { [17 x i8] c"0123456789ABCDEF\00", [47 x i8] zeroinitializer }, align 32
+@b = common local_unnamed_addr global i32 0, align 4
+@a = common local_unnamed_addr global i32* null, align 8
+@__sancov_gen_cov = private global [9 x i32] zeroinitializer
+
+; Function Attrs: nounwind sanitize_address
+define void @fn2(i8* %p1) local_unnamed_addr #0 {
+entry:
+  %0 = load atomic i32, i32* inttoptr (i64 add (i64 ptrtoint ([9 x i32]* @__sancov_gen_cov to i64), i64 4) to i32*) monotonic, align 4
+  %1 = icmp sge i32 0, %0
+  br i1 %1, label %2, label %3
+
+;