[Pipeliner] Improve serialization order for post-increments

author Krzysztof Parzyszek <kparzysz@codeaurora.org>

Wed, 11 Oct 2017 15:51:44 +0000 (15:51 +0000)

committer Krzysztof Parzyszek <kparzysz@codeaurora.org>

Wed, 11 Oct 2017 15:51:44 +0000 (15:51 +0000)
author Krzysztof Parzyszek <kparzysz@codeaurora.org>
Wed, 11 Oct 2017 15:51:44 +0000 (15:51 +0000)
committer Krzysztof Parzyszek <kparzysz@codeaurora.org>
Wed, 11 Oct 2017 15:51:44 +0000 (15:51 +0000)
diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp

index d0616fd678bda3f53ede024e0128b3432b36f5b6..20141f7f8d60f905117c31e00260a4397dfee24e 100644 (file)
--- a/lib/CodeGen/MachinePipeliner.cpp
+++ b/lib/CodeGen/MachinePipeliner.cpp
@@ -369,8 +369,9 @@ public:
    /// Set the Minimum Initiation Interval for this schedule attempt.
    void setMII(unsigned mii) { MII = mii; }
  
-  MachineInstr *applyInstrChange(MachineInstr *MI, SMSchedule &Schedule,
-                                 bool UpdateDAG = false);
+  void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule);
+
+  void fixupRegisterOverlaps(std::deque<SUnit *> &Instrs);
  
    /// Return the new base register that was stored away for the changed
    /// instruction.
@@ -3390,9 +3391,8 @@ bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
  
  /// Apply changes to the instruction if needed. The changes are need
  /// to improve the scheduling and depend up on the final schedule.
-MachineInstr *SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
-                                                  SMSchedule &Schedule,
-                                                  bool UpdateDAG) {
+void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
+                                         SMSchedule &Schedule) {
    SUnit *SU = getSUnit(MI);
    DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
        InstrChanges.find(SU);
@@ -3400,7 +3400,7 @@ MachineInstr *SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
      std::pair<unsigned, int64_t> RegAndOffset = It->second;
      unsigned BasePos, OffsetPos;
      if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
-      return nullptr;
+      return;
      unsigned BaseReg = MI->getOperand(BasePos).getReg();
      MachineInstr *LoopDef = findDefInLoop(BaseReg);
      int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
@@ -3418,15 +3418,11 @@ MachineInstr *SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
        int64_t NewOffset =
            MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
        NewMI->getOperand(OffsetPos).setImm(NewOffset);
-      if (UpdateDAG) {
-        SU->setInstr(NewMI);
-        MISUnitMap[NewMI] = SU;
-      }
+      SU->setInstr(NewMI);
+      MISUnitMap[NewMI] = SU;
        NewMIs.insert(NewMI);
-      return NewMI;
      }
    }
-  return nullptr;
  }
  
  /// Return true for an order dependence that is loop carried potentially.
@@ -3872,6 +3868,53 @@ bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
    return true;
  }
  
+/// Attempt to fix the degenerate cases when the instruction serialization
+/// causes the register lifetimes to overlap. For example,
+///   p' = store_pi(p, b)
+///      = load p, offset
+/// In this case p and p' overlap, which means that two registers are needed.
+/// Instead, this function changes the load to use p' and updates the offset.
+void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
+  unsigned OverlapReg = 0;
+  unsigned NewBaseReg = 0;
+  for (SUnit *SU : Instrs) {
+    MachineInstr *MI = SU->getInstr();
+    for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+      // Look for an instruction that uses p. The instruction occurs in the
+      // same cycle but occurs later in the serialized order.
+      if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
+        // Check that the instruction appears in the InstrChanges structure,
+        // which contains instructions that can have the offset updated.
+        DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
+          InstrChanges.find(SU);
+        if (It != InstrChanges.end()) {
+          unsigned BasePos, OffsetPos;
+          // Update the base register and adjust the offset.
+          if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
+            MI->getOperand(BasePos).setReg(NewBaseReg);
+            int64_t Offset = MI->getOperand(OffsetPos).getImm();
+            MI->getOperand(OffsetPos).setImm(Offset - It->second.second);
+          }
+        }
+        OverlapReg = 0;
+        NewBaseReg = 0;
+        break;
+      }
+      // Look for an instruction of the form p' = op(p), which uses and defines
+      // two virtual registers that get allocated to the same physical register.
+      unsigned TiedUseIdx = 0;
+      if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
+        // OverlapReg is p in the example above.
+        OverlapReg = MI->getOperand(TiedUseIdx).getReg();
+        // NewBaseReg is p' in the example above.
+        NewBaseReg = MI->getOperand(i).getReg();
+        break;
+      }
+    }
+  }
+}
+
  /// After the schedule has been formed, call this function to combine
  /// the instructions from the different stages/cycles.  That is, this
  /// function creates a schedule that represents a single iteration.
@@ -3932,7 +3975,7 @@ void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
    // map. We need to use the new registers to create the correct order.
    for (int i = 0, e = SSD->SUnits.size(); i != e; ++i) {
      SUnit *SU = &SSD->SUnits[i];
-    SSD->applyInstrChange(SU->getInstr(), *this, true);
+    SSD->applyInstrChange(SU->getInstr(), *this);
    }
  
    // Reorder the instructions in each cycle to fix and improve the
@@ -3956,6 +3999,7 @@ void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
      // Replace the old order with the new order.
      cycleInstrs.swap(newOrderZC);
      cycleInstrs.insert(cycleInstrs.end(), newOrderI.begin(), newOrderI.end());
+    SSD->fixupRegisterOverlaps(cycleInstrs);
    }
  
    DEBUG(dump(););
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp

index bcd1a5089c710db321e50ef55b62c2ba0a5efef1..c2125bec3a521ee898c57e363e23d974cf12433e 100644 (file)
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -1651,8 +1651,13 @@ bool HexagonInstrInfo::areMemAccessesTriviallyDisjoint(
  bool HexagonInstrInfo::getIncrementValue(const MachineInstr &MI,
        int &Value) const {
    if (isPostIncrement(MI)) {
+    // For a post-increment, the offset is zero and the increment value is
+    // determined by the instruction's access size.
+    int Zero;
      unsigned AccessSize;
-    return getBaseAndOffset(MI, Value, AccessSize);
+    bool RetVal = getBaseAndOffset(MI, Zero, AccessSize);
+    Value = (int) AccessSize;
+    return RetVal;
    }
    if (MI.getOpcode() == Hexagon::A2_addi) {
      Value = MI.getOperand(2).getImm();
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp

index 0f407c2d8366be56dad1f56c4f8c08830afdbb28..2555b50f91cff2ca71afecfb113f806756ce7716 100644 (file)
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -496,6 +496,48 @@ void HexagonPacketizerList::useCalleesSP(MachineInstr &MI) {
    Off.setImm(Off.getImm() + FrameSize + HEXAGON_LRFP_SIZE);
  }
  
+/// Return true if we can update the offset in MI so that MI and MJ
+/// can be packetized together.
+bool HexagonPacketizerList::updateOffset(SUnit *SUI, SUnit *SUJ) {
+  assert(SUI->getInstr() && SUJ->getInstr());
+  MachineInstr &MI = *SUI->getInstr();
+  MachineInstr &MJ = *SUJ->getInstr();
+
+  unsigned BPI, OPI;
+  if (!HII->getBaseAndOffsetPosition(MI, BPI, OPI))
+    return false;
+  unsigned BPJ, OPJ;
+  if (!HII->getBaseAndOffsetPosition(MJ, BPJ, OPJ))
+    return false;
+  unsigned Reg = MI.getOperand(BPI).getReg();
+  if (Reg != MJ.getOperand(BPJ).getReg())
+    return false;
+  // Make sure that the dependences do not restrict adding MI to the packet.
+  // That is, ignore anti dependences, and make sure the only data dependence
+  // involves the specific register.
+  for (const auto &PI : SUI->Preds)
+    if (PI.getKind() != SDep::Anti &&
+        (PI.getKind() != SDep::Data || PI.getReg() != Reg))
+      return false;
+  int Incr;
+  if (!HII->getIncrementValue(MJ, Incr))
+    return false;
+
+  int64_t Offset = MI.getOperand(OPI).getImm();
+  MI.getOperand(OPI).setImm(Offset + Incr);
+  ChangedOffset = Offset;
+  return true;
+}
+
+/// Undo the changed offset. This is needed if the instruction cannot be
+/// added to the current packet due to a different instruction.
+void HexagonPacketizerList::undoChangedOffset(MachineInstr &MI) {
+  unsigned BP, OP;
+  if (!HII->getBaseAndOffsetPosition(MI, BP, OP))
+    llvm_unreachable("Unable to find base and offset operands.");
+  MI.getOperand(OP).setImm(ChangedOffset);
+}
+
  enum PredicateKind {
    PK_False,
    PK_True,
@@ -980,6 +1022,7 @@ void HexagonPacketizerList::initPacketizerState() {
    GlueToNewValueJump = false;
    GlueAllocframeStore = false;
    FoundSequentialDependence = false;
+  ChangedOffset = INT64_MAX;
  }
  
  // Ignore bundling of pseudo instructions.
@@ -1567,6 +1610,15 @@ bool HexagonPacketizerList::isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) {
      useCalleesSP(I);
      GlueAllocframeStore = false;
    }
+
+  if (ChangedOffset != INT64_MAX)
+    undoChangedOffset(I);
+  else if (updateOffset(SUI, SUJ)) {
+    FoundSequentialDependence = false;
+    Dependence = false;
+    return true;
+  }
+
    return false;
  }
  
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.h b/lib/Target/Hexagon/HexagonVLIWPacketizer.h

index 3a7bdf5101e6d82da4986bdd9af7b1aaaf929762..cbdd2367429d480f46eaeba3c5ad0340e7fac9bb 100644 (file)
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.h
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.h
@@ -38,6 +38,9 @@ class HexagonPacketizerList : public VLIWPacketizerList {
    // Has the feeder instruction been glued to new value jump.
    bool GlueToNewValueJump;
  
+  // This holds the offset value, when pruning the dependences.
+  int64_t ChangedOffset;
+
    // Check if there is a dependence between some instruction already in this
    // packet and this instruction.
    bool Dependence;
@@ -117,6 +120,8 @@ protected:
    bool demoteToDotOld(MachineInstr &MI);
    bool useCallersSP(MachineInstr &MI);
    void useCalleesSP(MachineInstr &MI);
+  bool updateOffset(SUnit *SUI, SUnit *SUJ);
+  void undoChangedOffset(MachineInstr &MI);
    bool arePredicatesComplements(MachineInstr &MI1, MachineInstr &MI2);
    bool restrictingDepExistInPacket(MachineInstr&, unsigned);
    bool isNewifiable(const MachineInstr &MI, const TargetRegisterClass *NewRC);
diff --git a/test/CodeGen/Hexagon/swp-order-copies.ll b/test/CodeGen/Hexagon/swp-order-copies.ll

new file mode 100644 (file)

index 0000000..5de0717
--- /dev/null
+++ b/test/CodeGen/Hexagon/swp-order-copies.ll
@@ -0,0 +1,37 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; Test that the instruction ordering code in the pipeliner fixes up dependences
+; between post-increment register definitions and uses so that the register
+; allocator does not allocate an additional register. The following test case
+; should generate a single packet.
+
+; CHECK: loop0(.LBB0_[[LOOP:.]],
+; CHECK: .LBB0_[[LOOP]]:
+; CHECK: {
+; CHECK-NOT: {
+; CHECK: :endloop0
+
+define void @test(i64* nocapture %v1, i64 %v2, i32 %len) local_unnamed_addr #0 {
+entry:
+  %cmp7 = icmp sgt i32 %len, 0
+  br i1 %cmp7, label %for.body, label %for.end
+
+for.body:
+  %arrayidx.phi = phi i64* [ %arrayidx.inc, %for.body ], [ %v1, %entry ]
+  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %0 = load i64, i64* %arrayidx.phi, align 8
+  %1 = tail call i64 @llvm.hexagon.M2.mmpyul.rs1(i64 %0, i64 %v2)
+  store i64 %1, i64* %arrayidx.phi, align 8
+  %inc = add nuw nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %inc, %len
+  %arrayidx.inc = getelementptr i64, i64* %arrayidx.phi, i32 1
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare i64 @llvm.hexagon.M2.mmpyul.rs1(i64, i64) #1
+
+attributes #0 = { nounwind "target-cpu"="hexagonv60" }
+attributes #1 = { nounwind readnone }
author	Krzysztof Parzyszek <kparzysz@codeaurora.org>
	Wed, 11 Oct 2017 15:51:44 +0000 (15:51 +0000)
committer	Krzysztof Parzyszek <kparzysz@codeaurora.org>
	Wed, 11 Oct 2017 15:51:44 +0000 (15:51 +0000)
lib/CodeGen/MachinePipeliner.cpp		patch \| blob \| history
lib/Target/Hexagon/HexagonInstrInfo.cpp		patch \| blob \| history
lib/Target/Hexagon/HexagonVLIWPacketizer.cpp		patch \| blob \| history
lib/Target/Hexagon/HexagonVLIWPacketizer.h		patch \| blob \| history
test/CodeGen/Hexagon/swp-order-copies.ll	[new file with mode: 0644]	patch \| blob