[AMDGPU] Prevent spills before exec mask is restored

author Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>

Fri, 20 Jan 2017 00:44:31 +0000 (00:44 +0000)

committer Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>

Fri, 20 Jan 2017 00:44:31 +0000 (00:44 +0000)
author Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Fri, 20 Jan 2017 00:44:31 +0000 (00:44 +0000)
committer Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Fri, 20 Jan 2017 00:44:31 +0000 (00:44 +0000)
diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h

index 83515bc918413e045c805f26280158b5f31d7d66..7d50201c3b767aa6a159fad0ff298ff329b43bad 100644 (file)
--- a/include/llvm/Target/TargetInstrInfo.h
+++ b/include/llvm/Target/TargetInstrInfo.h
@@ -1510,6 +1510,13 @@ public:
      return false;
    }
  
+  /// True if the instruction is bound to the top of its basic block and no
+  /// other instructions shall be inserted before it. This can be implemented
+  /// to prevent register allocator to insert spills before such instructions.
+  virtual bool isBasicBlockPrologue(const MachineInstr &MI) const {
+    return false;
+  }
+
  private:
    unsigned CallFrameSetupOpcode, CallFrameDestroyOpcode;
    unsigned CatchRetOpcode;
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp

index 3869f976854d0119e43e6d63070f3a5355da98d4..a7d87799f6796f7796d1f799296b548315a2628d 100644 (file)
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -148,8 +148,11 @@ MachineBasicBlock::iterator MachineBasicBlock::getFirstNonPHI() {
  
  MachineBasicBlock::iterator
  MachineBasicBlock::SkipPHIsAndLabels(MachineBasicBlock::iterator I) {
+  const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo();
+
    iterator E = end();
-  while (I != E && (I->isPHI() || I->isPosition()))
+  while (I != E && (I->isPHI() || I->isPosition() ||
+                    TII->isBasicBlockPrologue(*I)))
      ++I;
    // FIXME: This needs to change if we wish to bundle labels
    // inside the bundle.
@@ -160,8 +163,11 @@ MachineBasicBlock::SkipPHIsAndLabels(MachineBasicBlock::iterator I) {
  
  MachineBasicBlock::iterator
  MachineBasicBlock::SkipPHIsLabelsAndDebug(MachineBasicBlock::iterator I) {
+  const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo();
+
    iterator E = end();
-  while (I != E && (I->isPHI() || I->isPosition() || I->isDebugValue()))
+  while (I != E && (I->isPHI() || I->isPosition() || I->isDebugValue() ||
+                    TII->isBasicBlockPrologue(*I)))
      ++I;
    // FIXME: This needs to change if we wish to bundle labels / dbg_values
    // inside the bundle.
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp

index e9dc6f57ab35efbdfd778fc510b1679e25de3587..a647e29c82c9eb7b790c28d347ca3591eb270d39 100644 (file)
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3635,3 +3635,8 @@ ScheduleHazardRecognizer *
  SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
    return new GCNHazardRecognizer(MF);
  }
+
+bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
+  return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
+         MI.modifiesRegister(AMDGPU::EXEC, &RI);
+}
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h

index e68f6f92ba964f03a4021e7d4e75d4e97980eb11..18d7ff6f29e03088bcb0b42985032d4b864c72c1 100644 (file)
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -731,6 +731,8 @@ public:
  
    ScheduleHazardRecognizer *
    CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const override;
+
+  bool isBasicBlockPrologue(const MachineInstr &MI) const override;
  };
  
  namespace AMDGPU {
diff --git a/test/CodeGen/AMDGPU/spill-cfg-position.ll b/test/CodeGen/AMDGPU/spill-cfg-position.ll

new file mode 100644 (file)

index 0000000..686c831
--- /dev/null
+++ b/test/CodeGen/AMDGPU/spill-cfg-position.ll
@@ -0,0 +1,78 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs -stress-regalloc=6 < %s | FileCheck %s
+
+; Inline spiller can decide to move a spill as early as possible in the basic block.
+; It will skip phis and label, but we also need to make sure it skips instructions
+; in the basic block prologue which restore exec mask.
+; Make sure instruction to restore exec mask immediately follows label
+
+; CHECK-LABEL: {{^}}spill_cfg_position:
+; CHECK: s_cbranch_execz [[LABEL1:BB[0-9_]+]]
+; CHECK: {{^}}[[LABEL1]]:
+; CHECK: s_cbranch_execz [[LABEL2:BB[0-9_]+]]
+; CHECK: {{^}}[[LABEL2]]:
+; CHECK-NEXT: s_or_b64 exec
+; CHECK: buffer_
+
+define void @spill_cfg_position(i32 addrspace(1)* nocapture %arg) {
+bb:
+  %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #0
+  %tmp14 = load i32, i32 addrspace(1)* %arg, align 4
+  %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4
+  %tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  %tmp18 = load i32, i32 addrspace(1)* %tmp17, align 4
+  %tmp19 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
+  %tmp20 = load i32, i32 addrspace(1)* %tmp19, align 4
+  %tmp21 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 4
+  %tmp22 = load i32, i32 addrspace(1)* %tmp21, align 4
+  %tmp23 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 5
+  %tmp24 = load i32, i32 addrspace(1)* %tmp23, align 4
+  %tmp25 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 6
+  %tmp26 = load i32, i32 addrspace(1)* %tmp25, align 4
+  %tmp27 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 7
+  %tmp28 = load i32, i32 addrspace(1)* %tmp27, align 4
+  %tmp29 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 8
+  %tmp30 = load i32, i32 addrspace(1)* %tmp29, align 4
+  %tmp33 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp1
+  %tmp34 = load i32, i32 addrspace(1)* %tmp33, align 4
+  %tmp35 = icmp eq i32 %tmp34, 0
+  br i1 %tmp35, label %bb44, label %bb36
+
+bb36:                                             ; preds = %bb
+  %tmp37 = mul nsw i32 %tmp20, %tmp18
+  %tmp38 = add nsw i32 %tmp37, %tmp16
+  %tmp39 = mul nsw i32 %tmp24, %tmp22
+  %tmp40 = add nsw i32 %tmp38, %tmp39
+  %tmp41 = mul nsw i32 %tmp28, %tmp26
+  %tmp42 = add nsw i32 %tmp40, %tmp41
+  %tmp43 = add nsw i32 %tmp42, %tmp30
+  br label %bb52
+
+bb44:                                             ; preds = %bb
+  %tmp45 = mul nsw i32 %tmp18, %tmp16
+  %tmp46 = mul nsw i32 %tmp22, %tmp20
+  %tmp47 = add nsw i32 %tmp46, %tmp45
+  %tmp48 = mul nsw i32 %tmp26, %tmp24
+  %tmp49 = add nsw i32 %tmp47, %tmp48
+  %tmp50 = mul nsw i32 %tmp30, %tmp28
+  %tmp51 = add nsw i32 %tmp49, %tmp50
+  br label %bb52
+
+bb52:                                             ; preds = %bb44, %bb36
+  %tmp53 = phi i32 [ %tmp43, %bb36 ], [ %tmp51, %bb44 ]
+  %tmp54 = mul nsw i32 %tmp16, %tmp14
+  %tmp55 = mul nsw i32 %tmp22, %tmp18
+  %tmp56 = mul nsw i32 %tmp24, %tmp20
+  %tmp57 = mul nsw i32 %tmp30, %tmp26
+  %tmp58 = add i32 %tmp55, %tmp54
+  %tmp59 = add i32 %tmp58, %tmp56
+  %tmp60 = add i32 %tmp59, %tmp28
+  %tmp61 = add i32 %tmp60, %tmp57
+  %tmp62 = add i32 %tmp61, %tmp53
+  store i32 %tmp62, i32 addrspace(1)* %tmp33, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
author	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
	Fri, 20 Jan 2017 00:44:31 +0000 (00:44 +0000)
committer	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
	Fri, 20 Jan 2017 00:44:31 +0000 (00:44 +0000)
include/llvm/Target/TargetInstrInfo.h		patch \| blob \| history
lib/CodeGen/MachineBasicBlock.cpp		patch \| blob \| history
lib/Target/AMDGPU/SIInstrInfo.cpp		patch \| blob \| history
lib/Target/AMDGPU/SIInstrInfo.h		patch \| blob \| history
test/CodeGen/AMDGPU/spill-cfg-position.ll	[new file with mode: 0644]	patch \| blob