[AMDGPU] Allow hoisting of comparisons out of a loop and eliminate condition copies

author Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>

Mon, 28 Nov 2016 18:58:49 +0000 (18:58 +0000)

committer Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>

Mon, 28 Nov 2016 18:58:49 +0000 (18:58 +0000)
author Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Mon, 28 Nov 2016 18:58:49 +0000 (18:58 +0000)
committer Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Mon, 28 Nov 2016 18:58:49 +0000 (18:58 +0000)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

index b4a7a65386de12c8a4b4db46b4185bdd61f5667a..93dcd728a0c66b066f082918a2a1a364339fd30c 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -440,6 +440,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
  
    setSchedulingPreference(Sched::RegPressure);
    setJumpIsExpensive(true);
+  setHasMultipleConditionRegisters(true);
  
    // SI at least has hardware support for floating point exceptions, but no way
    // of using or handling them is implemented. They are also optional in OpenCL
diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp

index 9f1178c40abd340babcde71198a96be5c41df43c..7ed18f27e5912ad2ecdf83330723d3a96796aa23 100644 (file)
--- a/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -80,6 +80,11 @@ private:
    void emitLoop(MachineInstr &MI);
    void emitEndCf(MachineInstr &MI);
  
+  void findMaskOperands(MachineInstr &MI, unsigned OpNo,
+                        SmallVectorImpl<MachineOperand> &Src) const;
+
+  void combineMasks(MachineInstr &MI);
+
  public:
    static char ID;
  
@@ -336,6 +341,62 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
      LIS->handleMove(*NewMI);
  }
  
+// Returns replace operands for a logical operation, either single result
+// for exec or two operands if source was another equivalent operation.
+void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
+       SmallVectorImpl<MachineOperand> &Src) const {
+  MachineOperand &Op = MI.getOperand(OpNo);
+  if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) {
+    Src.push_back(Op);
+    return;
+  }
+
+  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
+  if (!Def || Def->getParent() != MI.getParent() ||
+      !(Def->isFullCopy() || (Def->getOpcode() == MI.getOpcode())))
+    return;
+
+  // Make sure we do not modify exec between def and use.
+  // A copy with implcitly defined exec inserted earlier is an exclusion, it
+  // does not really modify exec.
+  for (auto I = Def->getIterator(); I != MI.getIterator(); ++I)
+    if (I->modifiesRegister(AMDGPU::EXEC, TRI) &&
+        !(I->isCopy() && I->getOperand(0).getReg() != AMDGPU::EXEC))
+      return;
+
+  for (const auto &SrcOp : Def->explicit_operands())
+    if (SrcOp.isUse() && (!SrcOp.isReg() ||
+        TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) ||
+        SrcOp.getReg() == AMDGPU::EXEC))
+      Src.push_back(SrcOp);
+}
+
+// Search and combine pairs of equivalent instructions, like
+// S_AND_B64 x, (S_AND_B64 x, y) => S_AND_B64 x, y
+// S_OR_B64  x, (S_OR_B64  x, y) => S_OR_B64  x, y
+// One of the operands is exec mask.
+void SILowerControlFlow::combineMasks(MachineInstr &MI) {
+  assert(MI.getNumExplicitOperands() == 3);
+  SmallVector<MachineOperand, 4> Ops;
+  unsigned OpToReplace = 1;
+  findMaskOperands(MI, 1, Ops);
+  if (Ops.size() == 1) OpToReplace = 2; // First operand can be exec or its copy
+  findMaskOperands(MI, 2, Ops);
+  if (Ops.size() != 3) return;
+
+  unsigned UniqueOpndIdx;
+  if (Ops[0].isIdenticalTo(Ops[1])) UniqueOpndIdx = 2;
+  else if (Ops[0].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
+  else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
+  else return;
+
+  unsigned Reg = MI.getOperand(OpToReplace).getReg();
+  MI.RemoveOperand(OpToReplace);
+  MI.addOperand(Ops[UniqueOpndIdx]);
+  if (MRI->use_empty(Reg))
+    MRI->getUniqueVRegDef(Reg)->eraseFromParent();
+}
+
  bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
    const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
    TII = ST.getInstrInfo();
@@ -351,9 +412,9 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
      NextBB = std::next(BI);
      MachineBasicBlock &MBB = *BI;
  
-    MachineBasicBlock::iterator I, Next;
+    MachineBasicBlock::iterator I, Next, Last;
  
-    for (I = MBB.begin(); I != MBB.end(); I = Next) {
+    for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) {
        Next = std::next(I);
        MachineInstr &MI = *I;
  
@@ -386,9 +447,20 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
          emitEndCf(MI);
          break;
  
+      case AMDGPU::S_AND_B64:
+      case AMDGPU::S_OR_B64:
+        // Cleanup bit manipulations on exec mask
+        combineMasks(MI);
+        Last = I;
+        continue;
+
        default:
-        break;
+        Last = I;
+        continue;
        }
+
+      // Replay newly inserted code to combine masks
+      Next = (Last == MBB.end()) ? MBB.begin() : Last;
      }
    }
  
diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp

index 9e62980940be5f67f9629660fa478aa8c4ee6424..be2e14fd462370cc3296f4dc08cb241c88b7fc19 100644 (file)
--- a/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -100,12 +100,12 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
        const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg());
        const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg());
  
+      DebugLoc DL = MI.getDebugLoc();
+      MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
        if (DstRC == &AMDGPU::VReg_1RegClass &&
            TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
          I1Defs.push_back(Dst.getReg());
-        DebugLoc DL = MI.getDebugLoc();
  
-        MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
          if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) {
            if (DefInst->getOperand(1).isImm()) {
              I1Defs.push_back(Dst.getReg());
@@ -129,10 +129,26 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
          MI.eraseFromParent();
        } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
                   SrcRC == &AMDGPU::VReg_1RegClass) {
-        BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_U32_e64))
-          .addOperand(Dst)
-          .addOperand(Src)
-          .addImm(0);
+        if (DefInst->getOpcode() == AMDGPU::V_CNDMASK_B32_e64 &&
+            DefInst->getOperand(1).isImm() && DefInst->getOperand(2).isImm() &&
+            DefInst->getOperand(1).getImm() == 0 &&
+            DefInst->getOperand(2).getImm() != 0 &&
+            DefInst->getOperand(3).isReg() &&
+            TargetRegisterInfo::isVirtualRegister(
+              DefInst->getOperand(3).getReg()) &&
+            TRI->getCommonSubClass(
+              MRI.getRegClass(DefInst->getOperand(3).getReg()),
+              &AMDGPU::SGPR_64RegClass)) {
+          BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64))
+            .addOperand(Dst)
+            .addReg(AMDGPU::EXEC)
+            .addOperand(DefInst->getOperand(3));
+        } else {
+          BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64))
+            .addOperand(Dst)
+            .addOperand(Src)
+            .addImm(0);
+        }
          MI.eraseFromParent();
        }
      }
diff --git a/test/CodeGen/AMDGPU/branch-relaxation.ll b/test/CodeGen/AMDGPU/branch-relaxation.ll

index e12a0b798ee39e16fa124f361c5f5a15ce0773f9..c8467bb72a60828441fee3da6a49a5179d1410c2 100644 (file)
--- a/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -493,9 +493,9 @@ ret:
  ; GCN: s_setpc_b64
  
  ; GCN: [[LONG_BR_DEST0]]
-; GCN: s_cmp_eq_u32
+; GCN: v_cmp_ne_u32_e32
  ; GCN-NEXT: ; implicit-def
-; GCN-NEXT: s_cbranch_scc0
+; GCN-NEXT: s_cbranch_vccz
  ; GCN: s_setpc_b64
  
  ; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/hoist-cond.ll b/test/CodeGen/AMDGPU/hoist-cond.ll

new file mode 100644 (file)

index 0000000..6831f22
--- /dev/null
+++ b/test/CodeGen/AMDGPU/hoist-cond.ll
@@ -0,0 +1,46 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+
+; Check that invariant compare is hoisted out of the loop.
+; At the same time condition shall not be serialized into a VGPR and deserialized later
+; using another v_cmp + v_cndmask, but used directly in s_and_saveexec_b64.
+
+; CHECK: v_cmp_{{..}}_u32_e64 [[COND:s\[[0-9]+:[0-9]+\]]]
+; CHECK: BB0_1:
+; CHECK-NOT: v_cmp
+; CHECK-NOT: v_cndmask
+; CHECK: s_and_saveexec_b64 s[{{[[0-9]+:[0-9]+}}], [[COND]]
+; CHECK: BB0_2:
+
+; Kernel with a loop (bb1 -> bb3 -> bb1) whose inner branch condition %tmp5 is
+; computed once in the entry block from the workitem id and %arg3.
+define amdgpu_kernel void @hoist_cond(float addrspace(1)* nocapture %arg, float addrspace(1)* noalias nocapture readonly %arg1, i32 %arg3, i32 %arg4) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
+  %tmp5 = icmp ult i32 %tmp, %arg3
+  br label %bb1
+
+; Loop header: %tmp7 is the down counter starting at %arg4, %tmp8 the running
+; float sum. Branches on the condition computed in the entry block.
+bb1:                                              ; preds = %bb3, %bb
+  %tmp7 = phi i32 [ %arg4, %bb ], [ %tmp16, %bb3 ]
+  %tmp8 = phi float [ 0.000000e+00, %bb ], [ %tmp15, %bb3 ]
+  br i1 %tmp5, label %bb2, label %bb3
+
+; Taken only when %tmp5 is true: loads %arg1[%tmp7].
+bb2:                                              ; preds = %bb1
+  %tmp10 = zext i32 %tmp7 to i64
+  %tmp11 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 %tmp10
+  %tmp12 = load float, float addrspace(1)* %tmp11, align 4
+  br label %bb3
+
+; Loop latch: adds the loaded value (or 0.0 if bb2 was skipped) to the sum,
+; decrements the counter and exits once it reaches zero.
+bb3:                                             ; preds = %bb2, %bb1
+  %tmp14 = phi float [ %tmp12, %bb2 ], [ 0.000000e+00, %bb1 ]
+  %tmp15 = fadd float %tmp8, %tmp14
+  %tmp16 = add i32 %tmp7, -1
+  %tmp17 = icmp eq i32 %tmp16, 0
+  br i1 %tmp17, label %bb4, label %bb1
+
+; Exit: stores the final sum to %arg.
+bb4:                                             ; preds = %bb3
+  store float %tmp15, float addrspace(1)* %arg, align 4
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
author	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
	Mon, 28 Nov 2016 18:58:49 +0000 (18:58 +0000)
committer	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
	Mon, 28 Nov 2016 18:58:49 +0000 (18:58 +0000)
lib/Target/AMDGPU/AMDGPUISelLowering.cpp		patch \| blob \| history
lib/Target/AMDGPU/SILowerControlFlow.cpp		patch \| blob \| history
lib/Target/AMDGPU/SILowerI1Copies.cpp		patch \| blob \| history
test/CodeGen/AMDGPU/branch-relaxation.ll		patch \| blob \| history
test/CodeGen/AMDGPU/hoist-cond.ll	[new file with mode: 0644]	patch \| blob