AMDGPU : Fix common dominator of two incoming blocks terminates with uniform branch...

author Wei Ding <wei.ding2@amd.com>

Wed, 12 Apr 2017 23:51:47 +0000 (23:51 +0000)

committer Wei Ding <wei.ding2@amd.com>

Wed, 12 Apr 2017 23:51:47 +0000 (23:51 +0000)
author Wei Ding <wei.ding2@amd.com>
Wed, 12 Apr 2017 23:51:47 +0000 (23:51 +0000)
committer Wei Ding <wei.ding2@amd.com>
Wed, 12 Apr 2017 23:51:47 +0000 (23:51 +0000)
diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

index 34cd6f704a12f51e0fb00bb3c8899c6784ce2eff..f9d258f44a622a6de9ee367d10e3dd8e060e3460 100644 (file)
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -65,6 +65,7 @@
  /// ultimately led to the creation of an illegal COPY.
  //===----------------------------------------------------------------------===//
  
+#include "llvm/ADT/DenseSet.h"
  #include "AMDGPU.h"
  #include "AMDGPUSubtarget.h"
  #include "SIInstrInfo.h"
@@ -331,6 +332,27 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
    return true;
  }
  
+static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
+                               const TargetRegisterInfo *TRI) {
+  DenseSet<MachineBasicBlock*> Visited;
+  SmallVector<MachineBasicBlock*, 4> Worklist(MBB->pred_begin(), 
+                                              MBB->pred_end());
+
+  while (!Worklist.empty()) {
+    MachineBasicBlock *mbb = Worklist.back();
+    Worklist.pop_back();
+
+    if (!Visited.insert(mbb).second)
+      continue;
+    if (hasTerminatorThatModifiesExec(*mbb, *TRI))
+      return true;
+
+    Worklist.insert(Worklist.end(), mbb->pred_begin(), mbb->pred_end());
+  }
+
+  return false;
+}
+
  bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
    const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
    MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -387,8 +409,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
            MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB();
            MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB();
  
-          MachineBasicBlock *NCD = MDT->findNearestCommonDominator(MBB0, MBB1);
-          if (NCD && !hasTerminatorThatModifiesExec(*NCD, *TRI)) {
+          if (!predsHasDivergentTerminator(MBB0, TRI) &&
+              !predsHasDivergentTerminator(MBB1, TRI)) {
              DEBUG(dbgs() << "Not fixing PHI for uniform branch: " << MI << '\n');
              break;
            }
diff --git a/test/CodeGen/AMDGPU/loop_break.ll b/test/CodeGen/AMDGPU/loop_break.ll

index 492472155ee6bce776ab86718c782076e9abbb94..b9df2cb779ad0a793e906f570f9dde2888d08a61 100644 (file)
--- a/test/CodeGen/AMDGPU/loop_break.ll
+++ b/test/CodeGen/AMDGPU/loop_break.ll
@@ -27,8 +27,9 @@
  
  ; GCN: [[LOOP_ENTRY:BB[0-9]+_[0-9]+]]: ; %bb1
  ; GCN: s_or_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[INITMASK]]
-; GCN: s_cmp_gt_i32 s{{[0-9]+}}, -1
-; GCN-NEXT: s_cbranch_scc1 [[FLOW:BB[0-9]+_[0-9]+]]
+; GCN: v_cmp_lt_i32_e32 vcc, -1
+; GCN: s_and_b64 vcc, exec, vcc
+; GCN-NEXT: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]]
  
  ; GCN: ; BB#2: ; %bb4
  ; GCN: buffer_load_dword
diff --git a/test/CodeGen/AMDGPU/sgprcopies.ll b/test/CodeGen/AMDGPU/sgprcopies.ll

new file mode 100644 (file)

index 0000000..68cd83b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sgprcopies.ll
@@ -0,0 +1,58 @@
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}checkTwoBlocksWithUniformBranch
+; GCN: BB0_2
+; GCN: v_add
+define amdgpu_kernel void @checkTwoBlocksWithUniformBranch(i32 addrspace(1)* nocapture %out, i32 %width, float %xPos, float %yPos, float %xStep, float %yStep, i32 %maxIter) {
+entry:                                            ; derives %x/%y from the workitem id, then either enters the loop or skips to %for.end
+  %conv = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %rem = urem i32 %conv, %width
+  %div = udiv i32 %conv, %width
+  %conv1 = sitofp i32 %rem to float
+  %x = tail call float @llvm.fmuladd.f32(float %xStep, float %conv1, float %xPos)
+  %conv2 = sitofp i32 %div to float
+  %y = tail call float @llvm.fmuladd.f32(float %yStep, float %conv2, float %yPos)
+  %yy = fmul float %y, %y
+  %xy = tail call float @llvm.fmuladd.f32(float %x, float %x, float %yy)
+  %cmp01 = fcmp ole float %xy, 4.000000e+00
+  %cmp02 = icmp ne i32 %maxIter, 0
+  %cond01 = and i1 %cmp02, %cmp01
+  br i1 %cond01, label %for.body.preheader, label %for.end ; %cond01 depends on %conv (workitem id) through %xy, and on %maxIter
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %x_val = phi float [ %call8, %for.body ], [ %x, %for.body.preheader ]
+  %iter_val = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %y_val = phi float [ %call9, %for.body ], [ %y, %for.body.preheader ]
+  %sub = fsub float -0.000000e+00, %y_val
+  %call7 = tail call float @llvm.fmuladd.f32(float %x_val, float %x_val, float %x) #1
+  %call8 = tail call float @llvm.fmuladd.f32(float %sub, float %y_val, float %call7) #1
+  %mul = fmul float %x_val, 2.000000e+00
+  %call9 = tail call float @llvm.fmuladd.f32(float %mul, float %y_val, float %y) #1
+  %inc = add nuw i32 %iter_val, 1
+  %mul3 = fmul float %call9, %call9
+  %0 = tail call float @llvm.fmuladd.f32(float %call8, float %call8, float %mul3)
+  %cmp = fcmp ole float %0, 4.000000e+00
+  %cmp5 = icmp ult i32 %inc, %maxIter
+  %or.cond = and i1 %cmp5, %cmp
+  br i1 %or.cond, label %for.body, label %for.end.loopexit ; back-edge taken while %inc < %maxIter (unsigned) and %0 <= 4.0
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %iter.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %for.end.loopexit ] ; 0 if the loop was skipped, otherwise the last %inc
+  %idxprom = ashr exact i32 %conv, 32 ; NOTE(review): shift amount equals the i32 bit width; LangRef gives no defined result for this - confirm it is intended
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %idxprom
+  store i32 %iter.0.lcssa, i32 addrspace(1)* %arrayidx, align 4 ; writes the iteration count to %out[%idxprom]
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare float @llvm.fmuladd.f32(float, float, float) #1
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { readnone }
diff --git a/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll b/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll

index e0067f9f45acc5988de1b067085b2f41335e388c..8a08f9d8bb0d78964c0c24f7f4927547ee9fba70 100644 (file)
--- a/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
+++ b/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
@@ -7,11 +7,11 @@
  ; CHECK: s_and_saveexec_b64
  ; CHECK-NEXT: s_xor_b64
  ; CHECK-NEXT: ; mask branch
-
+; CHECK-NEXT: s_cbranch_execz BB{{[0-9]+_[0-9]+}}
  ; CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %loop_body.preheader
  
  ; CHECK: [[LOOP_BODY_LABEL:BB[0-9]+_[0-9]+]]:
-; CHECK: s_cbranch_scc0 [[LOOP_BODY_LABEL]]
+; CHECK: s_cbranch_vccz [[LOOP_BODY_LABEL]]
  
  ; CHECK: s_endpgm
  define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <2 x i32> %addr.base, i32 %y, i32 %p) {
diff --git a/test/CodeGen/AMDGPU/valu-i1.ll b/test/CodeGen/AMDGPU/valu-i1.ll

index aad260c3e3690a264914d59274105737343a07d5..85a8929ebe5869cf40d4d6a83787ff8381d4d69c 100644 (file)
--- a/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/test/CodeGen/AMDGPU/valu-i1.ll
@@ -172,8 +172,8 @@ exit:
  ; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
  ; SI: buffer_load_dword
  ; SI-DAG: buffer_store_dword
-; SI-DAG: s_cmpk_eq_i32 s{{[0-9]+}}, 0x100
-; SI: s_cbranch_scc0 [[LABEL_LOOP]]
+; SI-DAG: v_cmp_eq_u32_e32 vcc, 0x100
+; SI: s_cbranch_vccz [[LABEL_LOOP]]
  ; SI: [[LABEL_EXIT]]:
  ; SI: s_endpgm
author	Wei Ding <wei.ding2@amd.com>
	Wed, 12 Apr 2017 23:51:47 +0000 (23:51 +0000)
committer	Wei Ding <wei.ding2@amd.com>
	Wed, 12 Apr 2017 23:51:47 +0000 (23:51 +0000)
lib/Target/AMDGPU/SIFixSGPRCopies.cpp		patch \| blob \| history
test/CodeGen/AMDGPU/loop_break.ll		patch \| blob \| history
test/CodeGen/AMDGPU/sgprcopies.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll		patch \| blob \| history
test/CodeGen/AMDGPU/valu-i1.ll		patch \| blob \| history