[LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;
+// Given a value, copies it while setting all the inactive lanes to a given
+// value. Note that OpenGL helper lanes are considered active, so if the
+// program ever uses WQM, then this instruction and its first source are
+// computed in WQM.
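+// For example (a sketch; the .i32 suffix follows the overloaded type):
+//   %v = call i32 @llvm.amdgcn.set.inactive.i32(i32 %live, i32 0)
+// yields %live in every active lane and 0 in every inactive lane.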
+def int_amdgcn_set_inactive :
+ Intrinsic<[llvm_anyint_ty],
+ [LLVMMatchType<0>, // value to be copied
+ LLVMMatchType<0>], // value for the inactive lanes to take
+ [IntrNoMem, IntrConvergent]>;
+
//===----------------------------------------------------------------------===//
// CI+ Intrinsics
//===----------------------------------------------------------------------===//
MI.eraseFromParent();
break;
}
+ case AMDGPU::V_SET_INACTIVE_B32: {
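+ // Flip exec so that only the previously inactive lanes are live, write the
+ // requested inactive value (operand 2) into them, then flip exec back.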
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC);
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
+ .add(MI.getOperand(2));
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC);
+ MI.eraseFromParent();
+ break;
+ }
+ case AMDGPU::V_SET_INACTIVE_B64: {
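+ // Same trick as the B32 case, except that V_MOV_B64_PSEUDO is itself a
+ // pseudo and is expanded in place between the two exec flips.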
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC);
+ MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
+ MI.getOperand(0).getReg())
+ .add(MI.getOperand(2));
+ expandPostRAPseudo(*Copy);
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC);
+ MI.eraseFromParent();
+ break;
+ }
case AMDGPU::V_MOVRELD_B32_V1:
case AMDGPU::V_MOVRELD_B32_V2:
case AMDGPU::V_MOVRELD_B32_V4:
let mayStore = 0;
}
+// Invert the exec mask, overwrite the inactive lanes of $vdst with $inactive,
+// and restore the original exec mask once we're done.
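+// The post-RA expansion is roughly
+//   s_not_b64 exec, exec
+//   v_mov_b32 $vdst, $inactive
+//   s_not_b64 exec, exec
+// as checked by the accompanying tests.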
+def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
+ (ins VGPR_32:$src, VSrc_b32:$inactive),
+ [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> {
+ let Constraints = "$src = $vdst";
+}
+
+def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
+ (ins VReg_64:$src, VSrc_b64:$inactive),
+ [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> {
+ let Constraints = "$src = $vdst";
+}
+
let usesCustomInserter = 1, SALU = 1 in {
def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
[(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
std::vector<WorkItem> &Worklist) {
char GlobalFlags = 0;
bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs");
+ SmallVector<MachineInstr *, 4> SetInactiveInstrs;
// We need to visit the basic blocks in reverse post-order so that we visit
// defs before uses, in particular so that we don't accidentally mark an
GlobalFlags |= StateWWM;
LowerToCopyInstrs.push_back(&MI);
continue;
+ } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
+ Opcode == AMDGPU::V_SET_INACTIVE_B64) {
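+ // set.inactive needs the real exec mask to tell active lanes from inactive
+ // ones, so the instruction itself must never be moved into WWM.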
+ III.Disabled = StateWWM;
+ MachineOperand &Inactive = MI.getOperand(2);
+ if (Inactive.isReg()) {
+ if (Inactive.isUndef()) {
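+ // The inactive lanes may take any value, so a plain copy suffices.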
+ LowerToCopyInstrs.push_back(&MI);
+ } else {
+ unsigned Reg = Inactive.getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
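+ // The inactive value must be available in the inactive lanes as well,
+ // so its defs have to be computed in WWM.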
+ for (MachineInstr &DefMI : MRI->def_instructions(Reg))
+ markInstruction(DefMI, StateWWM, Worklist);
+ }
+ }
+ }
+ SetInactiveInstrs.push_back(&MI);
+ continue;
} else if (TII->isDisableWQM(MI)) {
BBI.Needs |= StateExact;
if (!(BBI.InNeeds & StateExact)) {
}
}
+ // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
+ // ever used anywhere in the function. This implements the corresponding
+ // semantics of @llvm.amdgcn.set.inactive.
+ if (GlobalFlags & StateWQM) {
+ for (MachineInstr *MI : SetInactiveInstrs)
+ markInstruction(*MI, StateWQM, Worklist);
+ }
+
return GlobalFlags;
}
}
void SIWholeQuadMode::lowerCopyInstrs() {
- for (MachineInstr *MI : LowerToCopyInstrs)
+ for (MachineInstr *MI : LowerToCopyInstrs) {
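+ // COPY takes a single source, so drop the extra operands (such as the
+ // $inactive operand of V_SET_INACTIVE_*) before rewriting the opcode.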
+ for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
+ MI->RemoveOperand(i);
MI->setDesc(TII->get(AMDGPU::COPY));
+ }
}
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
--- /dev/null
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}set_inactive:
+; GCN: s_not_b64 exec, exec
+; GCN: v_mov_b32_e32 {{v[0-9]+}}, 42
+; GCN: s_not_b64 exec, exec
+define amdgpu_kernel void @set_inactive(i32 addrspace(1)* %out, i32 %in) {
+ %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
+ store i32 %tmp, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}set_inactive_64:
+; GCN: s_not_b64 exec, exec
+; GCN: v_mov_b32_e32 {{v[0-9]+}}, 0
+; GCN: v_mov_b32_e32 {{v[0-9]+}}, 0
+; GCN: s_not_b64 exec, exec
+define amdgpu_kernel void @set_inactive_64(i64 addrspace(1)* %out, i64 %in) {
+ %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
+ store i64 %tmp, i64 addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
+declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0
+
+attributes #0 = { convergent readnone }
ret float %out.1
}
+; Check that @llvm.amdgcn.set.inactive is not itself computed in WWM, even
+; when its result feeds a WWM computation.
+;
+;CHECK-LABEL: {{^}}test_set_inactive1:
+;CHECK: buffer_load_dword
+;CHECK: s_not_b64 exec, exec
+;CHECK: v_mov_b32_e32
+;CHECK: s_not_b64 exec, exec
+;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
+;CHECK: v_add_i32_e32
+define amdgpu_ps void @test_set_inactive1(i32 inreg %idx) {
+main_body:
+ %src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
+ %src.0 = bitcast float %src to i32
+ %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
+ %out = add i32 %src.1, %src.1
+ %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
+ %out.1 = bitcast i32 %out.0 to float
+ call void @llvm.amdgcn.buffer.store.f32(float %out.1, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
+ ret void
+}
+
+; Check that enabling WQM anywhere enables WQM for the set.inactive source.
+;
+;CHECK-LABEL: {{^}}test_set_inactive2:
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+ %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+ %src1.0 = bitcast float %src1 to i32
+ %src1.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src1.0, i32 undef)
+ %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+ %src0.0 = bitcast float %src0 to i32
+ %src0.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %src0.0)
+ %out = add i32 %src0.1, %src1.1
+ %out.0 = bitcast i32 %out to float
+ call void @llvm.amdgcn.buffer.store.f32(float %out.0, <4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+ ret void
+}
+
; Check a case of one branch of an if-else requiring WQM, the other requiring
; exact.
;
; CHECK: s_wqm_b64 exec, exec
; CHECK: v_add_f32_e32 v0,
; CHECK: s_and_b64 exec, exec, [[ORIG]]
-define amdgpu_ps float @test_prolog_1(float %a, float %b) #4 {
+define amdgpu_ps float @test_prolog_1(float %a, float %b) #5 {
main_body:
%s = fadd float %a, %b
ret float %s
declare i32 @llvm.amdgcn.wqm.i32(i32) #3
declare float @llvm.amdgcn.wwm.f32(float) #3
declare i32 @llvm.amdgcn.wwm.i32(i32) #3
+declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
attributes #3 = { nounwind readnone }
-attributes #4 = { "amdgpu-ps-wqm-outputs" }
+attributes #4 = { nounwind readnone convergent }
+attributes #5 = { "amdgpu-ps-wqm-outputs" }