AMDGPU: Move v_readlane lane select from VGPR to SGPR

author Nicolai Haehnle <nhaehnle@gmail.com>

Mon, 24 Apr 2017 17:17:36 +0000 (17:17 +0000)

committer Nicolai Haehnle <nhaehnle@gmail.com>

Mon, 24 Apr 2017 17:17:36 +0000 (17:17 +0000)
author Nicolai Haehnle <nhaehnle@gmail.com>
Mon, 24 Apr 2017 17:17:36 +0000 (17:17 +0000)
committer Nicolai Haehnle <nhaehnle@gmail.com>
Mon, 24 Apr 2017 17:17:36 +0000 (17:17 +0000)
diff --git a/include/llvm/IR/IntrinsicsAMDGPU.td b/include/llvm/IR/IntrinsicsAMDGPU.td

index 5415c6b0d1518f0ea9ff8baff9b7a3279a51832c..21d8a15e7e7a963a0d0e0fbc95eff28377a5ba4d 100644 (file)
--- a/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -629,6 +629,8 @@ def int_amdgcn_readfirstlane :
    GCCBuiltin<"__builtin_amdgcn_readfirstlane">,
    Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
  
+// The lane argument must be uniform across the currently active threads of the
+// current wave. Otherwise, the result is undefined.
  def int_amdgcn_readlane :
    GCCBuiltin<"__builtin_amdgcn_readlane">,
    Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp

index d51110bcbd609b2b6f7ffd791a139272e56c9c33..c5af8a1ad925ef44ec21e25fde2687bb9ad7e604 100644 (file)
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2640,6 +2640,19 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
    if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
      return;
  
+  // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
+  // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
+  // select is uniform.
+  if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
+      RI.isVGPR(MRI, Src1.getReg())) {
+    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+    const DebugLoc &DL = MI.getDebugLoc();
+    BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+        .add(Src1);
+    Src1.ChangeToRegister(Reg, false);
+    return;
+  }
+
    // We do not use commuteInstruction here because it is too aggressive and will
    // commute if it is possible. We only want to commute here if it improves
    // legality. This can be called a fairly large number of times so don't waste
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll

index 5e892fad3741bf10863e65890e7757ade16af6be..cbd8f0a9c23a3bcd41e5d5afd9d0f9a1b02960e0 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -19,6 +19,20 @@ define amdgpu_kernel void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %s
    ret void
  }
  
+; CHECK-LABEL: {{^}}test_readlane_vregs:
+; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}}
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, [[LANE]]
+define amdgpu_kernel void @test_readlane_vregs(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.in = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 %tid
+  %args = load <2 x i32>, <2 x i32> addrspace(1)* %gep.in
+  %value = extractelement <2 x i32> %args, i32 0
+  %lane = extractelement <2 x i32> %args, i32 1
+  %readlane = call i32 @llvm.amdgcn.readlane(i32 %value, i32 %lane)
+  store i32 %readlane, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
  ; TODO: m0 should be folded.
  ; CHECK-LABEL: {{^}}test_readlane_m0_sreg:
  ; CHECK: s_mov_b32 m0, -1
@@ -40,5 +54,8 @@ define amdgpu_kernel void @test_readlane_imm(i32 addrspace(1)* %out, i32 %src0)
    ret void
  }
  
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+
  attributes #0 = { nounwind readnone convergent }
  attributes #1 = { nounwind }
+attributes #2 = { nounwind readnone }
author	Nicolai Haehnle <nhaehnle@gmail.com>
	Mon, 24 Apr 2017 17:17:36 +0000 (17:17 +0000)
committer	Nicolai Haehnle <nhaehnle@gmail.com>
	Mon, 24 Apr 2017 17:17:36 +0000 (17:17 +0000)
include/llvm/IR/IntrinsicsAMDGPU.td		patch \| blob \| history
lib/Target/AMDGPU/SIInstrInfo.cpp		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll		patch \| blob \| history