[AMDGPU] Packed thread ids in function call ABI

author Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>

Fri, 28 Jun 2019 01:52:13 +0000 (01:52 +0000)

committer Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>

Fri, 28 Jun 2019 01:52:13 +0000 (01:52 +0000)
author Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Fri, 28 Jun 2019 01:52:13 +0000 (01:52 +0000)
committer Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Fri, 28 Jun 2019 01:52:13 +0000 (01:52 +0000)
diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp

index 81c3356ab9ef3ebb7d5e20accc3d01b691795e32..99a01ca3a2fda20b9cf88f8d66f4d6e8af527b84 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -9,6 +9,7 @@
  #include "AMDGPU.h"
  #include "AMDGPUArgumentUsageInfo.h"
  #include "SIRegisterInfo.h"
+#include "llvm/Support/NativeFormatting.h"
  #include "llvm/Support/raw_ostream.h"
  
  using namespace llvm;
@@ -26,9 +27,16 @@ void ArgDescriptor::print(raw_ostream &OS,
    }
  
    if (isRegister())
-    OS << "Reg " << printReg(getRegister(), TRI) << '\n';
+    OS << "Reg " << printReg(getRegister(), TRI);
    else
-    OS << "Stack offset " << getStackOffset() << '\n';
+    OS << "Stack offset " << getStackOffset();
+
+  if (isMasked()) {
+    OS << " & ";
+    llvm::write_hex(OS, Mask, llvm::HexPrintStyle::PrefixLower);
+  }
+
+  OS << '\n';
  }
  
  char AMDGPUArgumentUsageInfo::ID = 0;
diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h

index 277f3616031f251ef763af414e0146333b968d31..ab0024b50be18c1937c0c726f3b73a4c90f829e2 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -32,18 +32,27 @@ private:
      unsigned StackOffset;
    };
  
+  // Bitmask to locate argument within the register.
+  unsigned Mask;
+
    bool IsStack : 1;
    bool IsSet : 1;
  
-  ArgDescriptor(unsigned Val = 0, bool IsStack = false, bool IsSet = false)
-    : Register(Val), IsStack(IsStack), IsSet(IsSet) {}
  public:
-  static ArgDescriptor createRegister(unsigned Reg) {
-    return ArgDescriptor(Reg, false, true);
+  ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
+                bool IsStack = false, bool IsSet = false)
+    : Register(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
+
+  static ArgDescriptor createRegister(unsigned Reg, unsigned Mask = ~0u) {
+    return ArgDescriptor(Reg, Mask, false, true);
+  }
+
+  static ArgDescriptor createStack(unsigned Reg, unsigned Mask = ~0u) {
+    return ArgDescriptor(Reg, Mask, true, true);
    }
  
-  static ArgDescriptor createStack(unsigned Reg) {
-    return ArgDescriptor(Reg, true, true);
+  static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) {
+    return ArgDescriptor(Arg.Register, Mask, Arg.IsStack, Arg.IsSet);
    }
  
    bool isSet() const {
@@ -68,6 +77,14 @@ public:
      return StackOffset;
    }
  
+  unsigned getMask() const {
+    return Mask;
+  }
+
+  bool isMasked() const {
+    return Mask != ~0u;
+  }
+
    void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const;
  };
  
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

index d0af336a00b6b54a231cc8fc72169f55ab838bcd..766294dee2357cc38b6bef13b440c1fa7c7606d8 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4233,9 +4233,19 @@ SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
                                               const ArgDescriptor &Arg) const {
    assert(Arg && "Attempting to load missing argument");
  
-  if (Arg.isRegister())
-    return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL);
-  return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
+  SDValue V = Arg.isRegister() ?
+    CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
+    loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
+
+  if (!Arg.isMasked())
+    return V;
+
+  unsigned Mask = Arg.getMask();
+  unsigned Shift = countTrailingZeros<unsigned>(Mask);
+  V = DAG.getNode(ISD::SRL, SL, VT, V,
+                  DAG.getShiftAmountConstant(Shift, VT, SL));
+  return DAG.getNode(ISD::AND, SL, VT, V,
+                     DAG.getConstant(Mask >> Shift, SL, VT));
  }
  
  uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp

index 25000506f2ccbacdef2a1232081da57a011e2dfc..398f6887644a0e8a383f2ddbd462ccde77bfd01c 100644 (file)
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1585,7 +1585,13 @@ static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
  
  // Try to allocate a VGPR at the end of the argument list, or if no argument
  // VGPRs are left allocating a stack slot.
-static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
+// If \p Mask is given it indicates the bitfield position in the register.
+// If \p Arg is given, use it with the new \p Mask instead of allocating anew.
+static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
+                                         ArgDescriptor Arg = ArgDescriptor()) {
+  if (Arg.isSet())
+    return ArgDescriptor::createArg(Arg, Mask);
+
    ArrayRef<MCPhysReg> ArgVGPRs
      = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
    unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
@@ -1593,7 +1599,7 @@ static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
      // Spill to stack required.
      int64_t Offset = CCInfo.AllocateStack(4, 4);
  
-    return ArgDescriptor::createStack(Offset);
+    return ArgDescriptor::createStack(Offset, Mask);
    }
  
    unsigned Reg = ArgVGPRs[RegIdx];
@@ -1602,7 +1608,7 @@ static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
  
    MachineFunction &MF = CCInfo.getMachineFunction();
    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
-  return ArgDescriptor::createRegister(Reg);
+  return ArgDescriptor::createRegister(Reg, Mask);
  }
  
  static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
@@ -1634,14 +1640,21 @@ static void allocateSpecialInputVGPRs(CCState &CCInfo,
                                        MachineFunction &MF,
                                        const SIRegisterInfo &TRI,
                                        SIMachineFunctionInfo &Info) {
-  if (Info.hasWorkItemIDX())
-    Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
+  const unsigned Mask = 0x3ff;
+  ArgDescriptor Arg;
  
-  if (Info.hasWorkItemIDY())
-    Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
+  if (Info.hasWorkItemIDX()) {
+    Arg = allocateVGPR32Input(CCInfo, Mask);
+    Info.setWorkItemIDX(Arg);
+  }
+
+  if (Info.hasWorkItemIDY()) {
+    Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
+    Info.setWorkItemIDY(Arg);
+  }
  
    if (Info.hasWorkItemIDZ())
-    Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
+    Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
  }
  
  static void allocateSpecialInputSGPRs(CCState &CCInfo,
@@ -2387,9 +2400,6 @@ void SITargetLowering::passSpecialInputs(
      AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
-    AMDGPUFunctionArgInfo::WORKITEM_ID_X,
-    AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
-    AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
    };
  
@@ -2429,6 +2439,71 @@ void SITargetLowering::passSpecialInputs(
        MemOpChains.push_back(ArgStore);
      }
    }
+
+  // Pack workitem IDs into a single register or pass it as is if already
+  // packed.
+  const ArgDescriptor *OutgoingArg;
+  const TargetRegisterClass *ArgRC;
+
+  std::tie(OutgoingArg, ArgRC) =
+    CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
+  if (!OutgoingArg)
+    std::tie(OutgoingArg, ArgRC) =
+      CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
+  if (!OutgoingArg)
+    std::tie(OutgoingArg, ArgRC) =
+      CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
+  if (!OutgoingArg)
+    return;
+
+  const ArgDescriptor *IncomingArgX
+    = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X).first;
+  const ArgDescriptor *IncomingArgY
+    = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y).first;
+  const ArgDescriptor *IncomingArgZ
+    = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z).first;
+
+  SDValue InputReg;
+  SDLoc SL;
+
+  // If incoming ids are not packed we need to pack them.
+  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX)
+    InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
+
+  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY) {
+    SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
+    Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
+                    DAG.getShiftAmountConstant(10, MVT::i32, SL));
+    InputReg = InputReg.getNode() ?
+                 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
+  }
+
+  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ) {
+    SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
+    Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
+                    DAG.getShiftAmountConstant(20, MVT::i32, SL));
+    InputReg = InputReg.getNode() ?
+                 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
+  }
+
+  if (!InputReg.getNode()) {
+    // Workitem ids are already packed; any of the present incoming arguments
+    // will carry all required fields.
+    ArgDescriptor IncomingArg = ArgDescriptor::createArg(
+      IncomingArgX ? *IncomingArgX :
+      IncomingArgY ? *IncomingArgY :
+                     *IncomingArgZ, ~0u);
+    InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
+  }
+
+  if (OutgoingArg->isRegister()) {
+    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
+  } else {
+    unsigned SpecialArgOffset = CCInfo.AllocateStack(4, 4);
+    SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
+                                            SpecialArgOffset);
+    MemOpChains.push_back(ArgStore);
+  }
  }
  
  static bool canGuaranteeTCO(CallingConv::ID CC) {
diff --git a/test/CodeGen/AMDGPU/call-constexpr.ll b/test/CodeGen/AMDGPU/call-constexpr.ll

index 0ebf11dc72f5c18eb948b957dac19660c8b1a91e..3044ff806aa0309582d619819942538b56dac450 100644 (file)
--- a/test/CodeGen/AMDGPU/call-constexpr.ll
+++ b/test/CodeGen/AMDGPU/call-constexpr.ll
@@ -65,6 +65,7 @@ define amdgpu_kernel void @test_bitcast_argument_and_return_types() #0 {
  
  ; GCN-LABEL: {{^}}use_workitem_id_x:
  ; GCN: s_waitcnt
+; GCN-NEXT: v_and_b32_e32 v1, 0x3ff, v1
  ; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
  ; GCN-NEXT: s_setpc_b64
  define hidden i32 @use_workitem_id_x(i32 %arg0) #0 {
diff --git a/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll

index dbab269350a696a1fd753eb684f7e760e54ed34e..09bb3b40d4b825fa9a0581d8ce7c41f864a809d0 100644 (file)
--- a/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -1,8 +1,9 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
  
  ; GCN-LABEL: {{^}}use_workitem_id_x:
  ; GCN: s_waitcnt
-; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
+; GCN: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v0
+; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
  ; GCN-NEXT: s_waitcnt
  ; GCN-NEXT: s_setpc_b64
  define void @use_workitem_id_x() #1 {
@@ -13,7 +14,8 @@ define void @use_workitem_id_x() #1 {
  
  ; GCN-LABEL: {{^}}use_workitem_id_y:
  ; GCN: s_waitcnt
-; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
+; GCN: v_bfe_u32 [[ID:v[0-9]+]], v0, 10, 10
+; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
  ; GCN-NEXT: s_waitcnt
  ; GCN-NEXT: s_setpc_b64
  define void @use_workitem_id_y() #1 {
@@ -24,7 +26,8 @@ define void @use_workitem_id_y() #1 {
  
  ; GCN-LABEL: {{^}}use_workitem_id_z:
  ; GCN: s_waitcnt
-; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
+; GCN: v_bfe_u32 [[ID:v[0-9]+]], v0, 20, 10
+; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
  ; GCN-NEXT: s_waitcnt
  ; GCN-NEXT: s_setpc_b64
  define void @use_workitem_id_z() #1 {
@@ -35,8 +38,10 @@ define void @use_workitem_id_z() #1 {
  
  ; GCN-LABEL: {{^}}use_workitem_id_xy:
  ; GCN: s_waitcnt
-; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
-; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v1
+; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
+; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
+; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
+; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
  ; GCN-NEXT: s_waitcnt
  ; GCN-NEXT: s_setpc_b64
  define void @use_workitem_id_xy() #1 {
@@ -49,9 +54,12 @@ define void @use_workitem_id_xy() #1 {
  
  ; GCN-LABEL: {{^}}use_workitem_id_xyz:
  ; GCN: s_waitcnt
-; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
-; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v1
-; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v2
+; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
+; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
+; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
+; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
+; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
+; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
  ; GCN-NEXT: s_waitcnt
  ; GCN-NEXT: s_setpc_b64
  define void @use_workitem_id_xyz() #1 {
@@ -66,8 +74,10 @@ define void @use_workitem_id_xyz() #1 {
  
  ; GCN-LABEL: {{^}}use_workitem_id_xz:
  ; GCN: s_waitcnt
-; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
-; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v1
+; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
+; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
+; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
+; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
  ; GCN-NEXT: s_waitcnt
  ; GCN-NEXT: s_setpc_b64
  define void @use_workitem_id_xz() #1 {
@@ -80,8 +90,10 @@ define void @use_workitem_id_xz() #1 {
  
  ; GCN-LABEL: {{^}}use_workitem_id_yz:
  ; GCN: s_waitcnt
-; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
-; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v1
+; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
+; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
+; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
+; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
  ; GCN-NEXT: s_waitcnt
  ; GCN-NEXT: s_setpc_b64
  define void @use_workitem_id_yz() #1 {
@@ -108,7 +120,7 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_x() #1 {
  
  ; GCN-NOT: v0
  ; GCN-NOT: v1
-; GCN: v_mov_b32_e32 v0, v1
+; GCN: v_lshlrev_b32_e32 v0, 10, v1
  ; GCN-NOT: v0
  ; GCN-NOT: v1
  ; GCN: s_swappc_b64
@@ -122,15 +134,72 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_y() #1 {
  
  ; GCN-NOT: v0
  ; GCN-NOT: v2
-; GCN: v_mov_b32_e32 v0, v2
+; GCN: v_lshlrev_b32_e32 v0, 20, v2
  ; GCN-NOT: v0
-; GCN-NOT: v2
+; GCN-NOT: v1
  ; GCN: s_swappc_b64
  define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 {
    call void @use_workitem_id_z()
    ret void
  }
  
+; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xy:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
+; GCN: v_or_b32_e32 v0, v0, [[IDY]]
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 {
+  call void @use_workitem_id_xy()
+  ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xz:
+; GCN-NOT: v0
+; GCN-NOT: v2
+; GCN: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
+; GCN: v_or_b32_e32 v0, v0, [[IDZ]]
+; GCN-NOT: v0
+; GCN-NOT: v2
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 {
+  call void @use_workitem_id_xz()
+  ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_yz:
+; GCN-NOT: v1
+; GCN-NOT: v2
+; GCN-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
+; GCN-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
+; GCN: v_or_b32_e32 v0, [[IDY]], [[IDZ]]
+; GCN-NOT: v1
+; GCN-NOT: v2
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_use_workitem_id_yz() #1 {
+  call void @use_workitem_id_yz()
+  ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xyz:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN-NOT: v2
+; GCN-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
+; GCN-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
+; GCN-DAG: v_or_b32_e32 v0, v0, [[IDY]]
+; GCN-DAG: v_or_b32_e32 v0, v0, [[IDZ]]
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN-NOT: v2
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_use_workitem_id_xyz() #1 {
+  call void @use_workitem_id_xyz()
+  ret void
+}
+
  ; GCN-LABEL: {{^}}func_indirect_use_workitem_id_x:
  ; GCN-NOT: v0
  ; GCN: s_swappc_b64
@@ -160,8 +229,9 @@ define void @func_indirect_use_workitem_id_z() #1 {
  
  ; GCN-LABEL: {{^}}other_arg_use_workitem_id_x:
  ; GCN: s_waitcnt
-; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
-; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GCN-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v1
+; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
  define void @other_arg_use_workitem_id_x(i32 %arg0) #1 {
    %val = call i32 @llvm.amdgcn.workitem.id.x()
    store volatile i32 %arg0, i32 addrspace(1)* undef
@@ -171,8 +241,9 @@ define void @other_arg_use_workitem_id_x(i32 %arg0) #1 {
  
  ; GCN-LABEL: {{^}}other_arg_use_workitem_id_y:
  ; GCN: s_waitcnt
-; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
-; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 10, 10
+; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
  define void @other_arg_use_workitem_id_y(i32 %arg0) #1 {
    %val = call i32 @llvm.amdgcn.workitem.id.y()
    store volatile i32 %arg0, i32 addrspace(1)* undef
@@ -182,8 +253,9 @@ define void @other_arg_use_workitem_id_y(i32 %arg0) #1 {
  
  ; GCN-LABEL: {{^}}other_arg_use_workitem_id_z:
  ; GCN: s_waitcnt
-; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
-; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 20, 10
+; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
  define void @other_arg_use_workitem_id_z(i32 %arg0) #1 {
    %val = call i32 @llvm.amdgcn.workitem.id.z()
    store volatile i32 %arg0, i32 addrspace(1)* undef
@@ -207,6 +279,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_x() #1 {
  ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_y:
  ; GCN: enable_vgpr_workitem_id = 1
  
+; GCN: v_lshlrev_b32_e32 v1, 10, v1
  ; GCN-NOT: v1
  ; GCN: v_mov_b32_e32 v0, 0x22b
  ; GCN-NOT: v1
@@ -221,7 +294,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_y() #1 {
  ; GCN: enable_vgpr_workitem_id = 2
  
  ; GCN-DAG: v_mov_b32_e32 v0, 0x22b
-; GCN-DAG: v_mov_b32_e32 v1, v2
+; GCN-DAG: v_lshlrev_b32_e32 v1, 20, v2
  ; GCN: s_swappc_b64
  ; GCN-NOT: v0
  define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 {
@@ -232,6 +305,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 {
  ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x:
  ; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
  ; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}}
+; GCN: v_and_b32_e32 v32, 0x3ff, v32
  ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
  
  ; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -357,6 +431,7 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x(
  ; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
  ; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4
  ; GCN-NEXT: s_waitcnt
+; GCN-NEXT: v_and_b32_e32 v32, 0x3ff, v32
  ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32
  ; GCN: buffer_load_dword v0, off, s[0:3], s32{{$}}
  ; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -469,15 +544,18 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
  }
  
  ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz:
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
  ; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}}
+; GCN: v_and_b32_e32 v32, 0x3ff, v32
  ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4{{$}}
+; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}}
+; GCN: v_bfe_u32 v32, v32, 10, 10
  ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8{{$}}
+; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}}
+; GCN: v_bfe_u32 v32, v32, 20, 10
  ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
  
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
  ; GCN-NEXT: s_waitcnt
  ; GCN-NEXT: s_setpc_b64
  define void @too_many_args_use_workitem_id_xyz(
@@ -531,19 +609,19 @@ define void @too_many_args_use_workitem_id_xyz(
    ret void
  }
  
-; frame[0] = ID X
-; frame[1] = ID Y
-; frame[2] = ID Z
+; frame[0] = ID { Z, Y, X }
  
  ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz:
  ; GCN: enable_vgpr_workitem_id = 2
  
-; GCN: s_mov_b32 s33, s7
-; GCN: s_mov_b32 s32, s33
+; GCN-DAG: s_mov_b32 s33, s7
+; GCN-DAG: s_mov_b32 s32, s33
  
-; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32{{$}}
-; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:4
-; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:8
+; GCN-DAG: v_lshlrev_b32_e32 v1, 10, v1
+; GCN-DAG: v_or_b32_e32 v0, v0, v1
+; GCN-DAG: v_lshlrev_b32_e32 v2, 20, v2
+; GCN-DAG: v_or_b32_e32 v0, v0, v2
+; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}}
  ; GCN: s_swappc_b64
  define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 {
    call void @too_many_args_use_workitem_id_xyz(
@@ -560,19 +638,19 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 {
  
  ; workitem ID X in register, yz on stack
  ; v31 = workitem ID X
-; frame[0] = workitem Y
-; frame[1] = workitem Z
+; frame[0] = workitem { Z, Y, X }
  
  ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_stack_yz:
-; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31
-; GCN: buffer_load_dword v31, off, s[0:3], s32{{$}}
-; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31
-; GCN: buffer_load_dword v31, off, s[0:3], s32 offset:4{{$}}
-; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31
+; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
+; GCN-DAG: flat_store_dword v[0:1], [[IDX]]
+; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
+; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, [[IDY]]
+; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
+; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, [[IDZ]]
  
  ; GCN: s_waitcnt
  ; GCN-NEXT: s_setpc_b64
-; GCN: ScratchSize: 12
+; GCN: ScratchSize: 8
  define void @too_many_args_use_workitem_id_x_stack_yz(
    i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
    i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
@@ -623,18 +701,18 @@ define void @too_many_args_use_workitem_id_x_stack_yz(
    ret void
  }
  
-; frame[0] = ID Y
-; frame[1] = ID Z
-
  ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz:
  ; GCN: enable_vgpr_workitem_id = 2
  
  ; GCN: s_mov_b32 s33, s7
-; GCN: s_mov_b32 s32, s33
  
-; GCN-DAG: v_mov_b32_e32 v31, v0
-; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32{{$}}
-; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:4
+; GCN-NOT: v0
+; GCN-DAG: v_lshlrev_b32_e32 v1, 10, v1
+; GCN-DAG: v_or_b32_e32 v0, v0, v1
+; GCN-DAG: v_lshlrev_b32_e32 v2, 20, v2
+; GCN-DAG: v_or_b32_e32 v31, v0, v2
+
+; GCN: s_mov_b32 s32, s33
  ; GCN: s_swappc_b64
  define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_stack_yz() #1 {
    call void @too_many_args_use_workitem_id_x_stack_yz(
diff --git a/test/CodeGen/AMDGPU/zext-lid.ll b/test/CodeGen/AMDGPU/zext-lid.ll

index e257980dc0e55f112060aa60b7e7e55e5b174892..4e2e4d526b308356b323d7933bd74f7d4fc7ccec 100644 (file)
--- a/test/CodeGen/AMDGPU/zext-lid.ll
+++ b/test/CodeGen/AMDGPU/zext-lid.ll
@@ -1,8 +1,9 @@
-; RUN: llc -march=amdgcn < %s | FileCheck %s
-; RUN: llc -O0 -march=amdgcn < %s | FileCheck %s
+; RUN: llc -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=GCN,O2 %s
+; RUN: llc -O0 -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
  ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-intrinsics < %s | FileCheck -check-prefix=OPT %s
  
-; CHECK-NOT: and_b32
+; GCN-LABEL: {{^}}zext_grp_size_128:
+; GCN-NOT: and_b32
  
  ; OPT-LABEL: @zext_grp_size_128
  ; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
@@ -24,6 +25,9 @@ bb:
    ret void
  }
  
+; GCN-LABEL: {{^}}zext_grp_size_32x4x1:
+; GCN-NOT: and_b32
+
  ; OPT-LABEL: @zext_grp_size_32x4x1
  ; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2
  ; OPT: tail call i32 @llvm.amdgcn.workitem.id.y(), !range !3
@@ -44,6 +48,9 @@ bb:
    ret void
  }
  
+; GCN-LABEL: {{^}}zext_grp_size_1x1x1:
+; GCN-NOT: and_b32
+
  ; When EarlyCSE is not run this call produces a range max with 0 active bits,
  ; which is a special case as an AssertZext from width 0 is invalid.
  ; OPT-LABEL: @zext_grp_size_1x1x1
@@ -55,6 +62,9 @@ define amdgpu_kernel void @zext_grp_size_1x1x1(i32 addrspace(1)* nocapture %arg)
    ret void
  }
  
+; GCN-LABEL: {{^}}zext_grp_size_512:
+; GCN-NOT: and_b32
+
  ; OPT-LABEL: @zext_grp_size_512
  ; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !6
  ; OPT: tail call i32 @llvm.amdgcn.workitem.id.y(), !range !6
@@ -75,6 +85,11 @@ bb:
    ret void
  }
  
+; GCN-LABEL: {{^}}func_test_workitem_id_x_known_max_range:
+; O2-NOT: and_b32
+; O2: v_and_b32_e32 v{{[0-9]+}}, 0x3ff,
+; O2-NOT: and_b32
+
  ; OPT-LABEL: @func_test_workitem_id_x_known_max_range(
  ; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
  define void @func_test_workitem_id_x_known_max_range(i32 addrspace(1)* nocapture %out) #0 {
@@ -85,6 +100,11 @@ entry:
    ret void
  }
  
+; GCN-LABEL: {{^}}func_test_workitem_id_x_default_range:
+; O2-NOT: and_b32
+; O2: v_and_b32_e32 v{{[0-9]+}}, 0x3ff,
+; O2-NOT: and_b32
+
  ; OPT-LABEL: @func_test_workitem_id_x_default_range(
  ; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !7
  define void @func_test_workitem_id_x_default_range(i32 addrspace(1)* nocapture %out) #4 {
author	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
	Fri, 28 Jun 2019 01:52:13 +0000 (01:52 +0000)
committer	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
	Fri, 28 Jun 2019 01:52:13 +0000 (01:52 +0000)
lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUISelLowering.cpp		patch \| blob \| history
lib/Target/AMDGPU/SIISelLowering.cpp		patch \| blob \| history
test/CodeGen/AMDGPU/call-constexpr.ll		patch \| blob \| history
test/CodeGen/AMDGPU/callee-special-input-vgprs.ll		patch \| blob \| history
test/CodeGen/AMDGPU/zext-lid.ll		patch \| blob \| history