}
void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+ if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
+ OutContext.reportError({},
+ Twine(GV->getName()) +
+ ": unsupported initializer for address space");
+ return;
+ }
+
+ // LDS variables aren't emitted in HSA or PAL yet.
+ const Triple::OSType OS = TM.getTargetTriple().getOS();
+ if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
+ return;
+
+ MCSymbol *GVSym = getSymbol(GV);
- // Group segment variables aren't emitted in HSA.
- if (AMDGPU::isGroupSegment(GV))
+ GVSym->redefineIfPossible();
+ if (GVSym->isDefined() || GVSym->isVariable())
+ report_fatal_error("symbol '" + Twine(GVSym->getName()) +
+ "' is already defined");
+
+ const DataLayout &DL = GV->getParent()->getDataLayout();
+ uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
+ unsigned Align = GV->getAlignment();
+ if (!Align)
+ Align = 4;
+
+ EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
+ EmitLinkage(GV, GVSym);
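+ // For example (see lds-relocs.ll below), a definition like
+ //   @lds.defined = unnamed_addr addrspace(3) global [8 x i32] undef, align 8
+ // is emitted as:
+ //   .globl lds.defined
+ //   .amdgpu_lds lds.defined, 32, 8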
+ getTargetStreamer()->emitAMDGPULDS(GVSym, Size, Align);
return;
+ }
AsmPrinter::EmitGlobalVariable(GV);
}
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
NODE_NAME_CASE(CONST_DATA_PTR)
NODE_NAME_CASE(PC_ADD_REL_OFFSET)
+ NODE_NAME_CASE(LDS)
NODE_NAME_CASE(KILL)
NODE_NAME_CASE(DUMMY_CHAIN)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
Known.Zero.setHighBits(16);
break;
}
+ case AMDGPUISD::LDS: {
+ auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
+ unsigned Align = GA->getGlobal()->getAlignment();
+
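+ // Worked example: with align 8, Log2_32(8) == 3, so the low three bits
+ // of the address are known zero; the high 16 bits are known zero because
+ // an absolute LDS address is assumed to fit in 16 bits.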
+ Known.Zero.setHighBits(16);
+ if (Align)
+ Known.Zero.setLowBits(Log2_32(Align));
+ break;
+ }
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
switch (IID) {
INTERP_P1LV_F16,
INTERP_P2_F16,
PC_ADD_REL_OFFSET,
+ LDS,
KILL,
DUMMY_CHAIN,
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
} else if (FoldOp->isFI()) {
FrameIndexToFold = FoldOp->getIndex();
} else {
- assert(FoldOp->isReg());
+ assert(FoldOp->isReg() || FoldOp->isGlobal());
OpToFold = FoldOp;
}
}
return Kind == MachineOperand::MO_Register;
}
+ bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
+
bool isCommuted() const {
return Commuted;
}
}
}
- if ((Fold.isImm() || Fold.isFI()) && Fold.needsShrink()) {
+ if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
MachineBasicBlock *MBB = MI->getParent();
auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
if (Liveness != MachineBasicBlock::LQR_Dead)
return true;
}
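+ // A folded global address rewrites the use in place, keeping the
+ // symbol's offset and target flags, mirroring the frame-index case below.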
+ if (Fold.isGlobal()) {
+ Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
+ Fold.OpToFold->getTargetFlags());
+ return true;
+ }
+
if (Fold.isFI()) {
Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
return true;
if ((Opc == AMDGPU::V_ADD_I32_e64 ||
Opc == AMDGPU::V_SUB_I32_e64 ||
Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME
- (OpToFold->isImm() || OpToFold->isFI())) {
+ (OpToFold->isImm() || OpToFold->isFI() || OpToFold->isGlobal())) {
MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
// Verify the other operand is a VGPR, otherwise we would violate the
// constant bus restriction.
return;
}
- bool FoldingImmLike = OpToFold.isImm() || OpToFold.isFI();
+ bool FoldingImmLike =
+ OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
if (FoldingImmLike && UseMI->isCopy()) {
unsigned DestReg = UseMI->getOperand(0).getReg();
SmallVector<FoldCandidate, 4> FoldList;
MachineOperand &Dst = MI.getOperand(0);
- bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
+ bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
if (FoldingImm) {
unsigned NumLiteralUses = 0;
MachineOperand *NonInlineUse = nullptr;
}
MachineOperand &OpToFold = MI.getOperand(1);
- bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
+ bool FoldingImm =
+ OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
// FIXME: We could also be folding things like TargetIndexes.
if (!FoldingImm && !OpToFold.isReg())
}
case AMDGPU::GET_GROUPSTATICSIZE: {
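+ // Only HSA and PAL select the intrinsic to this pseudo; for other OSes
+ // it is lowered to an absolute symbol in LowerINTRINSIC_WO_CHAIN.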
+ assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
+ getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
DebugLoc DL = MI.getDebugLoc();
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
.add(MI.getOperand(0))
SelectionDAG &DAG) const {
GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GSD->getGlobal();
- if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+ (!GV->hasExternalLinkage() ||
+ getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
+ getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL)) ||
GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
SDLoc DL(GSD);
EVT PtrVT = Op.getValueType();
- // FIXME: Should not make address space based decisions here.
+ if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
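+ // Wrap the symbol in an AMDGPUISD::LDS node carrying an absolute
+ // 32-bit relocation; e.g. the address of @lds.defined becomes
+ //   (AMDGPUISD::LDS (TargetGlobalAddress @lds.defined [MO_ABS32_LO]))
+ // and selects to a move of lds.defined@abs32@lo.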
+ SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
+ SIInstrInfo::MO_ABS32_LO);
+ return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
+ }
+
if (shouldEmitFixup(GV))
return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
else if (shouldEmitPCReloc(GV))
return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
Op->getOperand(1), Op->getOperand(2)), 0);
+ case Intrinsic::amdgcn_groupstaticsize: {
+ Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
+ if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
+ return Op;
+
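+ // Otherwise the static group segment size isn't known to the compiler,
+ // so emit a reference to an absolute symbol named after the intrinsic,
+ // to be resolved later; it appears as
+ // llvm.amdgcn.groupstaticsize@abs32@lo in the ISA (see
+ // llvm.amdgcn.groupstaticsize.ll).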
+ const Module *M = MF.getFunction().getParent();
+ const GlobalValue *GV =
+ M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
+ SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
+ SIInstrInfo::MO_ABS32_LO);
+ return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
+ }
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
const MCInstrDesc &InstDesc = MI.getDesc();
const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo];
- assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
+ assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
return true;
switch (Desc.OpInfo[i].OperandType) {
case MCOI::OPERAND_REGISTER:
- if (MI.getOperand(i).isImm()) {
+ if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
ErrInfo = "Illegal immediate value for operand.";
return false;
}
return isLegalRegOperand(MRI, OpInfo, MO);
// Handle non-register types that are treated like immediates.
- assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
+ assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
return true;
}
}
// Handle non-register types that are treated like immediates.
- assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
+ assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
if (!DefinedRC) {
// This operand expects an immediate.
SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>
>;
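+// Produces the absolute 32-bit LDS address of a symbol; the operand is a
+// tglobaladdr tagged with SIInstrInfo::MO_ABS32_LO.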
+def SIlds : SDNode<"AMDGPUISD::LDS",
+ SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>]>
+>;
+
def SIload_d16_lo : SDNode<"AMDGPUISD::LOAD_D16_LO",
SIload_d16,
[SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
(S_MOV_B32 imm:$imm)
>;
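+// Materialize the absolute LDS address with a move of $ga@abs32@lo:
+// directly into a VGPR when the users are mostly VGPR instructions
+// (avoiding a copy), otherwise into an SGPR.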
+def : GCNPat <
+ (VGPRImm<(SIlds tglobaladdr:$ga)>),
+ (V_MOV_B32_e32 $ga)
+>;
+
+def : GCNPat <
+ (SIlds tglobaladdr:$ga),
+ (S_MOV_B32 $ga)
+>;
+
// FIXME: Workaround for ordering issue with peephole optimizer where
// a register class copy interferes with immediate folding. Should
// use s_mov_b32, which can be shrunk to s_movk_i32
Src0.setSubReg(0);
Src0.ChangeToFrameIndex(MovSrc.getIndex());
ConstantFolded = true;
+ } else if (MovSrc.isGlobal()) {
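+ // An absolute symbol behaves like an immediate here: the fixup lands
+ // in the instruction's 32-bit literal field.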
+ Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
+ MovSrc.getTargetFlags());
+ ConstantFolded = true;
}
if (ConstantFolded) {
@g_lds = addrspace(3) global float undef, align 4
; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset:
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
-; SI: ds_read_b32 v{{[0-9]+}}, [[REG]]
+; SI: v_mov_b32_e32 [[PTR:v[0-9]+]], g_lds@abs32@lo
+; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
define amdgpu_kernel void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
%val = load float, float addrspace(3)* @g_lds
store float %val, float addrspace(1)* %out
; GCN-LABEL: {{^}}write_ds_sub0_offset0_global:
; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v0
-; CI: v_sub_i32_e32 [[BASEPTR:v[0-9]+]], vcc, 0, [[SHL]]
-; GFX9: v_sub_u32_e32 [[BASEPTR:v[0-9]+]], 0, [[SHL]]
+; GCN: v_sub_{{[iu]}}32_e32 [[BASEPTR:v[0-9]+]], {{(vcc, )?}}lds.obj@abs32@lo, [[SHL]]
; GCN: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b
; GCN: ds_write_b32 [[BASEPTR]], [[VAL]] offset:12
define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 {
; CI-DAG: s_mov_b32 m0
; GFX9-NOT: m0
-; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}}
+; GCN-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 3, {{v[0-9]+}}
+; GCN-DAG: v_add_{{[iu]}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}lds.f64@abs32@lo, [[VOFS]]
; GCN: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8
; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
; CI-DAG: s_mov_b32 m0
; GFX9-NOT: m0
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
+; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1
define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
%val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
%val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
; CI-DAG: s_mov_b32 m0
; GFX9-NOT: m0
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2
+; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:2
define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
%val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
%val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
; CI-DAG: s_mov_b32 m0
; GFX9-NOT: m0
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
-; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3
+; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}}
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:2 offset1:3
define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
%val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
%val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
; CI-DAG: s_mov_b32 m0
; GFX9-NOT: m0
-; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
-; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000
-; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1
-; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1
+; GCN-DAG: s_mov_b32 [[SBASE0:s[0-9]+]], bar.large@abs32@lo
+; GCN-DAG: s_add_i32 [[SBASE1:s[0-9]+]], [[SBASE0]], 0x4000{{$}}
+; GCN-DAG: s_addk_i32 [[SBASE0]], 0x7ff8{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VBASE0:v[0-9]+]], [[SBASE0]]
+; GCN-DAG: v_mov_b32_e32 [[VBASE1:v[0-9]+]], [[SBASE1]]
+; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VBASE0]] offset1:1
+; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VBASE1]] offset1:1
; GCN: s_endpgm
define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
%val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
; CI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
; CI-DAG: s_mov_b32 m0
-; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-
-; GFX9: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
-; GFX9: global_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
+; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}}
+; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]]
+;
+; TODO: This should be an s_mov_b32. The v_mov_b32 gets introduced by an
+; early legalization of the constant bus constraint on the v_lshl_add_u32,
+; and then SIFoldOperands folds in an unlucky order.
+; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], {{v[0-9]+}}, 2, [[VBASE]]
+
+; GFX9-DAG: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
+; GFX9-DAG: global_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
; GCN: s_endpgm
; GFX9-NOT: m0
; GCN-DAG: {{buffer|global}}_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
-; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+
+; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}}
+; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]]
+; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], v{{[0-9]+}}, 2, [[VBASE]]
+
; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
; GCN: s_endpgm
define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
; GFX9-NOT: m0
; GCN-DAG: {{buffer|global}}_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
-; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+
+; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}}
+; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]]
+; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], v{{[0-9]+}}, 2, [[VBASE]]
+
; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
; GCN: s_endpgm
define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
; CI-DAG: s_mov_b32 m0
; GFX9-NOT: m0
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
+; GCN: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
define amdgpu_kernel void @store_constant_adjacent_offsets() {
store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
; GFX9-NOT: m0
; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2
+; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
+; GCN: ds_write2_b32 [[PTR]], [[VAL]], [[VAL]] offset1:2
define amdgpu_kernel void @store_constant_disjoint_offsets() {
store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
; CI-DAG: s_mov_b32 m0
; GFX9-NOT: m0
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
-; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
+; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}}
+; GCN-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; GCN-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
; GCN: s_endpgm
define amdgpu_kernel void @store_misaligned64_constant_offsets() {
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
; CI-DAG: s_mov_b32 m0
; GFX9-NOT: m0
-; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
-; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}}
-; GCN-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
-; GCN-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; GCN-DAG: s_mov_b32 [[SBASE0:s[0-9]+]], bar.large@abs32@lo
+; GCN-DAG: s_add_i32 [[SBASE1:s[0-9]+]], [[SBASE0]], 0x4000{{$}}
+; GCN-DAG: s_addk_i32 [[SBASE0]], 0x7ff8{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VBASE0:v[0-9]+]], [[SBASE0]]{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VBASE1:v[0-9]+]], [[SBASE1]]{{$}}
+; GCN-DAG: ds_write2_b32 [[VBASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; GCN-DAG: ds_write2_b32 [[VBASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
; GCN: s_endpgm
define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck %s
; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s
-; CHECK: in function load_init_lds_global{{.*}}: unsupported initializer for address space
+; CHECK: lds: unsupported initializer for address space
@lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8]
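+; The initializer is now rejected when the global itself is emitted, so
+; the diagnostic names the variable rather than a function that uses it.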
--- /dev/null
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -filetype=obj < %s | llvm-readobj -r -t | FileCheck -check-prefixes=ELF %s
+
+@lds.external = external unnamed_addr addrspace(3) global [0 x i32]
+@lds.defined = unnamed_addr addrspace(3) global [8 x i32] undef, align 8
+
+; ELF: Relocations [
+; ELF-NEXT: Section (3) .rel.text {
+; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32 lds.external 0x0
+; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32 lds.defined 0x0
+; ELF-NEXT: }
+; ELF-NEXT: ]
+
+; ELF: Symbol {
+; ELF: Name: lds.defined
+; ELF-NEXT: Value: 0x8
+; ELF-NEXT: Size: 32
+; ELF-NEXT: Binding: Global (0x1)
+; ELF-NEXT: Type: Object (0x1)
+; ELF-NEXT: Other: 0
+; ELF-NEXT: Section: Processor Specific (0xFF00)
+; ELF-NEXT: }
+
+; ELF: Symbol {
+; ELF: Name: lds.external
+; ELF-NEXT: Value: 0x4
+; ELF-NEXT: Size: 0
+; ELF-NEXT: Binding: Global (0x1)
+; ELF-NEXT: Type: Object (0x1)
+; ELF-NEXT: Other: 0
+; ELF-NEXT: Section: Processor Specific (0xFF00)
+; ELF-NEXT: }
+
+; GCN-LABEL: {{^}}test_basic:
+; GCN: v_mov_b32_e32 v1, lds.external@abs32@lo ; encoding: [0xff,0x02,0x02,0x7e,A,A,A,A]
+; GCN-NEXT: ; fixup A - offset: 4, value: lds.external@abs32@lo, kind: FK_Data_4{{$}}
+;
+; GCN: s_add_i32 s0, lds.defined@abs32@lo, s0 ; encoding: [0xff,0x00,0x00,0x81,A,A,A,A]
+; GCN-NEXT: ; fixup A - offset: 4, value: lds.defined@abs32@lo, kind: FK_Data_4{{$}}
+;
+; GCN: .globl lds.external
+; GCN: .amdgpu_lds lds.external, 0, 4
+; GCN: .globl lds.defined
+; GCN: .amdgpu_lds lds.defined, 32, 8
+define amdgpu_gs float @test_basic(i32 inreg %wave, i32 %arg1) #0 {
+main_body:
+ %gep0 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @lds.external, i32 0, i32 %arg1
+ %tmp = load i32, i32 addrspace(3)* %gep0
+
+ %mask = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %tmp, i32 0, i32 0)
+ %mask.32 = trunc i64 %mask to i32
+ %gep1 = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds.defined, i32 0, i32 %wave
+ store i32 %mask.32, i32 addrspace(3)* %gep1
+
+ %r = bitcast i32 %tmp to float
+ ret float %r
+}
+
+; Function Attrs: convergent nounwind readnone
+declare i64 @llvm.amdgcn.icmp.i64.i32(i32, i32, i32) #4
+
+attributes #0 = { "no-signed-zeros-fp-math"="true" }
+attributes #4 = { convergent nounwind readnone }
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=ALL -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=ALL -check-prefix=HSA %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=ALL -check-prefix=EG %s
; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck %s
; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s
-; CHECK: in function load_zeroinit_lds_global{{.*}}: unsupported initializer for address space
+; CHECK: lds: unsupported initializer for address space
@lds = addrspace(3) global [256 x i32] zeroinitializer
; CIVI-DAG: s_mov_b32 m0
; GFX9-NOT: m0
-; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
+; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
+; GFX9-DAG: s_mov_b32 [[BASE:s[0-9]+]], lds0@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 2, [[BASE]]
+
; GCN: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
; CIVI-DAG: s_mov_b32 m0
; GFX9-NOT: m0
-; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
+; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 3, {{v[0-9]+}}
+; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
+; GFX9-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], lds1@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 3, [[BASE]]
+
; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@lds0 = addrspace(3) global [512 x i32] undef, align 4
; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i32:
-; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
+; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
+; GFX9-DAG: s_mov_b32 [[BASE:s[0-9]+]], lds0@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 2, [[BASE]]
; GCN: ds_inc_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@lds1 = addrspace(3) global [512 x i64] undef, align 8
; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i64:
-; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
+; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 3, {{v[0-9]+}}
+; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
+; GFX9-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], lds1@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 3, [[BASE]]
; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,NOHSA %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,HSA %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,HSA %s
@lds0 = addrspace(3) global [512 x float] undef, align 4
@lds1 = addrspace(3) global [256 x float] undef, align 4
@large = addrspace(3) global [4096 x i32] undef, align 4
; CHECK-LABEL: {{^}}groupstaticsize_test0:
-; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}}
+; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo
+; HSA: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}}
define amdgpu_kernel void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 64
}
; CHECK-LABEL: {{^}}groupstaticsize_test1:
-; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}}
+; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo
+; HSA: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}}
define amdgpu_kernel void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) {
entry:
%static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1
; Exceeds 16-bit simm limit of s_movk_i32
; CHECK-LABEL: {{^}}large_groupstaticsize:
-; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}}
+; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo
+; HSA: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}}
define amdgpu_kernel void @large_groupstaticsize(i32 addrspace(1)* %size, i32 %idx) #0 {
%gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(3)* @large, i32 0, i32 %idx
store volatile i32 0, i32 addrspace(3)* %gep
@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
-; Check that the LDS size emitted correctly
-; SI: .long 47180
-; SI-NEXT: .long 65668
-; CI: .long 47180
-; CI-NEXT: .long 32900
-
; GCN-LABEL: {{^}}local_memory:
; GCN-NOT: s_wqm_b64
; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
+
define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
entry:
%x.i = call i32 @llvm.amdgcn.workitem.id.x()
; not an immediate.
; FUNC-LABEL: {{^}}load_i32_local_const_ptr:
-; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0
-; GCN: ds_read_b32 v{{[0-9]+}}, v[[ZERO]] offset:4
+; GCN: v_mov_b32_e32 v[[PTR:[0-9]+]], lds@abs32@lo
+; GCN: ds_read_b32 v{{[0-9]+}}, v[[PTR]] offset:4
; R600: LDS_READ_RET
define amdgpu_kernel void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
@tess_lds = external addrspace(3) global [8192 x i32]
; CHECK-LABEL: {{^}}main:
-; CHECK: ds_write2_b32
+; CHECK: ds_write_b32
+; CHECK: ds_write_b32
; CHECK: v_mov_b32_e32 v1, v0
; CHECK: tbuffer_store_format_xyzw v[0:3],
define amdgpu_vs void @main(i32 inreg %arg) {
+++ /dev/null
-; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck -check-prefix=ERROR %s
-; RUN: not llc -march=amdgcn -mcpu=hawaii < %s 2>&1 | FileCheck -check-prefix=ERROR %s
-; RUN: not llc -march=amdgcn -mcpu=fiji < %s 2>&1 | FileCheck -check-prefix=ERROR %s
-
-; ERROR: error: local memory limit exceeded (400000) in use_huge_lds
-
-@huge = internal unnamed_addr addrspace(3) global [100000 x i32] undef, align 4
-
-define amdgpu_kernel void @use_huge_lds() {
-entry:
- %v0 = getelementptr inbounds [100000 x i32], [100000 x i32] addrspace(3)* @huge, i32 0, i32 0
- store i32 0, i32 addrspace(3)* %v0
- ret void
-}
; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
; IR: alloca [10 x i32]
; ASM-LABEL: {{^}}promote_alloca_size_256:
-; ASM: ; LDSByteSize: 60000 bytes/workgroup (compile time only)
+; ASM: .amdgpu_lds global_array0, 30000, 4
+; ASM: .amdgpu_lds global_array1, 30000, 4
define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
entry:
; remaining add use goes through the normal shl + add constant fold.
; GCN-LABEL: {{^}}load_shl_base_lds_1:
-; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
+
+; TODO: integrate into the ds_read_b32 offset using a 16-bit relocation
+; GCN: v_add_{{[iu]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
+
; GCN: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8
; GCN: v_add_{{[iu]}}32_e32 [[ADDUSE:v[0-9]+]], vcc, 8, v{{[0-9]+}}
; GCN-DAG: buffer_store_dword [[RESULT]]
; The two globals are placed adjacent in memory, so the same base
; pointer can be used with an offset into the second one.
+; TODO: Recover the ds_read2st64_b32 optimization using alignment hints.
+
; GCN-LABEL: {{^}}load_shl_base_lds_2:
-; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR0:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
+; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR1:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
; GCN: s_mov_b32 m0, -1
-; GCN-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
+
+; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR0]] offset:256
+; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR1]] offset:256
+; TODO: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
+
; GCN: s_endpgm
define amdgpu_kernel void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
; These tests check that the compiler won't crash when it needs to spill
; SGPRs.
-@ddxy_lds = external addrspace(3) global [64 x i32]
-
; GCN-LABEL: {{^}}main:
; GCN: s_wqm
; Make sure not emitting unused scratch resource descriptor setup
; GCN-NOT: s_mov_b32
-; GCN-NOT: s_mov_b32
-; GCN-NOT: s_mov_b32
-; GCN-NOT: s_mov_b32
; GCN: s_mov_b32 m0
; TOVGPR: ScratchSize: 0{{$}}
define amdgpu_ps void @main([17 x <4 x i32>] addrspace(4)* byval %arg, [32 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <8 x i32>] addrspace(4)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {
main_body:
+ %lds = inttoptr i32 0 to [64 x i32] addrspace(3)*
%tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i64 0, i32 0
%tmp21 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
%tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 96, i32 0)
%p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 5, i32 %arg4) #0
%mbcnt.lo.0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tmp109 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.0)
- %tmp110 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp109
+ %tmp110 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp109
%tmp111 = bitcast float %p2.i to i32
store i32 %tmp111, i32 addrspace(3)* %tmp110
%tmp112 = bitcast float %p2.i96 to i32
store i32 %tmp112, i32 addrspace(3)* %tmp110
%mbcnt.lo.1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tmp113 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.1)
- %tmp114 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp113
+ %tmp114 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp113
%tmp115 = and i32 %tmp113, -4
- %tmp116 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp115
+ %tmp116 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp115
%tmp117 = add i32 %tmp115, 1
- %tmp118 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp117
+ %tmp118 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp117
%tmp119 = bitcast float %p2.i to i32
store i32 %tmp119, i32 addrspace(3)* %tmp114
%tmp120 = load i32, i32 addrspace(3)* %tmp116
%tmp140 = fmul float %tmp59, %p2.i96
%mbcnt.lo.2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tmp141 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.2)
- %tmp142 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp141
+ %tmp142 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp141
%tmp143 = bitcast float %tmp137 to i32
store i32 %tmp143, i32 addrspace(3)* %tmp142
%tmp144 = bitcast float %tmp138 to i32
store i32 %tmp146, i32 addrspace(3)* %tmp142
%mbcnt.lo.3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tmp147 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.3)
- %tmp148 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp147
+ %tmp148 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp147
%tmp149 = and i32 %tmp147, -4
- %tmp150 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp149
+ %tmp150 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp149
%tmp151 = add i32 %tmp149, 2
- %tmp152 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp151
+ %tmp152 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp151
%tmp153 = bitcast float %tmp137 to i32
store i32 %tmp153, i32 addrspace(3)* %tmp148
%tmp154 = load i32, i32 addrspace(3)* %tmp150
; CHECK-LABEL: {{^}}promote_alloca_enabled:
; CHECK: ds_read_b32
-; CHECK: ; LDSByteSize: 5120
define amdgpu_kernel void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 {
entry:
%stack = alloca [5 x i32], align 4, addrspace(5)
; CHECK: machineFunctionInfo:
; CHECK-NEXT: explicitKernArgSize: 128
; CHECK-NEXT: maxKernArgAlign: 64
-; CHECK-NEXT: ldsSize: 2048
+; CHECK-NEXT: ldsSize: 0
; CHECK-NEXT: isEntryFunction: true
; CHECK-NEXT: noSignedZerosFPMath: false
; CHECK-NEXT: memoryBound: false