From 90f5eff5ac4c8818136a7f66a22b9971f5bb6260 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Tue, 25 Jun 2019 11:52:30 +0000 Subject: [PATCH] AMDGPU: Write LDS objects out as global symbols in code generation Summary: The symbols use the processor-specific SHN_AMDGPU_LDS section index introduced with a previous change. The linker is then expected to resolve relocations, which are also emitted. Initially disabled for HSA and PAL environments until they have caught up in terms of linker and runtime loader. Some notes: - The llvm.amdgcn.groupstaticsize intrinsics can no longer be lowered to a constant at compile times, which means some tests can no longer be applied. The current "solution" is a terrible hack, but the intrinsic isn't used by Mesa, so we can keep it for now. - We no longer know the full LDS size per kernel at compile time, which means that we can no longer generate a relevant error message at compile time. It would be possible to add a check for the size of individual variables, but ultimately the linker will have to perform the final check. Change-Id: If66dbf33fccfbf3609aefefa2558ac0850d42275 Reviewers: arsenm, rampitec, t-tye, b-sumner, jsjodin Subscribers: qcolombet, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D61494 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@364297 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 31 ++++++++- lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 10 +++ lib/Target/AMDGPU/AMDGPUISelLowering.h | 1 + lib/Target/AMDGPU/SIFoldOperands.cpp | 22 +++++-- lib/Target/AMDGPU/SIISelLowering.cpp | 26 +++++++- lib/Target/AMDGPU/SIInstrInfo.cpp | 8 +-- lib/Target/AMDGPU/SIInstrInfo.td | 4 ++ lib/Target/AMDGPU/SIInstructions.td | 10 +++ lib/Target/AMDGPU/SIShrinkInstructions.cpp | 4 ++ .../AMDGPU/32-bit-local-address-space.ll | 4 +- test/CodeGen/AMDGPU/ds-sub-offset.ll | 3 +- test/CodeGen/AMDGPU/ds_read2.ll | 28 +++++---- test/CodeGen/AMDGPU/ds_write2.ll | 54 +++++++++++----- test/CodeGen/AMDGPU/lds-initializer.ll | 2 +- test/CodeGen/AMDGPU/lds-relocs.ll | 63 +++++++++++++++++++ test/CodeGen/AMDGPU/lds-size.ll | 1 - test/CodeGen/AMDGPU/lds-zero-initializer.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll | 12 +++- test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll | 10 ++- .../AMDGPU/llvm.amdgcn.groupstaticsize.ll | 15 +++-- test/CodeGen/AMDGPU/local-memory.amdgcn.ll | 7 +-- test/CodeGen/AMDGPU/local-memory.ll | 4 +- test/CodeGen/AMDGPU/merge-store-crash.ll | 3 +- test/CodeGen/AMDGPU/over-max-lds-size.ll | 14 ----- test/CodeGen/AMDGPU/promote-alloca-globals.ll | 3 +- test/CodeGen/AMDGPU/shl_add_ptr.ll | 18 +++++- test/CodeGen/AMDGPU/si-sgpr-spill.ll | 22 +++---- test/CodeGen/AMDGPU/target-cpu.ll | 1 - .../MIR/AMDGPU/machine-function-info.ll | 2 +- 29 files changed, 282 insertions(+), 102 deletions(-) create mode 100644 test/CodeGen/AMDGPU/lds-relocs.ll delete mode 100644 test/CodeGen/AMDGPU/over-max-lds-size.ll diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 5578251bcfc..bad5670010a 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -298,10 +298,37 @@ void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const { } void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { + if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + if (GV->hasInitializer() && !isa(GV->getInitializer())) { + OutContext.reportError({}, + Twine(GV->getName()) + + ": unsupported initializer for address space"); + return; + } + + // LDS variables aren't emitted in HSA or PAL yet. + const Triple::OSType OS = TM.getTargetTriple().getOS(); + if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) + return; + + MCSymbol *GVSym = getSymbol(GV); - // Group segment variables aren't emitted in HSA. - if (AMDGPU::isGroupSegment(GV)) + GVSym->redefineIfPossible(); + if (GVSym->isDefined() || GVSym->isVariable()) + report_fatal_error("symbol '" + Twine(GVSym->getName()) + + "' is already defined"); + + const DataLayout &DL = GV->getParent()->getDataLayout(); + uint64_t Size = DL.getTypeAllocSize(GV->getValueType()); + unsigned Align = GV->getAlignment(); + if (!Align) + Align = 4; + + EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration()); + EmitLinkage(GV, GVSym); + getTargetStreamer()->emitAMDGPULDS(GVSym, Size, Align); return; + } AsmPrinter::EmitGlobalVariable(GV); } diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 1197893120a..d0af336a00b 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4357,6 +4357,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) NODE_NAME_CASE(CONST_DATA_PTR) NODE_NAME_CASE(PC_ADD_REL_OFFSET) + NODE_NAME_CASE(LDS) NODE_NAME_CASE(KILL) NODE_NAME_CASE(DUMMY_CHAIN) case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; @@ -4571,6 +4572,15 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( Known.Zero.setHighBits(16); break; } + case AMDGPUISD::LDS: { + auto GA = cast(Op.getOperand(0).getNode()); + unsigned Align = GA->getGlobal()->getAlignment(); + + Known.Zero.setHighBits(16); + if (Align) + Known.Zero.setLowBits(Log2_32(Align)); + break; + } case ISD::INTRINSIC_WO_CHAIN: { unsigned IID = cast(Op.getOperand(0))->getZExtValue(); switch (IID) { diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index a17f5dae576..9723fc3ec66 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -485,6 +485,7 @@ enum NodeType : unsigned { INTERP_P1LV_F16, INTERP_P2_F16, PC_ADD_REL_OFFSET, + LDS, KILL, DUMMY_CHAIN, FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index 78e6e39b05a..74ed6f1fed1 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -50,7 +50,7 @@ struct FoldCandidate { } else if (FoldOp->isFI()) { FrameIndexToFold = FoldOp->getIndex(); } else { - assert(FoldOp->isReg()); + assert(FoldOp->isReg() || FoldOp->isGlobal()); OpToFold = FoldOp; } } @@ -67,6 +67,8 @@ struct FoldCandidate { return Kind == MachineOperand::MO_Register; } + bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; } + bool isCommuted() const { return Commuted; } @@ -230,7 +232,7 @@ static bool updateOperand(FoldCandidate &Fold, } } - if ((Fold.isImm() || Fold.isFI()) && Fold.needsShrink()) { + if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) { MachineBasicBlock *MBB = MI->getParent(); auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI); if (Liveness != MachineBasicBlock::LQR_Dead) @@ -277,6 +279,12 @@ static bool updateOperand(FoldCandidate &Fold, return true; } + if (Fold.isGlobal()) { + Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(), + Fold.OpToFold->getTargetFlags()); + return true; + } + if (Fold.isFI()) { Old.ChangeToFrameIndex(Fold.FrameIndexToFold); return true; @@ -368,7 +376,7 @@ static bool tryAddToFoldList(SmallVectorImpl &FoldList, if ((Opc == AMDGPU::V_ADD_I32_e64 || Opc == AMDGPU::V_SUB_I32_e64 || Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME - (OpToFold->isImm() || OpToFold->isFI())) { + (OpToFold->isImm() || OpToFold->isFI() || OpToFold->isGlobal())) { MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); // Verify the other operand is a VGPR, otherwise we would violate the @@ -483,7 +491,8 @@ void SIFoldOperands::foldOperand( return; } - bool FoldingImmLike = OpToFold.isImm() || OpToFold.isFI(); + bool FoldingImmLike = + OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal(); if (FoldingImmLike && UseMI->isCopy()) { unsigned DestReg = UseMI->getOperand(0).getReg(); @@ -884,7 +893,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, SmallVector FoldList; MachineOperand &Dst = MI.getOperand(0); - bool FoldingImm = OpToFold.isImm() || OpToFold.isFI(); + bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal(); if (FoldingImm) { unsigned NumLiteralUses = 0; MachineOperand *NonInlineUse = nullptr; @@ -1232,7 +1241,8 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { } MachineOperand &OpToFold = MI.getOperand(1); - bool FoldingImm = OpToFold.isImm() || OpToFold.isFI(); + bool FoldingImm = + OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal(); // FIXME: We could also be folding things like TargetIndexes. if (!FoldingImm && !OpToFold.isReg()) diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 64018fd92ac..25000506f2c 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3588,6 +3588,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( } case AMDGPU::GET_GROUPSTATICSIZE: { + assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA || + getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL); DebugLoc DL = MI.getDebugLoc(); BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32)) .add(MI.getOperand(0)) @@ -4776,7 +4778,10 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SelectionDAG &DAG) const { GlobalAddressSDNode *GSD = cast(Op); const GlobalValue *GV = GSD->getGlobal(); - if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || + if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && + (!GV->hasExternalLinkage() || + getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA || + getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL)) || GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS || GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); @@ -4784,7 +4789,12 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDLoc DL(GSD); EVT PtrVT = Op.getValueType(); - // FIXME: Should not make address space based decisions here. + if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(), + SIInstrInfo::MO_ABS32_LO); + return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA); + } + if (shouldEmitFixup(GV)) return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT); else if (shouldEmitPCReloc(GV)) @@ -5773,6 +5783,18 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT, Op->getOperand(1), Op->getOperand(2)), 0); + case Intrinsic::amdgcn_groupstaticsize: { + Triple::OSType OS = getTargetMachine().getTargetTriple().getOS(); + if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) + return Op; + + const Module *M = MF.getFunction().getParent(); + const GlobalValue *GV = + M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize)); + SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0, + SIInstrInfo::MO_ABS32_LO); + return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0}; + } default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index f977928735a..1923ac24f85 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2703,7 +2703,7 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, const MCInstrDesc &InstDesc = MI.getDesc(); const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo]; - assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); + assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) return true; @@ -3012,7 +3012,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, switch (Desc.OpInfo[i].OperandType) { case MCOI::OPERAND_REGISTER: - if (MI.getOperand(i).isImm()) { + if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) { ErrInfo = "Illegal immediate value for operand."; return false; } @@ -3682,7 +3682,7 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, return isLegalRegOperand(MRI, OpInfo, MO); // Handle non-register types that are treated like immediates. - assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); + assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); return true; } @@ -3739,7 +3739,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, } // Handle non-register types that are treated like immediates. - assert(MO->isImm() || MO->isTargetIndex() || MO->isFI()); + assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal()); if (!DefinedRC) { // This operand expects an immediate. diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 1a3e16afce3..cd1c7fdae92 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -205,6 +205,10 @@ def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET", SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]> >; +def SIlds : SDNode<"AMDGPUISD::LDS", + SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>]> +>; + def SIload_d16_lo : SDNode<"AMDGPUISD::LOAD_D16_LO", SIload_d16, [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 630aeeb8777..2cb7e9fa357 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -1142,6 +1142,16 @@ def : GCNPat < (S_MOV_B32 imm:$imm) >; +def : GCNPat < + (VGPRImm<(SIlds tglobaladdr:$ga)>), + (V_MOV_B32_e32 $ga) +>; + +def : GCNPat < + (SIlds tglobaladdr:$ga), + (S_MOV_B32 $ga) +>; + // FIXME: Workaround for ordering issue with peephole optimizer where // a register class copy interferes with immediate folding. Should // use s_mov_b32, which can be shrunk to s_movk_i32 diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp index c180afdcd45..7ee178149c7 100644 --- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -95,6 +95,10 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, Src0.setSubReg(0); Src0.ChangeToFrameIndex(MovSrc.getIndex()); ConstantFolded = true; + } else if (MovSrc.isGlobal()) { + Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(), + MovSrc.getTargetFlags()); + ConstantFolded = true; } if (ConstantFolded) { diff --git a/test/CodeGen/AMDGPU/32-bit-local-address-space.ll b/test/CodeGen/AMDGPU/32-bit-local-address-space.ll index ca661cf9a71..72e62f8dbbf 100644 --- a/test/CodeGen/AMDGPU/32-bit-local-address-space.ll +++ b/test/CodeGen/AMDGPU/32-bit-local-address-space.ll @@ -81,8 +81,8 @@ define amdgpu_kernel void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] a @g_lds = addrspace(3) global float undef, align 4 ; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset: -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 -; SI: ds_read_b32 v{{[0-9]+}}, [[REG]] +; SI: v_mov_b32_e32 [[PTR:v[0-9]+]], g_lds@abs32@lo +; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]] define amdgpu_kernel void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) { %val = load float, float addrspace(3)* @g_lds store float %val, float addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/ds-sub-offset.ll b/test/CodeGen/AMDGPU/ds-sub-offset.ll index 1314311c9cd..7487cd98e0a 100644 --- a/test/CodeGen/AMDGPU/ds-sub-offset.ll +++ b/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -7,8 +7,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 ; GCN-LABEL: {{^}}write_ds_sub0_offset0_global: ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v0 -; CI: v_sub_i32_e32 [[BASEPTR:v[0-9]+]], vcc, 0, [[SHL]] -; GFX9: v_sub_u32_e32 [[BASEPTR:v[0-9]+]], 0, [[SHL]] +; GCN: v_sub_{{[iu]}}32_e32 [[BASEPTR:v[0-9]+]], {{(vcc, )?}}lds.obj@abs32@lo, [[SHL]] ; GCN: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b ; GCN: ds_write_b32 [[BASEPTR]], [[VAL]] offset:12 define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 { diff --git a/test/CodeGen/AMDGPU/ds_read2.ll b/test/CodeGen/AMDGPU/ds_read2.ll index f933dc05701..9991eb3fcbe 100644 --- a/test/CodeGen/AMDGPU/ds_read2.ll +++ b/test/CodeGen/AMDGPU/ds_read2.ll @@ -355,7 +355,8 @@ define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %ou ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}} +; GCN-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 3, {{v[0-9]+}} +; GCN-DAG: v_add_{{[iu]}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}lds.f64@abs32@lo, [[VOFS]] ; GCN: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8 ; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} @@ -441,8 +442,8 @@ define amdgpu_kernel void @misaligned_read2_f64(double addrspace(1)* %out, doubl ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 +; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}} +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1 define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) { %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 @@ -455,8 +456,8 @@ define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2 +; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}} +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:2 define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) { %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 @@ -471,9 +472,9 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 -; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3 +; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}} +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1 +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:2 offset1:3 define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) { %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 @@ -488,10 +489,13 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}} -; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000 -; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1 -; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1 +; GCN-DAG: s_mov_b32 [[SBASE0:s[0-9]+]], bar.large@abs32@lo +; GCN-DAG: s_add_i32 [[SBASE1:s[0-9]+]], [[SBASE0]], 0x4000{{$}} +; GCN-DAG: s_addk_i32 [[SBASE0]], 0x7ff8{{$}} +; GCN-DAG: v_mov_b32_e32 [[VBASE0:v[0-9]+]], [[SBASE0]] +; GCN-DAG: v_mov_b32_e32 [[VBASE1:v[0-9]+]], [[SBASE1]] +; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VBASE0]] offset1:1 +; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VBASE1]] offset1:1 ; GCN: s_endpgm define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) { %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 diff --git a/test/CodeGen/AMDGPU/ds_write2.ll b/test/CodeGen/AMDGPU/ds_write2.ll index 03436f6b3a3..a1610d44e2c 100644 --- a/test/CodeGen/AMDGPU/ds_write2.ll +++ b/test/CodeGen/AMDGPU/ds_write2.ll @@ -103,10 +103,17 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace( ; CI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}} ; CI-DAG: s_mov_b32 m0 -; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} - -; GFX9: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}} -; GFX9: global_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}} +; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}} +; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]] +; +; TODO: This should be an s_mov_b32. The v_mov_b32 gets introduced by an +; early legalization of the constant bus constraint on the v_lshl_add_u32, +; and then SIFoldOperands folds in an unlucky order. +; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo +; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], {{v[0-9]+}}, 2, [[VBASE]] + +; GFX9-DAG: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}} +; GFX9-DAG: global_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}} ; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 ; GCN: s_endpgm @@ -131,7 +138,12 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspa ; GFX9-NOT: m0 ; GCN-DAG: {{buffer|global}}_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} -; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} + +; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}} +; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]] +; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo +; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], v{{[0-9]+}}, 2, [[VBASE]] + ; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 ; GCN: s_endpgm define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { @@ -153,7 +165,12 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* ; GFX9-NOT: m0 ; GCN-DAG: {{buffer|global}}_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} -; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} + +; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}} +; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]] +; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo +; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], v{{[0-9]+}}, 2, [[VBASE]] + ; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 ; GCN: s_endpgm define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 { @@ -389,8 +406,8 @@ define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, do ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GCN: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}} +; GCN: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 define amdgpu_kernel void @store_constant_adjacent_offsets() { store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 @@ -402,8 +419,8 @@ define amdgpu_kernel void @store_constant_adjacent_offsets() { ; GFX9-NOT: m0 ; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}} -; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GCN: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2 +; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}} +; GCN: ds_write2_b32 [[PTR]], [[VAL]], [[VAL]] offset1:2 define amdgpu_kernel void @store_constant_disjoint_offsets() { store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 @@ -416,9 +433,9 @@ define amdgpu_kernel void @store_constant_disjoint_offsets() { ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 +; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}} +; GCN-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; GCN-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 ; GCN: s_endpgm define amdgpu_kernel void @store_misaligned64_constant_offsets() { store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 @@ -432,10 +449,13 @@ define amdgpu_kernel void @store_misaligned64_constant_offsets() { ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}} -; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}} -; GCN-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -; GCN-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; GCN-DAG: s_mov_b32 [[SBASE0:s[0-9]+]], bar.large@abs32@lo +; GCN-DAG: s_add_i32 [[SBASE1:s[0-9]+]], [[SBASE0]], 0x4000{{$}} +; GCN-DAG: s_addk_i32 [[SBASE0]], 0x7ff8{{$}} +; GCN-DAG: v_mov_b32_e32 [[VBASE0:v[0-9]+]], [[SBASE0]]{{$}} +; GCN-DAG: v_mov_b32_e32 [[VBASE1:v[0-9]+]], [[SBASE1]]{{$}} +; GCN-DAG: ds_write2_b32 [[VBASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; GCN-DAG: ds_write2_b32 [[VBASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 ; GCN: s_endpgm define amdgpu_kernel void @store_misaligned64_constant_large_offsets() { store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 diff --git a/test/CodeGen/AMDGPU/lds-initializer.ll b/test/CodeGen/AMDGPU/lds-initializer.ll index 5ced7100596..7cfe013b845 100644 --- a/test/CodeGen/AMDGPU/lds-initializer.ll +++ b/test/CodeGen/AMDGPU/lds-initializer.ll @@ -1,7 +1,7 @@ ; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck %s ; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s -; CHECK: in function load_init_lds_global{{.*}}: unsupported initializer for address space +; CHECK: lds: unsupported initializer for address space @lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8] diff --git a/test/CodeGen/AMDGPU/lds-relocs.ll b/test/CodeGen/AMDGPU/lds-relocs.ll new file mode 100644 index 00000000000..63e3dd880ba --- /dev/null +++ b/test/CodeGen/AMDGPU/lds-relocs.ll @@ -0,0 +1,63 @@ +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -filetype=obj < %s | llvm-readobj -r -t | FileCheck -check-prefixes=ELF %s + +@lds.external = external unnamed_addr addrspace(3) global [0 x i32] +@lds.defined = unnamed_addr addrspace(3) global [8 x i32] undef, align 8 + +; ELF: Relocations [ +; ELF-NEXT: Section (3) .rel.text { +; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32 lds.external 0x0 +; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32 lds.defined 0x0 +; ELF-NEXT: } +; ELF-NEXT: ] + +; ELF: Symbol { +; ELF: Name: lds.defined +; ELF-NEXT: Value: 0x8 +; ELF-NEXT: Size: 32 +; ELF-NEXT: Binding: Global (0x1) +; ELF-NEXT: Type: Object (0x1) +; ELF-NEXT: Other: 0 +; ELF-NEXT: Section: Processor Specific (0xFF00) +; ELF-NEXT: } + +; ELF: Symbol { +; ELF: Name: lds.external +; ELF-NEXT: Value: 0x4 +; ELF-NEXT: Size: 0 +; ELF-NEXT: Binding: Global (0x1) +; ELF-NEXT: Type: Object (0x1) +; ELF-NEXT: Other: 0 +; ELF-NEXT: Section: Processor Specific (0xFF00) +; ELF-NEXT: } + +; GCN-LABEL: {{^}}test_basic: +; GCN: v_mov_b32_e32 v1, lds.external@abs32@lo ; encoding: [0xff,0x02,0x02,0x7e,A,A,A,A] +; GCN-NEXT: ; fixup A - offset: 4, value: lds.external@abs32@lo, kind: FK_Data_4{{$}} +; +; GCN: s_add_i32 s0, lds.defined@abs32@lo, s0 ; encoding: [0xff,0x00,0x00,0x81,A,A,A,A] +; GCN-NEXT: ; fixup A - offset: 4, value: lds.defined@abs32@lo, kind: FK_Data_4{{$}} +; +; GCN: .globl lds.external +; GCN: .amdgpu_lds lds.external, 0, 4 +; GCN: .globl lds.defined +; GCN: .amdgpu_lds lds.defined, 32, 8 +define amdgpu_gs float @test_basic(i32 inreg %wave, i32 %arg1) #0 { +main_body: + %gep0 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @lds.external, i32 0, i32 %arg1 + %tmp = load i32, i32 addrspace(3)* %gep0 + + %mask = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %tmp, i32 0, i32 0) + %mask.32 = trunc i64 %mask to i32 + %gep1 = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds.defined, i32 0, i32 %wave + store i32 %mask.32, i32 addrspace(3)* %gep1 + + %r = bitcast i32 %tmp to float + ret float %r +} + +; Function Attrs: convergent nounwind readnone +declare i64 @llvm.amdgcn.icmp.i64.i32(i32, i32, i32) #4 + +attributes #0 = { "no-signed-zeros-fp-math"="true" } +attributes #4 = { convergent nounwind readnone } diff --git a/test/CodeGen/AMDGPU/lds-size.ll b/test/CodeGen/AMDGPU/lds-size.ll index ff78c3bcb18..a7f3d778755 100644 --- a/test/CodeGen/AMDGPU/lds-size.ll +++ b/test/CodeGen/AMDGPU/lds-size.ll @@ -1,4 +1,3 @@ -; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=ALL -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=ALL -check-prefix=HSA %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=ALL -check-prefix=EG %s diff --git a/test/CodeGen/AMDGPU/lds-zero-initializer.ll b/test/CodeGen/AMDGPU/lds-zero-initializer.ll index a3b3d7b341f..367dd173f78 100644 --- a/test/CodeGen/AMDGPU/lds-zero-initializer.ll +++ b/test/CodeGen/AMDGPU/lds-zero-initializer.ll @@ -1,7 +1,7 @@ ; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck %s ; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s -; CHECK: in function load_zeroinit_lds_global{{.*}}: unsupported initializer for address space +; CHECK: lds: unsupported initializer for address space @lds = addrspace(3) global [256 x i32] zeroinitializer diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll index c4e631590d5..4d49d87c67a 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll @@ -268,7 +268,11 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64* %ptr) #0 ; CIVI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}} +; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]] +; GFX9-DAG: s_mov_b32 [[BASE:s[0-9]+]], lds0@abs32@lo +; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 2, [[BASE]] + ; GCN: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -412,7 +416,11 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspa ; CIVI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}} +; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 3, {{v[0-9]+}} +; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]] +; GFX9-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], lds1@abs32@lo +; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 3, [[BASE]] + ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16 define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll index 689e2b6300f..111f131177b 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll @@ -131,7 +131,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa @lds0 = addrspace(3) global [512 x i32] undef, align 4 ; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i32: -; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}} +; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]] +; GFX9-DAG: s_mov_b32 [[BASE:s[0-9]+]], lds0@abs32@lo +; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 2, [[BASE]] ; GCN: ds_inc_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -325,7 +328,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0 @lds1 = addrspace(3) global [512 x i64] undef, align 8 ; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i64: -; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}} +; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 3, {{v[0-9]+}} +; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]] +; GFX9-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], lds1@abs32@lo +; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 3, [[BASE]] ; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16 define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll index d26fab4cebe..3224d8a3594 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,NOHSA %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,HSA %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,HSA %s @lds0 = addrspace(3) global [512 x float] undef, align 4 @lds1 = addrspace(3) global [256 x float] undef, align 4 @@ -8,7 +8,8 @@ @large = addrspace(3) global [4096 x i32] undef, align 4 ; CHECK-LABEL: {{^}}groupstaticsize_test0: -; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}} +; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo +; HSA: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}} define amdgpu_kernel void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 64 @@ -22,7 +23,8 @@ define amdgpu_kernel void @groupstaticsize_test0(float addrspace(1)* %out, i32 a } ; CHECK-LABEL: {{^}}groupstaticsize_test1: -; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}} +; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo +; HSA: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}} define amdgpu_kernel void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) { entry: %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1 @@ -50,7 +52,8 @@ endif: ; preds = %else, %if ; Exceeds 16-bit simm limit of s_movk_i32 ; CHECK-LABEL: {{^}}large_groupstaticsize: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}} +; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo +; HSA: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}} define amdgpu_kernel void @large_groupstaticsize(i32 addrspace(1)* %size, i32 %idx) #0 { %gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(3)* @large, i32 0, i32 %idx store volatile i32 0, i32 addrspace(3)* %gep diff --git a/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/test/CodeGen/AMDGPU/local-memory.amdgcn.ll index 47b6558241b..a070488a4bc 100644 --- a/test/CodeGen/AMDGPU/local-memory.amdgcn.ll +++ b/test/CodeGen/AMDGPU/local-memory.amdgcn.ll @@ -3,12 +3,6 @@ @local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4 -; Check that the LDS size emitted correctly -; SI: .long 47180 -; SI-NEXT: .long 65668 -; CI: .long 47180 -; CI-NEXT: .long 32900 - ; GCN-LABEL: {{^}}local_memory: ; GCN-NOT: s_wqm_b64 @@ -57,6 +51,7 @@ entry: ; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]] ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7 + define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 { entry: %x.i = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/test/CodeGen/AMDGPU/local-memory.ll b/test/CodeGen/AMDGPU/local-memory.ll index 6124237d763..5e820792aee 100644 --- a/test/CodeGen/AMDGPU/local-memory.ll +++ b/test/CodeGen/AMDGPU/local-memory.ll @@ -10,8 +10,8 @@ ; not an immediate. ; FUNC-LABEL: {{^}}load_i32_local_const_ptr: -; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0 -; GCN: ds_read_b32 v{{[0-9]+}}, v[[ZERO]] offset:4 +; GCN: v_mov_b32_e32 v[[PTR:[0-9]+]], lds@abs32@lo +; GCN: ds_read_b32 v{{[0-9]+}}, v[[PTR]] offset:4 ; R600: LDS_READ_RET define amdgpu_kernel void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 { diff --git a/test/CodeGen/AMDGPU/merge-store-crash.ll b/test/CodeGen/AMDGPU/merge-store-crash.ll index 1252a5c0c02..c384eb5b425 100644 --- a/test/CodeGen/AMDGPU/merge-store-crash.ll +++ b/test/CodeGen/AMDGPU/merge-store-crash.ll @@ -7,7 +7,8 @@ @tess_lds = external addrspace(3) global [8192 x i32] ; CHECK-LABEL: {{^}}main: -; CHECK: ds_write2_b32 +; CHECK: ds_write_b32 +; CHECK: ds_write_b32 ; CHECK: v_mov_b32_e32 v1, v0 ; CHECK: tbuffer_store_format_xyzw v[0:3], define amdgpu_vs void @main(i32 inreg %arg) { diff --git a/test/CodeGen/AMDGPU/over-max-lds-size.ll b/test/CodeGen/AMDGPU/over-max-lds-size.ll deleted file mode 100644 index 57777e783c5..00000000000 --- a/test/CodeGen/AMDGPU/over-max-lds-size.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck -check-prefix=ERROR %s -; RUN: not llc -march=amdgcn -mcpu=hawaii < %s 2>&1 | FileCheck -check-prefix=ERROR %s -; RUN: not llc -march=amdgcn -mcpu=fiji < %s 2>&1 | FileCheck -check-prefix=ERROR %s - -; ERROR: error: local memory limit exceeded (400000) in use_huge_lds - -@huge = internal unnamed_addr addrspace(3) global [100000 x i32] undef, align 4 - -define amdgpu_kernel void @use_huge_lds() { -entry: - %v0 = getelementptr inbounds [100000 x i32], [100000 x i32] addrspace(3)* @huge, i32 0, i32 0 - store i32 0, i32 addrspace(3)* %v0 - ret void -} diff --git a/test/CodeGen/AMDGPU/promote-alloca-globals.ll b/test/CodeGen/AMDGPU/promote-alloca-globals.ll index 4403d1f2306..4fd9c01a57f 100644 --- a/test/CodeGen/AMDGPU/promote-alloca-globals.ll +++ b/test/CodeGen/AMDGPU/promote-alloca-globals.ll @@ -8,7 +8,8 @@ ; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { ; IR: alloca [10 x i32] ; ASM-LABEL: {{^}}promote_alloca_size_256: -; ASM: ; LDSByteSize: 60000 bytes/workgroup (compile time only) +; ASM: .amdgpu_lds global_array0, 30000, 4 +; ASM: .amdgpu_lds global_array1, 30000, 4 define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { entry: diff --git a/test/CodeGen/AMDGPU/shl_add_ptr.ll b/test/CodeGen/AMDGPU/shl_add_ptr.ll index 379dcd62302..6c8891d28d6 100644 --- a/test/CodeGen/AMDGPU/shl_add_ptr.ll +++ b/test/CodeGen/AMDGPU/shl_add_ptr.ll @@ -33,7 +33,11 @@ define amdgpu_kernel void @load_shl_base_lds_0(float addrspace(1)* %out, i32 add ; remaining add use goes through the normal shl + add constant fold. ; GCN-LABEL: {{^}}load_shl_base_lds_1: -; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}} + +; TODO: integrate into the ds_read_b32 offset using a 16-bit relocation +; GCN: v_add_{{[iu]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]] + ; GCN: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8 ; GCN: v_add_{{[iu]}}32_e32 [[ADDUSE:v[0-9]+]], vcc, 8, v{{[0-9]+}} ; GCN-DAG: buffer_store_dword [[RESULT]] @@ -68,10 +72,18 @@ define amdgpu_kernel void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i ; The two globals are placed adjacent in memory, so the same base ; pointer can be used with an offset into the second one. +; TODO: Recover the optimization of using ds_read2st64_b32 using alignment hints + ; GCN-LABEL: {{^}}load_shl_base_lds_2: -; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}} +; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR0:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]] +; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR1:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]] ; GCN: s_mov_b32 m0, -1 -; GCN-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9 + +; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR0]] offset:256 +; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR1]] offset:256 +; TODO: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9 + ; GCN: s_endpgm define amdgpu_kernel void @load_shl_base_lds_2(float addrspace(1)* %out) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/test/CodeGen/AMDGPU/si-sgpr-spill.ll index 9a731c6bbbc..54b4b4d2586 100644 --- a/test/CodeGen/AMDGPU/si-sgpr-spill.ll +++ b/test/CodeGen/AMDGPU/si-sgpr-spill.ll @@ -4,16 +4,11 @@ ; These tests check that the compiler won't crash when it needs to spill ; SGPRs. -@ddxy_lds = external addrspace(3) global [64 x i32] - ; GCN-LABEL: {{^}}main: ; GCN: s_wqm ; Make sure not emitting unused scratch resource descriptor setup ; GCN-NOT: s_mov_b32 -; GCN-NOT: s_mov_b32 -; GCN-NOT: s_mov_b32 -; GCN-NOT: s_mov_b32 ; GCN: s_mov_b32 m0 @@ -26,6 +21,7 @@ ; TOVGPR: ScratchSize: 0{{$}} define amdgpu_ps void @main([17 x <4 x i32>] addrspace(4)* byval %arg, [32 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <8 x i32>] addrspace(4)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) { main_body: + %lds = inttoptr i32 0 to [64 x i32] addrspace(3)* %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i64 0, i32 0 %tmp21 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0 %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 96, i32 0) @@ -203,18 +199,18 @@ main_body: %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 5, i32 %arg4) #0 %mbcnt.lo.0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %tmp109 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.0) - %tmp110 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp109 + %tmp110 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp109 %tmp111 = bitcast float %p2.i to i32 store i32 %tmp111, i32 addrspace(3)* %tmp110 %tmp112 = bitcast float %p2.i96 to i32 store i32 %tmp112, i32 addrspace(3)* %tmp110 %mbcnt.lo.1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %tmp113 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.1) - %tmp114 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp113 + %tmp114 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp113 %tmp115 = and i32 %tmp113, -4 - %tmp116 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp115 + %tmp116 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp115 %tmp117 = add i32 %tmp115, 1 - %tmp118 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp117 + %tmp118 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp117 %tmp119 = bitcast float %p2.i to i32 store i32 %tmp119, i32 addrspace(3)* %tmp114 %tmp120 = load i32, i32 addrspace(3)* %tmp116 @@ -241,7 +237,7 @@ main_body: %tmp140 = fmul float %tmp59, %p2.i96 %mbcnt.lo.2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %tmp141 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.2) - %tmp142 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp141 + %tmp142 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp141 %tmp143 = bitcast float %tmp137 to i32 store i32 %tmp143, i32 addrspace(3)* %tmp142 %tmp144 = bitcast float %tmp138 to i32 @@ -252,11 +248,11 @@ main_body: store i32 %tmp146, i32 addrspace(3)* %tmp142 %mbcnt.lo.3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %tmp147 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.3) - %tmp148 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp147 + %tmp148 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp147 %tmp149 = and i32 %tmp147, -4 - %tmp150 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp149 + %tmp150 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp149 %tmp151 = add i32 %tmp149, 2 - %tmp152 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp151 + %tmp152 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp151 %tmp153 = bitcast float %tmp137 to i32 store i32 %tmp153, i32 addrspace(3)* %tmp148 %tmp154 = load i32, i32 addrspace(3)* %tmp150 diff --git a/test/CodeGen/AMDGPU/target-cpu.ll b/test/CodeGen/AMDGPU/target-cpu.ll index cc4c98bb678..4750cf2020d 100644 --- a/test/CodeGen/AMDGPU/target-cpu.ll +++ b/test/CodeGen/AMDGPU/target-cpu.ll @@ -78,7 +78,6 @@ define amdgpu_kernel void @target_fiji() #4 { ; CHECK-LABEL: {{^}}promote_alloca_enabled: ; CHECK: ds_read_b32 -; CHECK: ; LDSByteSize: 5120 define amdgpu_kernel void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 { entry: %stack = alloca [5 x i32], align 4, addrspace(5) diff --git a/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/test/CodeGen/MIR/AMDGPU/machine-function-info.ll index df004562d49..4e233495f5f 100644 --- a/test/CodeGen/MIR/AMDGPU/machine-function-info.ll +++ b/test/CodeGen/MIR/AMDGPU/machine-function-info.ll @@ -10,7 +10,7 @@ ; CHECK: machineFunctionInfo: ; CHECK-NEXT: explicitKernArgSize: 128 ; CHECK-NEXT: maxKernArgAlign: 64 -; CHECK-NEXT: ldsSize: 2048 +; CHECK-NEXT: ldsSize: 0 ; CHECK-NEXT: isEntryFunction: true ; CHECK-NEXT: noSignedZerosFPMath: false ; CHECK-NEXT: memoryBound: false -- 2.40.0