bool runOnMachineFunction(MachineFunction &MF) override;
void Select(SDNode *N) override;
const char *getPassName() const override;
- void PreprocessISelDAG() override;
void PostprocessISelDAG() override;
private:
+ SDValue foldFrameIndex(SDValue N) const;
bool isInlineImmediate(const SDNode *N) const;
bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
const R600InstrInfo *TII);
return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE);
}
+SDValue AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
+ if (auto FI = dyn_cast<FrameIndexSDNode>(N))
+ return CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
+ return N;
+}
+
bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
SDValue &VAddr, SDValue &SOffset,
SDValue &ImmOffset) const {
// Offsets in vaddr must be positive.
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
if (isLegalMUBUFImmOffset(C1)) {
- VAddr = N0;
+ VAddr = foldFrameIndex(N0);
ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
return true;
}
}
// (node)
- VAddr = Addr;
+ VAddr = foldFrameIndex(Addr);
ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
return true;
}
return SelectVOP3Mods(In, Src, SrcMods);
}
-void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
- MachineFrameInfo &MFI = CurDAG->getMachineFunction().getFrameInfo();
-
- // Handle the perverse case where a frame index is being stored. We don't
- // want to see multiple frame index operands on the same instruction since
- // it complicates things and violates some assumptions about frame index
- // lowering.
- for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
- I != E; ++I) {
- SDValue FI = CurDAG->getTargetFrameIndex(I, MVT::i32);
-
- // It's possible that we have a frame index defined in the function that
- // isn't used in this block.
- if (FI.use_empty())
- continue;
-
- // Skip over the AssertZext inserted during lowering.
- SDValue EffectiveFI = FI;
- auto It = FI->use_begin();
- if (It->getOpcode() == ISD::AssertZext && FI->hasOneUse()) {
- EffectiveFI = SDValue(*It, 0);
- It = EffectiveFI->use_begin();
- }
-
- for (auto It = EffectiveFI->use_begin(); !It.atEnd(); ) {
- SDUse &Use = It.getUse();
- SDNode *User = Use.getUser();
- unsigned OpIdx = It.getOperandNo();
- ++It;
-
- if (MemSDNode *M = dyn_cast<MemSDNode>(User)) {
- unsigned PtrIdx = M->getOpcode() == ISD::STORE ? 2 : 1;
- if (OpIdx == PtrIdx)
- continue;
-
- unsigned OpN = M->getNumOperands();
- SDValue NewOps[8];
-
- assert(OpN < array_lengthof(NewOps));
- for (unsigned Op = 0; Op != OpN; ++Op) {
- if (Op != OpIdx) {
- NewOps[Op] = M->getOperand(Op);
- continue;
- }
-
- MachineSDNode *Mov = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
- SDLoc(M), MVT::i32, FI);
- NewOps[Op] = SDValue(Mov, 0);
- }
-
- CurDAG->UpdateNodeOperands(M, makeArrayRef(NewOps, OpN));
- }
- }
- }
-}
-
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
const AMDGPUTargetLowering& Lowering =
*static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
- setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
setOperationAction(ISD::SELECT, MVT::i1, Promote);
SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
- case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::LOAD: {
SDValue Result = LowerLOAD(Op, DAG);
return nullptr;
}
-SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
-
- SDLoc SL(Op);
- FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
- unsigned FrameIndex = FINode->getIndex();
-
- // A FrameIndex node represents a 32-bit offset into scratch memory. If the
- // high bit of a frame index offset were to be set, this would mean that it
- // represented an offset of ~2GB * 64 = ~128GB from the start of the scratch
- // buffer, with 64 being the number of threads per wave.
- //
- // The maximum private allocation for the entire GPU is 4G, and we are
- // concerned with the largest the index could ever be for an individual
- // workitem. This will occur with the minmum dispatch size. If a program
- // requires more, the dispatch size will be reduced.
- //
- // With this limit, we can mark the high bit of the FrameIndex node as known
- // zero, which is important, because it means in most situations we can prove
- // that values derived from FrameIndex nodes are non-negative. This enables us
- // to take advantage of more addressing modes when accessing scratch buffers,
- // since for scratch reads/writes, the register offset must always be
- // positive.
-
- uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024;
-
- // XXX - It is unclear if partial dispatch works. Assume it works at half wave
- // granularity. It is probably a full wave.
- uint64_t MinGranularity = 32;
-
- unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity);
- EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits);
-
- SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
- return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
- DAG.getValueType(ExtVT));
-}
-
bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32);
}]>;
+def frameindex_to_targetframeindex : SDNodeXForm<frameindex, [{
+ auto FI = cast<FrameIndexSDNode>(N);
+ return CurDAG->getTargetFrameIndex(FI->getIndex(), MVT::i32);
+}]>;
+
// Copied from the AArch64 backend:
def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{
return CurDAG->getTargetConstant(
(V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
>;
+def : Pat <
+ (i32 frameindex:$fi),
+ (V_MOV_B32_e32 (i32 (frameindex_to_targetframeindex $fi)))
+>;
+
def : Pat <
(i64 InlineImm<i64>:$imm),
(S_MOV_B64 InlineImm<i64>:$imm)
; RUN: llc -march=amdgcn -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; GCN-LABEL: {{^}}store_fi_lifetime:
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[FI]]
+define void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 {
+entry:
+ %b = alloca i8
+ call void @llvm.lifetime.start(i64 1, i8* %b)
+ store volatile i8* %b, i8* addrspace(1)* undef
+ call void @llvm.lifetime.end(i64 1, i8* %b)
+ ret void
+}
+
; GCN-LABEL: {{^}}stored_fi_to_lds:
; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}
}
; GCN-LABEL: {{^}}stored_fi_to_global_huge_frame_offset:
-; GCN: s_add_i32 [[BASE_1_OFF_0:s[0-9]+]], 0, 0x3ffc
+; GCN: v_mov_b32_e32 [[VAL_0:v[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 [[BASE_0:v[0-9]+]], 0{{$}}
-; GCN: buffer_store_dword [[BASE_0]], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+; GCN: buffer_store_dword [[VAL_0]], [[BASE_0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+
+; GCN: v_mov_b32_e32 [[BASE_0_1:v[0-9]+]], 0{{$}}
+; GCN: v_add_i32_e32 [[BASE_1_OFF_0:v[0-9]+]], vcc, 0x3ffc, [[BASE_0_1]]
-; GCN: v_mov_b32_e32 [[V_BASE_1_OFF_0:v[0-9]+]], [[BASE_1_OFF_0]]
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
-; GCN: s_add_i32 [[BASE_1_OFF_1:s[0-9]+]], 0, 56
-; GCN: buffer_store_dword [[K]], [[V_BASE_1_OFF_0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; GCN: v_add_i32_e32 [[BASE_1_OFF_1:v[0-9]+]], vcc, 56, [[BASE_0_1]]
+; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
-; GCN: v_mov_b32_e32 [[V_BASE_1_OFF_1:v[0-9]+]], [[BASE_1_OFF_1]]
-; GCN: buffer_store_dword [[V_BASE_1_OFF_1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN: buffer_store_dword [[BASE_1_OFF_1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 {
%tmp0 = alloca [4096 x i32]
%tmp1 = alloca [4096 x i32]
ret void
}
+@g1 = external addrspace(1) global i32*
+
+; This was leaving a dead node around resulting in failing to select
+; on the leftover AssertZext's ValueType operand.
+
+; GCN-LABEL: {{^}}cannot_select_assertzext_valuetype:
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, g1@GOTPCREL+4
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[FI]]
+define void @cannot_select_assertzext_valuetype(i32 addrspace(1)* %out, i32 %idx) #0 {
+entry:
+ %b = alloca i32, align 4
+ %tmp1 = load volatile i32*, i32* addrspace(1)* @g1, align 4
+ %arrayidx = getelementptr inbounds i32, i32* %tmp1, i32 %idx
+ %tmp2 = load i32, i32* %arrayidx, align 4
+ store volatile i32* %b, i32* addrspace(1)* undef
+ ret void
+}
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
attributes #0 = { nounwind }
+attributes #1 = { argmemonly nounwind }
;
; CHECK-LABEL: {{^}}main:
; CHECK: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
-; CHECK: v_add_i32_e32 [[HI_OFF:v[0-9]+]], vcc, 0x200, [[BYTES]]
-; CHECK: v_add_i32_e32 [[LO_OFF:v[0-9]+]], vcc, 0, [[BYTES]]
+; CHECK-DAG: v_mov_b32_e32 [[ZERO_BASE_FI:v[0-9]+]], 0{{$}}
+; CHECK-DAG: v_add_i32_e32 [[HI_OFF:v[0-9]+]], vcc, 0x200, [[BYTES]]
+; CHECK-DAG: v_add_i32_e32 [[LO_OFF:v[0-9]+]], vcc, 0, [[BYTES]]
+
; CHECK: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
; CHECK: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
define amdgpu_ps float @main(i32 %idx) {