const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
const GCNSubtarget *ST;
+ const SIMachineFunctionInfo *MFI;
void foldOperand(MachineOperand &OpToFold,
MachineInstr *UseMI,
- unsigned UseOpIdx,
+ int UseOpIdx,
SmallVectorImpl<FoldCandidate> &FoldList,
SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
}
}
+// TODO: Add a heuristic for when the frame index may not fit in the addressing
+// mode immediate offset, to avoid materializing it in loops.
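+// Returns true if OpToFold is a frame index used as the vaddr operand of a
+// MUBUF or scratch FLAT instruction, where it can potentially be folded into
+// the addressing mode.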
+static bool frameIndexMayFold(const SIInstrInfo *TII,
+ const MachineInstr &UseMI,
+ int OpNo,
+ const MachineOperand &OpToFold) {
+ return OpToFold.isFI() &&
+ (TII->isMUBUF(UseMI) || TII->isFLATScratch(UseMI)) &&
+ OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::vaddr);
+}
+
FunctionPass *llvm::createSIFoldOperandsPass() {
return new SIFoldOperands();
}
MachineOperand *OpToFold,
const SIInstrInfo *TII) {
if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
-
// Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
unsigned Opc = MI->getOpcode();
if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
void SIFoldOperands::foldOperand(
MachineOperand &OpToFold,
MachineInstr *UseMI,
- unsigned UseOpIdx,
+ int UseOpIdx,
SmallVectorImpl<FoldCandidate> &FoldList,
SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
return;
}
+ if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {
+ // Sanity check that this is a stack access.
+ // FIXME: Should probably use stack pseudos before frame lowering.
+ MachineOperand *SOff = TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
+ if (!SOff->isReg() || (SOff->getReg() != MFI->getScratchWaveOffsetReg() &&
+ SOff->getReg() != MFI->getStackPtrOffsetReg()))
+ return;
+
+ if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
+ MFI->getScratchRSrcReg())
+ return;
- bool FoldingImm = OpToFold.isImm();
+ // A frame index will resolve to a positive constant, so it should always be
+ // safe to fold it into the addressing mode, even pre-GFX9.
+ UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
+ SOff->setReg(MFI->getStackPtrOffsetReg());
+ return;
+ }
- if (FoldingImm && UseMI->isCopy()) {
+ bool FoldingImmLike = OpToFold.isImm() || OpToFold.isFI();
+
+ if (FoldingImmLike && UseMI->isCopy()) {
unsigned DestReg = UseMI->getOperand(0).getReg();
const TargetRegisterClass *DestRC
= TargetRegisterInfo::isVirtualRegister(DestReg) ?
// %sgpr = V_READFIRSTLANE_B32 %vgpr
// =>
// %sgpr = S_MOV_B32 imm
- if (FoldingImm) {
+ if (FoldingImmLike) {
if (execMayBeModifiedBeforeUse(*MRI,
UseMI->getOperand(UseOpIdx).getReg(),
*OpToFold.getParent(),
// FIXME: ChangeToImmediate should clear subreg
UseMI->getOperand(1).setSubReg(0);
- UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+ if (OpToFold.isImm())
+ UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+ else
+ UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
return;
}
return;
}
- if (!FoldingImm) {
+ if (!FoldingImmLike) {
tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
// FIXME: We could try to change the instruction from 64-bit to 32-bit
// in some cases. A better heuristic is needed.
if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
+ } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) {
+ foldOperand(OpToFold, UseMI, OpNo, FoldList,
+ CopiesToReplace);
} else {
if (++NumLiteralUses == 1) {
NonInlineUse = &*Use;
ST = &MF.getSubtarget<GCNSubtarget>();
TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();
-
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ MFI = MF.getInfo<SIMachineFunctionInfo>();
// omod is ignored by hardware if IEEE bit is enabled. omod also does not
// correctly handle signed zeros.
return (Flags & SIInstrFlags::FLAT) && !(Flags & SIInstrFlags::LGKM_CNT);
}
+ // FIXME: Make this more precise
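+ // (isSegmentSpecificFLAT also matches global_* instructions, not just
+ // scratch_*.)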
+ static bool isFLATScratch(const MachineInstr &MI) {
+ return isSegmentSpecificFLAT(MI);
+ }
+
// Any FLAT encoded instruction, including global_* and scratch_*.
bool isFLAT(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::FLAT;
ret void
}
+; Make sure the offset is folded and the function's frame register is used
+; rather than the global scratch wave offset.
+; GCN-LABEL: {{^}}void_func_byval_struct_use_outside_entry_block:
+; GCN-NOT: v_lshrrev_b32
+; GCN-NOT: s_sub_u32
+
+; GCN: s_and_saveexec_b64
+; GCN: s_cbranch_execz [[BB1:BB[0-9]+_[0-9]+]]
+
+; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32{{$}}
+; GCN-NOT: s32
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}}
+; GCN-NOT: s32
+
+; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:16{{$}}
+; GCN-NOT: s32
+; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:16{{$}}
+; GCN-NOT: s32
+
+; GCN: [[BB1]]
+; GCN: s_or_b64 exec, exec
+define hidden void @void_func_byval_struct_use_outside_entry_block(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1, i1 %cond) #1 {
+entry:
+ br i1 %cond, label %bb0, label %bb1
+
+bb0:
+ %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
+ %tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 4
+ %add = add nsw i32 %tmp, 1
+ store volatile i32 %add, i32 addrspace(5)* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
+ %tmp1 = load volatile i32, i32 addrspace(5)* %arrayidx2, align 4
+ %add3 = add nsw i32 %tmp1, 2
+ store volatile i32 %add3, i32 addrspace(5)* %arrayidx2, align 4
+ store volatile i32 9, i32 addrspace(1)* null, align 4
+ br label %bb1
+
+bb1:
+ ret void
+}
+
; GCN-LABEL: {{^}}void_func_byval_struct_non_leaf:
; GCN: s_mov_b32 s5, s32
; GCN: s_add_u32 s32, s32, 0xc00{{$}}
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination %s -o - | FileCheck -check-prefix=GCN %s
+
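+# The frame index should not be folded when the rsrc is not the scratch rsrc
+# or the soffset is not the stack pointer / scratch wave offset register.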
+---
+name: no_fold_fi_non_stack_rsrc_soffset
+tracksRegLiveness: true
+frameInfo:
+ maxAlignment: 4
+ localFrameSize: 4
+stack:
+ - { id: 0, size: 4, alignment: 4, local-offset: 0 }
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ scratchWaveOffsetReg: '$sgpr6'
+ frameOffsetReg: '$sgpr6'
+ stackPtrOffsetReg: '$sgpr6'
+body: |
+ bb.0:
+ liveins: $sgpr12_sgpr13_sgpr14_sgpr15
+
+ ; GCN-LABEL: name: no_fold_fi_non_stack_rsrc_soffset
+ ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+ ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, implicit $exec
+ ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]]
+ ; GCN: SI_RETURN_TO_EPILOG $vgpr0
+ %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
+ %1:sreg_32_xm0 = S_MOV_B32 0
+ %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+ %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, %1, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr0 = COPY %3
+ SI_RETURN_TO_EPILOG $vgpr0
+
+...
+
+---
+name: no_fold_fi_non_stack_rsrc
+tracksRegLiveness: true
+frameInfo:
+ maxAlignment: 4
+ localFrameSize: 4
+stack:
+ - { id: 0, size: 4, alignment: 4, local-offset: 0 }
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ scratchWaveOffsetReg: '$sgpr6'
+ frameOffsetReg: '$sgpr6'
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ liveins: $sgpr12_sgpr13_sgpr14_sgpr15
+
+ ; GCN-LABEL: name: no_fold_fi_non_stack_rsrc
+ ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+ ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+ ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]]
+ ; GCN: SI_RETURN_TO_EPILOG $vgpr0
+ %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
+ %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+ %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr0 = COPY %3
+ SI_RETURN_TO_EPILOG $vgpr0
+
+...
+
+# Offset is from the global scratch wave offset.
+---
+name: fold_fi_mubuf_scratch_scratch_wave_offset
+tracksRegLiveness: true
+frameInfo:
+ maxAlignment: 4
+ localFrameSize: 4
+stack:
+ - { id: 0, size: 4, alignment: 4, local-offset: 0 }
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+ scratchWaveOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: fold_fi_mubuf_scratch_scratch_wave_offset
+ ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+ ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+ ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+ ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+ ; GCN: S_ENDPGM 0, implicit $vgpr0
+ %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+ %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+
+ BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, implicit $exec
+ %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr0 = COPY %2
+ S_ENDPGM 0, implicit $vgpr0
+
+...
+
+---
+name: no_fold_fi_mubuf_scratch_sp_offset
+tracksRegLiveness: true
+frameInfo:
+ maxAlignment: 4
+ localFrameSize: 4
+stack:
+ - { id: 0, size: 4, alignment: 4, local-offset: 0 }
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+ scratchWaveOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: no_fold_fi_mubuf_scratch_sp_offset
+ ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+ ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+ ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+ ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+ ; GCN: S_ENDPGM 0, implicit $vgpr0
+ %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+ %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+
+ BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+ %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr0 = COPY %2
+ S_ENDPGM 0, implicit $vgpr0
+
+...
ret void
}
-; FIXME: Should be able to see that this can use vaddr, but the
-; FrameIndex is hidden behind a CopyFromReg in the second block.
-
; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block:
; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s33
; GCN: s_and_saveexec_b64
-; CI: v_add_i32_e32 v0, vcc, 4, [[SHIFT]]
-; CI: buffer_load_dword v1, v1, s[0:3], s33 offen offset:4{{$}}
+; CI: v_add_i32_e32 [[GEP:v[0-9]+]], vcc, 4, [[SHIFT]]
+; CI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}}
-; GFX9: v_add_u32_e32 v0, 4, [[SHIFT]]
-; GFX9: buffer_load_dword v1, v{{[0-9]+}}, s[0:3], s33 offen offset:4{{$}}
+; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SHIFT]]
+; GFX9: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}}
-; GCN: ds_write_b32
+; GCN: ds_write_b32 v{{[0-9]+}}, [[GEP]]
define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 } addrspace(5)* byval %arg0, i32 %arg2) #0 {
%cmp = icmp eq i32 %arg2, 0
br i1 %cmp, label %bb, label %ret
ret float %val
}
+; Make sure frame index folding doesn't crash on a MUBUF not used
+; for stack access.
+
+; CHECK-LABEL: {{^}}no_fold_fi_imm_soffset:
+; CHECK: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
+; CHECK-NEXT: buffer_load_dword v0, [[FI]], s{{\[[0-9]+:[0-9]+\]}}, 0 idxen
+define amdgpu_ps float @no_fold_fi_imm_soffset(<4 x i32> inreg %rsrc) {
+ %alloca = alloca i32, addrspace(5)
+ %alloca.cast = ptrtoint i32 addrspace(5)* %alloca to i32
+
+ %ret.val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %alloca.cast, i32 0, i1 false, i1 false)
+ ret float %ret.val
+}
+
+; CHECK-LABEL: {{^}}no_fold_fi_reg_soffset:
+; CHECK-DAG: v_mov_b32_e32 v[[FI:[0-9]+]], 4{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s
+; CHECK: buffer_load_dword v0, v{{\[}}[[FI]]:[[HI]]
+define amdgpu_ps float @no_fold_fi_reg_soffset(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
+ %alloca = alloca i32, addrspace(5)
+ %alloca.cast = ptrtoint i32 addrspace(5)* %alloca to i32
+
+ %ret.val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %alloca.cast, i32 %soffset, i1 false, i1 false)
+ ret float %ret.val
+}
+
declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0
declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0
; Make sure this doesn't crash.
; CHECK-LABEL: {{^}}test_readfirstlane_fi:
-; CHECK: v_mov_b32_e32 [[FIVAL:v[0-9]]], 4
-; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, [[FIVAL]]
+; CHECK: s_mov_b32 [[FIVAL:s[0-9]]], 4
define amdgpu_kernel void @test_readfirstlane_fi(i32 addrspace(1)* %out) #1 {
%alloca = alloca i32, addrspace(5)
%int = ptrtoint i32 addrspace(5)* %alloca to i32