void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
unsigned LoadStoreOp,
- const MachineOperand *SrcDst,
+ int Index,
+ unsigned ValueReg,
+ bool IsKill,
unsigned ScratchRsrcReg,
- unsigned ScratchOffset,
- int64_t Offset,
+ unsigned ScratchOffsetReg,
+ int64_t InstOffset,
+ MachineMemOperand *MMO,
RegScavenger *RS) const {
- unsigned Value = SrcDst->getReg();
- bool IsKill = SrcDst->isKill();
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MI->getParent()->getParent();
const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
- DebugLoc DL = MI->getDebugLoc();
- bool IsStore = MI->mayStore();
+ const MCInstrDesc &Desc = TII->get(LoadStoreOp);
+ const DebugLoc &DL = MI->getDebugLoc();
+ bool IsStore = Desc.mayStore();
bool RanOutOfSGPRs = false;
bool Scavenged = false;
- unsigned SOffset = ScratchOffset;
- unsigned OriginalImmOffset = Offset;
+ unsigned SOffset = ScratchOffsetReg;
- unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
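+ // The spill is lowered one 32-bit element at a time, so the element count
+ // follows from the width of the value's register class rather than from the
+ // spill opcode.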
+ const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
+ unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / 32;
unsigned Size = NumSubRegs * 4;
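+ // The frame index is resolved here: the final scratch offset is the
+ // instruction's immediate offset plus the frame object's offset.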
+ int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
+ const int64_t OriginalImmOffset = Offset;
+
+ unsigned Align = MFI.getObjectAlignment(Index);
+ const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
if (!isUInt<12>(Offset + Size)) {
SOffset = AMDGPU::NoRegister;
// subtract the offset after the spill to return ScratchOffset to its
// original value.
RanOutOfSGPRs = true;
- SOffset = ScratchOffset;
+ SOffset = ScratchOffsetReg;
} else {
Scavenged = true;
}
+
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
- .addReg(ScratchOffset)
- .addImm(Offset);
+ .addReg(ScratchOffsetReg)
+ .addImm(Offset);
+
Offset = 0;
}
- for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) {
+ const unsigned EltSize = 4;
+
+ for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
unsigned SubReg = NumSubRegs == 1 ?
- Value : getSubReg(Value, getSubRegFromChannel(i));
+ ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i));
unsigned SOffsetRegState = 0;
unsigned SrcDstRegState = getDefRegState(!IsStore);
SrcDstRegState |= getKillRegState(IsKill);
}
- BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
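+ // Give each 32-bit element its own memory operand, offset from the base
+ // pointer info, so the recorded size and alignment match the actual dword
+ // access instead of the whole spill slot.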
+ MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
+ MachineMemOperand *NewMMO
+ = MF->getMachineMemOperand(PInfo, MMO->getFlags(),
+ EltSize, MinAlign(Align, EltSize * i));
+
+ BuildMI(*MBB, MI, DL, Desc)
.addReg(SubReg, getDefRegState(!IsStore))
.addReg(ScratchRsrcReg)
.addReg(SOffset, SOffsetRegState)
.addImm(0) // glc
.addImm(0) // slc
.addImm(0) // tfe
- .addReg(Value, RegState::Implicit | SrcDstRegState)
- .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ .addMemOperand(NewMMO)
+ .addReg(ValueReg, RegState::Implicit | SrcDstRegState);
}
+
if (RanOutOfSGPRs) {
// Subtract the offset we added to the ScratchOffset register.
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffset)
- .addReg(ScratchOffset)
- .addImm(OriginalImmOffset);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg)
+ .addReg(ScratchOffsetReg)
+ .addImm(OriginalImmOffset);
}
}
bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
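+ // SGPRs are spilled one 32-bit sub-register at a time; the pointer info and
+ // memory operands below describe a single 4-byte element of the slot.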
+ const unsigned EltSize = 4;
+
// SubReg carries the "Kill" flag when SubReg == SuperReg.
unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
}
int64_t FrOffset = FrameInfo.getObjectOffset(Index);
- unsigned Size = FrameInfo.getObjectSize(Index);
unsigned Align = FrameInfo.getObjectAlignment(Index);
MachinePointerInfo PtrInfo
- = MachinePointerInfo::getFixedStack(*MF, Index);
+ = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
MachineMemOperand *MMO
= MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
- Size, Align);
+ EltSize, MinAlign(Align, EltSize * i));
unsigned OffsetReg = AMDGPU::M0;
// Add i * 4 wave offset.
Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
}
- unsigned Size = FrameInfo.getObjectSize(Index);
unsigned Align = FrameInfo.getObjectAlignment(Index);
MachinePointerInfo PtrInfo
- = MachinePointerInfo::getFixedStack(*MF, Index);
+ = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
MachineMemOperand *MMO
= MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
- Size, Align);
+ EltSize, MinAlign(Align, EltSize * i));
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
.addReg(TmpReg, RegState::Kill) // src
.addFrameIndex(Index) // vaddr
int64_t FrOffset = FrameInfo.getObjectOffset(Index);
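+ // Restores mirror the spill path: each 32-bit sub-register is reloaded as
+ // its own 4-byte element.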
+ const unsigned EltSize = 4;
+
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
unsigned SubReg = NumSubRegs == 1 ?
SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i));
if (SpillToSMEM) {
- unsigned Size = FrameInfo.getObjectSize(Index);
unsigned Align = FrameInfo.getObjectAlignment(Index);
MachinePointerInfo PtrInfo
- = MachinePointerInfo::getFixedStack(*MF, Index);
+ = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
MachineMemOperand *MMO
= MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
- Size, Align);
+ EltSize, MinAlign(Align, EltSize * i));
unsigned OffsetReg = AMDGPU::M0;
} else {
// Restore SGPR from a stack slot.
// FIXME: We should use S_LOAD_DWORD here for VI.
-
unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned Align = FrameInfo.getObjectAlignment(Index);
- unsigned Size = FrameInfo.getObjectSize(Index);
MachinePointerInfo PtrInfo
- = MachinePointerInfo::getFixedStack(*MF, Index);
+ = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
- MachineMemOperand *MMO = MF->getMachineMemOperand(
- PtrInfo, MachineMemOperand::MOLoad, Size, Align);
+ MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo,
+ MachineMemOperand::MOLoad, EltSize,
+ MinAlign(Align, EltSize * i));
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
.addFrameIndex(Index) // vaddr
.addReg(MFI->getScratchWaveOffsetReg()) // soffset
.addImm(i * 4) // offset
.addMemOperand(MMO);
- BuildMI(*MBB, MI, DL,
- TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
.addReg(TmpReg, RegState::Kill)
.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
}
case AMDGPU::SI_SPILL_V128_SAVE:
case AMDGPU::SI_SPILL_V96_SAVE:
case AMDGPU::SI_SPILL_V64_SAVE:
- case AMDGPU::SI_SPILL_V32_SAVE:
+ case AMDGPU::SI_SPILL_V32_SAVE: {
+ const MachineOperand *VData = TII->getNamedOperand(*MI,
+ AMDGPU::OpName::vdata);
buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
- TII->getNamedOperand(*MI, AMDGPU::OpName::vdata),
+ Index,
+ VData->getReg(), VData->isKill(),
TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
- FrameInfo.getObjectOffset(Index) +
- TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS);
+ TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
+ *MI->memoperands_begin(),
+ RS);
MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
MI->eraseFromParent();
break;
+ }
case AMDGPU::SI_SPILL_V32_RESTORE:
case AMDGPU::SI_SPILL_V64_RESTORE:
case AMDGPU::SI_SPILL_V96_RESTORE:
case AMDGPU::SI_SPILL_V128_RESTORE:
case AMDGPU::SI_SPILL_V256_RESTORE:
case AMDGPU::SI_SPILL_V512_RESTORE: {
+ const MachineOperand *VData = TII->getNamedOperand(*MI,
+ AMDGPU::OpName::vdata);
+
buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
- TII->getNamedOperand(*MI, AMDGPU::OpName::vdata),
+ Index,
+ VData->getReg(), VData->isKill(),
TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
- FrameInfo.getObjectOffset(Index) +
- TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS);
+ TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
+ *MI->memoperands_begin(),
+ RS);
MI->eraseFromParent();
break;
}
; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 ; 8-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 ; 4-byte Folded Spill
; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:4 ; 8-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill
; Spill load
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 ; 8-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:4 ; 8-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:16 ; 8-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:16 ; 4-byte Folded Spill
; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:20 ; 8-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:20 ; 4-byte Folded Spill
; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:16 ; 8-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:16 ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:20 ; 8-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:20 ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]
; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[SAVEEXEC_LO_OFFSET:[0-9]+]] ; 8-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET:[0-9]+]] ; 8-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: s_mov_b64 exec, [[CMP0]]
; GCN: s_waitcnt vmcnt(0) expcnt(0)
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC_LO]]
-; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET]] ; 8-byte Folded Reload
+; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC_HI]]
; VMEM: v_mov_b32_e32 v[[FLOW_V_SAVEEXEC_LO:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET:[0-9]+]] ; 8-byte Folded Spill
+; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; VMEM: v_mov_b32_e32 v[[FLOW_V_SAVEEXEC_HI:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET:[0-9]+]] ; 8-byte Folded Spill
+; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], s7 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}}
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_HI_LANE]]
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET]] ; 8-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET]] ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET]] ; 8-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
; Make sure scratch wave offset register is correctly incremented and
; then restored.
; SMEM: s_mov_b32 m0, s91{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Spill
+; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
; SMEM: s_add_u32 m0, s91, 0x100{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Spill
+; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
; SMEM: s_add_u32 m0, s91, 0x200{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Spill
+; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
; SMEM: s_add_u32 m0, s91, 0x300{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Spill
+; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
; SMEM: s_mov_b32 m0, s91{{$}}
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Reload
+; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
; SMEM: s_add_u32 m0, s91, 0x100{{$}}
; SMEM: s_waitcnt lgkmcnt(0)
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Reload
+; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
; SMEM: s_add_u32 m0, s91, 0x200{{$}}
; SMEM: s_waitcnt lgkmcnt(0)
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Reload
+; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
; SMEM: s_add_u32 m0, s91, 0x300{{$}}
; SMEM: s_waitcnt lgkmcnt(0)
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 16-byte Folded Reload
+; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
; ALL: s_endpgm
define void @test(i32 addrspace(1)* %out, i32 %in) {