int Offset);
bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
- void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg, int &Offset);
+ std::pair<unsigned, int> computeIndirectRegAndOffset(unsigned VecReg,
+ int Offset) const;
bool indirectSrc(MachineInstr &MI);
bool indirectDst(MachineInstr &MI);
// indirect Index. e.g. v0 = v[VecReg + Offset]
-// As an output, this is a constant value that needs
-// to be added to the value stored in M0.
+// Returns the register to use in the indirect addressing instruction and
+// the constant offset that still has to be added to the value stored in M0.
-void SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg,
- unsigned &Reg,
- int &Offset) {
+std::pair<unsigned, int>
+SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg,
+ int Offset) const {
unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
if (!SubReg)
SubReg = VecReg;
+ const TargetRegisterClass *SuperRC = TRI->getPhysRegClass(VecReg);
const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg);
- int RegIdx = TRI->getHWRegIndex(SubReg) + Offset;
+ int NumElts = SuperRC->getSize() / RC->getSize();
+ int BaseRegIdx = TRI->getHWRegIndex(SubReg);
+
+ // Skip out of bounds offsets, or else we would end up using an undefined
+ // register.
+ if (Offset >= NumElts)
+ return std::make_pair(RC->getRegister(BaseRegIdx), Offset);
+
+ int RegIdx = BaseRegIdx + Offset;
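+  // A negative register index is clamped to the first register in the
+  // class; the negative remainder is instead carried in the offset that
+  // gets added to M0.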
  if (RegIdx < 0) {
    Offset = RegIdx;
    RegIdx = 0;
  } else {
    Offset = 0;
  }
- Reg = RC->getRegister(RegIdx);
+ unsigned Reg = RC->getRegister(RegIdx);
+ return std::make_pair(Reg, Offset);
}
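To make the clamping rules concrete, here is a minimal standalone model of the arithmetic above (the `modelRegAndOffset` helper and the example indices are hypothetical and not part of the patch); it sketches the behaviour under the assumption that `BaseRegIdx` is the hardware index of the vector's first register:

#include <cassert>
#include <utility>

// Sketch of computeIndirectRegAndOffset's clamping: returns the register
// index to encode statically and the constant still to be added to M0.
static std::pair<int, int> modelRegAndOffset(int BaseRegIdx, int Offset,
                                             int NumElts) {
  if (Offset >= NumElts)      // out of bounds: leave the whole offset to M0
    return {BaseRegIdx, Offset};
  int RegIdx = BaseRegIdx + Offset;
  if (RegIdx < 0) {           // underflow: clamp, carry the remainder in M0
    Offset = RegIdx;
    RegIdx = 0;
  } else {
    Offset = 0;
  }
  return {RegIdx, Offset};
}

int main() {
  assert(modelRegAndOffset(8, 3, 4) == std::make_pair(11, 0));  // in bounds
  assert(modelRegAndOffset(8, 4, 4) == std::make_pair(8, 4));   // too large
  assert(modelRegAndOffset(1, -3, 4) == std::make_pair(0, -2)); // negative
  return 0;
}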
// Return true if a new block was inserted.
int Off = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
unsigned Reg;
- computeIndirectRegAndOffset(SrcVec->getReg(), Reg, Off);
+ std::tie(Reg, Off) = computeIndirectRegAndOffset(SrcVec->getReg(), Off);
MachineInstr *MovRel =
BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
unsigned Reg;
- computeIndirectRegAndOffset(Dst, Reg, Off);
+ std::tie(Reg, Off) = computeIndirectRegAndOffset(Dst, Off);
MachineInstr *MovRel =
BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
ret void
}
+
+; CHECK-LABEL: {{^}}extract_largest_inbounds_offset:
+; CHECK: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}}
+; CHECK: s_load_dword [[IDX:s[0-9]+]]
+; CHECK: s_mov_b32 m0, [[IDX]]
+; CHECK-NEXT: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[HI_ELT]]
+; CHECK: buffer_store_dword [[EXTRACT]]
+define void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
+entry:
+ %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+ %offset = add i32 %idx, 3
+ %value = extractelement <4 x i32> %ld, i32 %offset
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; The offset puts the element outside of the superregister's boundaries, so
+; the statically selected register is clamped to the first element and the
+; whole constant offset is instead added to m0 at run time.
+; CHECK-LABEL: {{^}}extract_out_of_bounds_offset:
+; CHECK: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}}
+; CHECK: s_load_dword [[IDX:s[0-9]+]]
+; CHECK: s_add_i32 m0, [[IDX]], 4
+; CHECK-NEXT: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
+; CHECK: buffer_store_dword [[EXTRACT]]
+define void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
+entry:
+ %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+ %offset = add i32 %idx, 4
+ %value = extractelement <4 x i32> %ld, i32 %offset
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #1
attributes #0 = { nounwind }