bool IsModified = false;
do {
IsModified = false;
+
// Go over all selected nodes and try to fold them a bit more
- for (SDNode &Node : CurDAG->allnodes()) {
- MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(&Node);
+ SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
+ while (Position != CurDAG->allnodes_end()) {
+ SDNode *Node = &*Position++;
+ MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
if (!MachineNode)
continue;
SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
- if (ResNode != &Node) {
- ReplaceUses(&Node, ResNode);
+ if (ResNode != Node) {
+ if (ResNode)
+ ReplaceUses(Node, ResNode);
IsModified = true;
}
}
  } while (IsModified);
}
return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
}
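The explicit allnodes_iterator above is what makes node deletion legal: PostISelFolding may now free the node it was handed (adjustWritemask calls RemoveDeadNode and returns null), so the loop advances Position before visiting the node. A minimal standalone sketch of the same erase-safe traversal, using std::list in place of the DAG node list (processNode and its erase rule are illustrative, not LLVM API):

#include <iostream>
#include <list>

// Pretend "folding" that may delete the visited node, as PostISelFolding can.
static void processNode(std::list<int> &Nodes, std::list<int>::iterator It) {
  if (*It % 2 == 0)
    Nodes.erase(It); // invalidates only It, not the already-advanced Position
}

int main() {
  std::list<int> Nodes = {1, 2, 3, 4, 5};
  auto Position = Nodes.begin();
  while (Position != Nodes.end()) {
    auto Current = Position++; // advance first, so erasing Current is safe
    processNode(Nodes, Current);
  }
  for (int N : Nodes)
    std::cout << N << ' ';     // prints: 1 3 5
  std::cout << '\n';
}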
-int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
- switch (Channels) {
- default: return Opcode;
- case 1: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_1);
- case 2: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_2);
- case 3: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_3);
+static AMDGPU::Channels indexToChannel(unsigned Channel) {
+ switch (Channel) {
+ case 1:
+ return AMDGPU::Channels_1;
+ case 2:
+ return AMDGPU::Channels_2;
+ case 3:
+ return AMDGPU::Channels_3;
+ case 4:
+ return AMDGPU::Channels_4;
+ default:
+ llvm_unreachable("invalid MIMG channel");
}
}
+// FIXME: Need to handle d16 images correctly.
+static unsigned rcToChannels(unsigned RCID) {
+ switch (RCID) {
+ case AMDGPU::VGPR_32RegClassID:
+ return 1;
+ case AMDGPU::VReg_64RegClassID:
+ return 2;
+ case AMDGPU::VReg_96RegClassID:
+ return 3;
+ case AMDGPU::VReg_128RegClassID:
+ return 4;
+ default:
+ llvm_unreachable("invalid MIMG register class");
+ }
+}
+
+int AMDGPUInstrInfo::getMaskedMIMGOp(unsigned Opc,
+ unsigned NewChannels) const {
+ AMDGPU::Channels Channel = indexToChannel(NewChannels);
+ unsigned OrigChannels = rcToChannels(get(Opc).OpInfo[0].RegClass);
+ if (NewChannels == OrigChannels)
+ return Opc;
+
+ switch (OrigChannels) {
+ case 1:
+ return AMDGPU::getMaskedMIMGOp1(Opc, Channel);
+ case 2:
+ return AMDGPU::getMaskedMIMGOp2(Opc, Channel);
+ case 3:
+ return AMDGPU::getMaskedMIMGOp3(Opc, Channel);
+ case 4:
+ return AMDGPU::getMaskedMIMGOp4(Opc, Channel);
+ default:
+ llvm_unreachable("invalid MIMG channel");
+ }
+}
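getMaskedMIMGOp no longer assumes the input opcode writes all four channels: rcToChannels recovers the original channel count from the destination register class, and the switch picks the mapping table keyed on that count. A standalone sketch of the same row lookup, with one hard-coded row of variants (all opcode numbers and names here are invented; the real tables are TableGen-generated):

#include <cstdio>

// Hypothetical opcodes for one MIMG_Mask row, indexed by channel count 1..4
// (index 0 unused). The real rows come from the .td definitions.
static const int GetLodVariants[5] = {-1, 100, 101, 102, 103};

// Stand-in for getMaskedMIMGOp: remap Opc to the variant writing NewChannels,
// given the channel count recovered from the destination register class.
static int getMaskedOp(int Opc, unsigned OrigChannels, unsigned NewChannels) {
  if (NewChannels == OrigChannels)
    return Opc;                       // already the right variant
  if (Opc != GetLodVariants[OrigChannels])
    return -1;                        // opcode not in this row
  return GetLodVariants[NewChannels]; // same row, different channel column
}

int main() {
  printf("%d\n", getMaskedOp(103, 4, 2)); // 4-channel variant -> 101
  printf("%d\n", getMaskedOp(101, 2, 1)); // 2-channel variant -> 100
}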
+
// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
enum SIEncodingFamily {
SI = 0,
/// not exist. If Opcode is not a pseudo instruction, this is identity.
int pseudoToMCOpcode(int Opcode) const;
- /// \brief Given a MIMG \p Opcode that writes all 4 channels, return the
- /// equivalent opcode that writes \p Channels Channels.
- int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const;
+  /// \brief Given a MIMG \p Opc that writes any number of channels, return the
+  /// equivalent opcode that writes \p NewChannels channels.
+ int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) const;
};
} // End llvm namespace
}
/// \brief Adjust the writemask of MIMG instructions
-void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
- SelectionDAG &DAG) const {
- SDNode *Users[4] = { };
+SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
+ SelectionDAG &DAG) const {
+ SDNode *Users[4] = { nullptr };
unsigned Lane = 0;
unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
// Abort if we can't understand the usage
if (!I->isMachineOpcode() ||
I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
- return;
+ return Node;
// Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
// Note that subregs are packed, i.e. Lane==0 is the first bit set
// Abort if we have more than one user per component
if (Users[Lane])
- return;
+ return Node;
Users[Lane] = *I;
NewDmask |= 1 << Comp;
// Abort if there's no change
if (NewDmask == OldDmask)
- return;
+ return Node;
+
+ unsigned BitsSet = countPopulation(NewDmask);
+
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ int NewOpcode = TII->getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet);
+ assert(NewOpcode != -1 &&
+ NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
+ "failed to find equivalent MIMG op");
// Adjust the writemask in the node
- std::vector<SDValue> Ops;
+ SmallVector<SDValue, 12> Ops;
Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
- Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
-
- // If we only got one lane, replace it with a copy
- // (if NewDmask has only one bit set...)
- if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
- SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(),
- MVT::i32);
- SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
- SDLoc(), Users[Lane]->getValueType(0),
- SDValue(Node, 0), RC);
+
+ MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
+
+ auto NewVTList =
+ DAG.getVTList(BitsSet == 1 ?
+ SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet),
+ MVT::Other);
+
+ MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
+ NewVTList, Ops);
+ // Update chain.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
+
+ if (BitsSet == 1) {
+ assert(Node->hasNUsesOfValue(1, 0));
+ SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
+ SDLoc(Node), Users[Lane]->getValueType(0),
+ SDValue(NewNode, 0));
DAG.ReplaceAllUsesWith(Users[Lane], Copy);
- return;
+ return nullptr;
}
// Update the users of the node with the new indices
continue;
SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
- DAG.UpdateNodeOperands(User, User->getOperand(0), Op);
+ DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
switch (Idx) {
default: break;
case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
}
}
+
+ DAG.RemoveDeadNode(Node);
+ return nullptr;
}
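adjustWritemask rebuilds the node around the components that are actually extracted: one dmask bit per used component, and since the surviving components are packed into a smaller register, each user's new sub-register index is its component's position among the set bits. A standalone sketch of that arithmetic (plain C++, no LLVM types):

#include <bitset>
#include <cstdio>

// Position of a used component after unused ones are dropped: the count of
// set dmask bits below it becomes its new (packed) sub-register index.
static unsigned packedIndex(unsigned NewDmask, unsigned Comp) {
  return std::bitset<32>(NewDmask & ((1u << Comp) - 1)).count();
}

int main() {
  bool CompUsed[4] = {false, true, false, true}; // users read components 1, 3
  unsigned NewDmask = 0;
  for (unsigned Comp = 0; Comp < 4; ++Comp)
    if (CompUsed[Comp])
      NewDmask |= 1u << Comp;

  printf("dmask: 0x%x\n", NewDmask);                     // dmask: 0xa
  printf("comp 1 -> sub%u\n", packedIndex(NewDmask, 1)); // comp 1 -> sub0
  printf("comp 3 -> sub%u\n", packedIndex(NewDmask, 3)); // comp 3 -> sub1
}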
static bool isFrameIndexOp(SDValue Op) {
}
/// \brief Fold the instructions after selecting them.
+/// Returns null if users were already updated.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
SelectionDAG &DAG) const {
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
unsigned Opcode = Node->getMachineOpcode();
if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
- !TII->isGather4(Opcode))
- adjustWritemask(Node, DAG);
+ !TII->isGather4(Opcode)) {
+ return adjustWritemask(Node, DAG);
+ }
if (Opcode == AMDGPU::INSERT_SUBREG ||
Opcode == AMDGPU::REG_SEQUENCE) {
    return Node;
}
- if (TII->isMIMG(MI)) {
- unsigned VReg = MI.getOperand(0).getReg();
- const TargetRegisterClass *RC = MRI.getRegClass(VReg);
- // TODO: Need mapping tables to handle other cases (register classes).
- if (RC != &AMDGPU::VReg_128RegClass)
- return;
-
- unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4;
- unsigned Writemask = MI.getOperand(DmaskIdx).getImm();
- unsigned BitsSet = 0;
- for (unsigned i = 0; i < 4; ++i)
- BitsSet += Writemask & (1 << i) ? 1 : 0;
- switch (BitsSet) {
- default: return;
- case 1: RC = &AMDGPU::VGPR_32RegClass; break;
- case 2: RC = &AMDGPU::VReg_64RegClass; break;
- case 3: RC = &AMDGPU::VReg_96RegClass; break;
- }
-
- unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet);
- MI.setDesc(TII->get(NewOpcode));
- MRI.setRegClass(VReg, RC);
- return;
- }
-
// Replace unused atomics with the no return version.
int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
if (NoRetAtomicOp != -1) {
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;
- void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
+ SDNode *adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
SDValue performUCharToFloatCombine(SDNode *N,
DAGCombinerInfo &DCI) const;
let ValueCols = [["Default"]];
}
-def getMaskedMIMGOp : InstrMapping {
+def getMaskedMIMGOp1 : InstrMapping {
+ let FilterClass = "MIMG_Mask";
+ let RowFields = ["Op"];
+ let ColFields = ["Channels"];
+ let KeyCol = ["1"];
+ let ValueCols = [["2"], ["3"], ["4"] ];
+}
+
+def getMaskedMIMGOp2 : InstrMapping {
+ let FilterClass = "MIMG_Mask";
+ let RowFields = ["Op"];
+ let ColFields = ["Channels"];
+ let KeyCol = ["2"];
+ let ValueCols = [["1"], ["3"], ["4"] ];
+}
+
+def getMaskedMIMGOp3 : InstrMapping {
+ let FilterClass = "MIMG_Mask";
+ let RowFields = ["Op"];
+ let ColFields = ["Channels"];
+ let KeyCol = ["3"];
+ let ValueCols = [["1"], ["2"], ["4"] ];
+}
+
+def getMaskedMIMGOp4 : InstrMapping {
let FilterClass = "MIMG_Mask";
let RowFields = ["Op"];
  let ColFields = ["Channels"];
  let KeyCol = ["4"];
  let ValueCols = [["1"], ["2"], ["3"] ];
}
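Splitting the single getMaskedMIMGOp mapping into four gives one generated lookup per original channel count: each InstrMapping is keyed on the opcodes in its KeyCol column of a MIMG_Mask row and returns the opcode from the requested Channels column. Roughly the shape of the generated lookups (rows and values below are placeholders, not real opcodes):

#include <cstdint>

// One row per logical MIMG instruction: its opcodes for 1..4 channels.
struct MaskRow { uint16_t Chan[4]; };

// Placeholder rows; TableGen fills the real tables from the MIMG_Mask defs.
static const MaskRow MIMGMaskTable[] = {
  {{100, 101, 102, 103}},
  {{200, 201, 202, 203}},
};

// Shape of a generated getMaskedMIMGOpN: look Opc up in the key column
// (KeyChannels), return the entry from the requested channel column, or -1.
static int getMaskedMIMGOpN(uint16_t Opc, unsigned KeyChannels,
                            unsigned WantChannels) {
  for (const MaskRow &Row : MIMGMaskTable)
    if (Row.Chan[KeyChannels - 1] == Opc)
      return Row.Chan[WantChannels - 1];
  return -1;
}

int main() {
  return getMaskedMIMGOpN(103, 4, 2) == 101 ? 0 : 1;
}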
--- /dev/null
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
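+; Partially used getlod results: adjustWritemask must handle MIMG variants
+; that do not already write all four channels (2-channel getlod here).
+; Reduced from cases that crashed during instruction selection.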
+; GCN-LABEL: {{^}}adjust_writemask_crash_0:
+; GCN: image_get_lod v0, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x2
+; GCN-NOT: v1
+; GCN-NOT: v0
+; GCN: buffer_store_dword v0
+define amdgpu_ps void @adjust_writemask_crash_0() #0 {
+main_body:
+ %tmp = call <2 x float> @llvm.amdgcn.image.getlod.v2f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %tmp1 = bitcast <2 x float> %tmp to <2 x i32>
+ %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %tmp3 = bitcast <4 x i32> %tmp2 to <4 x float>
+ %tmp4 = extractelement <4 x float> %tmp3, i32 0
+ store volatile float %tmp4, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_crash_1:
+; GCN: image_get_lod v0, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x1
+; GCN-NOT: v1
+; GCN-NOT: v0
+; GCN: buffer_store_dword v0
+define amdgpu_ps void @adjust_writemask_crash_1() #0 {
+main_body:
+ %tmp = call <2 x float> @llvm.amdgcn.image.getlod.v2f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %tmp1 = bitcast <2 x float> %tmp to <2 x i32>
+ %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 1, i32 0, i32 undef, i32 undef>
+ %tmp3 = bitcast <4 x i32> %tmp2 to <4 x float>
+ %tmp4 = extractelement <4 x float> %tmp3, i32 1
+ store volatile float %tmp4, float addrspace(1)* undef
+ ret void
+}
+
+define amdgpu_ps void @adjust_writemask_crash_0_v4() #0 {
+main_body:
+ %tmp = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 5, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %tmp1 = bitcast <4 x float> %tmp to <4 x i32>
+ %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %tmp3 = bitcast <4 x i32> %tmp2 to <4 x float>
+ %tmp4 = extractelement <4 x float> %tmp3, i32 0
+ store volatile float %tmp4, float addrspace(1)* undef
+ ret void
+}
+
+declare <2 x float> @llvm.amdgcn.image.getlod.v2f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.image.getlod.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }