From bff29f6333be55184e218a541dfee50c68115fec Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 19 Jun 2019 20:44:15 +0000 Subject: [PATCH] AMDGPU: Fix folding immediate into readfirstlane through reg_sequence The def instruction for the vreg may not match, because it may be folding through a reg_sequence. The assert was overly conservative and not necessary. It's not actually important if DefMI really defined the register, because the fold that will be done cares about the def of the value that will be folded. For some reason copies aren't making it through the reg_sequence, although they should. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363876 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/SIFoldOperands.cpp | 7 +- lib/Target/AMDGPU/SIInstrInfo.cpp | 1 - .../AMDGPU/constant-address-space-32bit.ll | 12 ++ test/CodeGen/AMDGPU/fold-readlane.mir | 123 ++++++++++++++++++ 4 files changed, 141 insertions(+), 2 deletions(-) diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index 4a1fc1332c3..3f566884f6b 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -436,9 +436,11 @@ void SIFoldOperands::foldOperand( unsigned RegSeqDstReg = UseMI->getOperand(0).getReg(); unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm(); + MachineRegisterInfo::use_iterator Next; for (MachineRegisterInfo::use_iterator RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end(); - RSUse != RSE; ++RSUse) { + RSUse != RSE; RSUse = Next) { + Next = std::next(RSUse); MachineInstr *RSUseMI = RSUse->getParent(); if (RSUse->getSubReg() != RegSeqDstSubReg) @@ -523,6 +525,9 @@ void SIFoldOperands::foldOperand( return; UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32)); + + // FIXME: ChangeToImmediate should clear subreg + UseMI->getOperand(1).setSubReg(0); UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm()); UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane) return; diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 56935b35734..5831abb8071 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6079,7 +6079,6 @@ bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, const MachineInstr &DefMI, const MachineInstr *UseMI) { assert(MRI.isSSA() && "Must be run on SSA"); - assert(DefMI.definesRegister(VReg) && "wrong def instruction"); auto *TRI = MRI.getTargetRegisterInfo(); auto *DefBB = DefMI.getParent(); diff --git a/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/test/CodeGen/AMDGPU/constant-address-space-32bit.ll index 040bcbc0182..e90c85545b0 100644 --- a/test/CodeGen/AMDGPU/constant-address-space-32bit.ll +++ b/test/CodeGen/AMDGPU/constant-address-space-32bit.ll @@ -279,12 +279,24 @@ define amdgpu_vs float @load_addr_no_fold(i32 addrspace(6)* inreg noalias %p0) # ret float %r2 } +; CHECK-LABEL: {{^}}vgpr_arg_src: +; CHECK: v_readfirstlane_b32 s[[READLANE:[0-9]+]], v0 +; CHECK: s_mov_b32 s[[ZERO:[0-9]+]] +; CHECK: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[READLANE]]:[[ZERO]]{{\]}} +define amdgpu_vs float @vgpr_arg_src(<4 x i32> addrspace(6)* %arg) { +main_body: + %tmp9 = load <4 x i32>, <4 x i32> addrspace(6)* %arg + %tmp10 = call nsz float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %tmp9, i32 undef, i32 0, i32 0, i32 0) #1 + ret float %tmp10 +} + ; Function Attrs: nounwind readnone speculatable declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #6 ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #7 +declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32) #7 !0 = !{} diff --git a/test/CodeGen/AMDGPU/fold-readlane.mir b/test/CodeGen/AMDGPU/fold-readlane.mir index 55b7a612d77..3c68686aa4a 100644 --- a/test/CodeGen/AMDGPU/fold-readlane.mir +++ b/test/CodeGen/AMDGPU/fold-readlane.mir @@ -248,3 +248,126 @@ body: | %1:sreg_32_xm0 = S_MOV_B32 12 %2:sreg_32_xm0 = V_READLANE_B32 %0, %1, implicit $exec ... + +# Constant for subreg0 +# GCN-LABEL: name: fold-imm-readfirstlane-regsequence0{{$}} + +# GCN: %0:vgpr_32 = COPY $vgpr0 +# GCN-NEXT: %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1 +# GCN-NEXT: %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0, implicit $exec +# GCN-NEXT: %4:sgpr_32 = S_MOV_B32 0 +--- +name: fold-imm-readfirstlane-regsequence0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %2:vreg_64 = REG_SEQUENCE %0:vgpr_32, %subreg.sub0, killed %1:vgpr_32, %subreg.sub1 + %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec + %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec +... + +# Constant for subreg1 +# GCN-LABEL: name: fold-imm-readfirstlane-regsequence1{{$}} +# GCN: %0:vgpr_32 = COPY $vgpr0 +# GCN-NEXT: %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, killed %0, %subreg.sub1 +# GCN-NEXT: %3:sgpr_32 = S_MOV_B32 0 +# GCN-NEXT: %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec + +--- +name: fold-imm-readfirstlane-regsequence1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %2:vreg_64 = REG_SEQUENCE %1:vgpr_32, %subreg.sub0, killed %0:vgpr_32, %subreg.sub1 + %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec + %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec +... + +# Different constant regs for each subreg +# GCN-LABEL: name: fold-imm-readfirstlane-regsequence2{{$}} +# GCN: %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# GCN-NEXT: %1:vgpr_32 = V_MOV_B32_e32 1, implicit $exec +# GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1 +# GCN-NEXT: %3:sgpr_32 = S_MOV_B32 0 +# GCN-NEXT: %4:sgpr_32 = S_MOV_B32 1 +--- +name: fold-imm-readfirstlane-regsequence2 +tracksRegLiveness: true +body: | + bb.0: + %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + %2:vreg_64 = REG_SEQUENCE %0:vgpr_32, %subreg.sub0, killed %1:vgpr_32, %subreg.sub1 + %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec + %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec +... + +# Same constant reg for each subreg, so there are multiple constant uses +# GCN-LABEL: name: fold-imm-readfirstlane-regsequence3{{$}} +# GCN: %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# GCN-NEXT: %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1 +# GCN-NEXT: %3:sgpr_32 = S_MOV_B32 0 +# GCN-NEXT: %4:sgpr_32 = S_MOV_B32 0 +--- +name: fold-imm-readfirstlane-regsequence3 +tracksRegLiveness: true +body: | + bb.0: + %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %2:vreg_64 = REG_SEQUENCE %0:vgpr_32, %subreg.sub0, killed %1:vgpr_32, %subreg.sub1 + %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec + %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec +... + +# FIXME: This should fold +# GCN-LABEL: name: fold-copy-readfirstlane-regsequence0{{$}} +# GCN: %0:vgpr_32 = COPY $sgpr10 +# GCN-NEXT: %1:vgpr_32 = COPY $sgpr11 +# GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1 +# GCN-NEXT: %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0, implicit $exec +# GCN-NEXT: %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec +--- +name: fold-copy-readfirstlane-regsequence0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr10, $sgpr11 + %0:vgpr_32 = COPY $sgpr10 + %1:vgpr_32 = COPY $sgpr11 + %2:vreg_64 = REG_SEQUENCE %0:vgpr_32, %subreg.sub0, killed %1:vgpr_32, %subreg.sub1 + %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec + %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec +... + +# GCN-LABEL: name: fold-copy-readfirstlane-regsequence1{{$}} +# GCN: %0:sreg_32_xm0 = COPY $sgpr10 +# GCN-NEXT: %1:sreg_32_xm0 = COPY $sgpr11 +# GCN-NEXT: %2:vgpr_32 = COPY %0 +# GCN-NEXT: %3:vgpr_32 = COPY %1 +# GCN-NEXT: %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, killed %3, %subreg.sub1 +# GCN-NEXT: %5:sgpr_32 = V_READFIRSTLANE_B32 %4.sub0, implicit $exec +# GCN-NEXT: %6:sgpr_32 = V_READFIRSTLANE_B32 %4.sub1, implicit $exec +--- +name: fold-copy-readfirstlane-regsequence1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr10, $sgpr11 + %0:sreg_32_xm0 = COPY $sgpr10 + %1:sreg_32_xm0 = COPY $sgpr11 + %2:vgpr_32 = COPY %0 + %3:vgpr_32 = COPY %1 + %4:vreg_64 = REG_SEQUENCE %2:vgpr_32, %subreg.sub0, killed %3:vgpr_32, %subreg.sub1 + %5:sgpr_32 = V_READFIRSTLANE_B32 %4.sub0:vreg_64, implicit $exec + %6:sgpr_32 = V_READFIRSTLANE_B32 %4.sub1:vreg_64, implicit $exec +... -- 2.40.0