From 4dc43963efbeaf320e45f65b0ed7cf10b03b4b11 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 25 Jan 2017 04:25:02 +0000 Subject: [PATCH] AMDGPU: Implement early ifcvt target hooks. Leave early ifcvt disabled for now since there are some shader-db regressions. This causes some immediate improvements, but could be better. The cost checking that the pass does is based on critical path length for out of order CPUs which we do not want so it skips out on many cases we want. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@293016 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/AMDGPUSubtarget.h | 5 + lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 14 + lib/Target/AMDGPU/SIInstrInfo.cpp | 141 +++++- lib/Target/AMDGPU/SIInstrInfo.h | 12 + lib/Target/AMDGPU/SISchedule.td | 5 + test/CodeGen/AMDGPU/early-if-convert-cost.ll | 110 +++++ test/CodeGen/AMDGPU/early-if-convert.ll | 454 ++++++++++++++++++ .../AMDGPU/uniform-branch-intrinsic-cond.ll | 1 + test/CodeGen/AMDGPU/uniform-cfg.ll | 4 +- 9 files changed, 741 insertions(+), 5 deletions(-) create mode 100644 test/CodeGen/AMDGPU/early-if-convert-cost.ll create mode 100644 test/CodeGen/AMDGPU/early-if-convert.ll diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index c055b8c5f79..37a21c25a50 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -526,6 +526,11 @@ public: this->GISel.reset(&GISel); } + // XXX - Why is this here if it isn't in the default pass set? 
+ bool enableEarlyIfConversion() const override { + return true; + } + void overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const override; diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 06adb372305..1b48d61ade3 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -58,6 +58,11 @@ static cl::opt EnableSROA( cl::ReallyHidden, cl::init(true)); +static cl::opt +EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden, + cl::desc("Run early if-conversion"), + cl::init(false)); + static cl::opt EnableR600IfConvert( "r600-if-convert", cl::desc("Use if conversion pass"), @@ -360,6 +365,7 @@ public: void addIRPasses() override; bool addPreISel() override; void addMachineSSAOptimization() override; + bool addILPOpts() override; bool addInstSelector() override; #ifdef LLVM_BUILD_GLOBAL_ISEL bool addIRTranslator() override; @@ -552,6 +558,14 @@ void GCNPassConfig::addMachineSSAOptimization() { addPass(&SILoadStoreOptimizerID); } +bool GCNPassConfig::addILPOpts() { + if (EnableEarlyIfConversion) + addPass(&EarlyIfConverterID); + + TargetPassConfig::addILPOpts(); + return false; +} + void GCNPassConfig::addIRPasses() { // TODO: May want to move later or split into an early and late one. addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine())); diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index a647e29c82c..ca1fa94d812 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1291,6 +1291,13 @@ unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB, return Count; } +// Copy the flags onto the implicit condition register operand. 
+static void preserveCondRegFlags(MachineOperand &CondReg, + const MachineOperand &OrigCond) { + CondReg.setIsUndef(OrigCond.isUndef()); + CondReg.setIsKill(OrigCond.isKill()); +} + unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, @@ -1318,9 +1325,7 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, .addMBB(TBB); // Copy the flags onto the implicit condition register operand. - MachineOperand &CondReg = CondBr->getOperand(1); - CondReg.setIsUndef(Cond[1].isUndef()); - CondReg.setIsKill(Cond[1].isKill()); + preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); if (BytesAdded) *BytesAdded = 4; @@ -1352,6 +1357,136 @@ bool SIInstrInfo::reverseBranchCondition( return false; } +bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, + ArrayRef Cond, + unsigned TrueReg, unsigned FalseReg, + int &CondCycles, + int &TrueCycles, int &FalseCycles) const { + switch (Cond[0].getImm()) { + case VCCNZ: + case VCCZ: { + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); + assert(MRI.getRegClass(FalseReg) == RC); + + int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; + CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? + + // Limit to equal cost for branch vs. N v_cndmask_b32s. + return !RI.isSGPRClass(RC) && NumInsts <= 6; + } + case SCC_TRUE: + case SCC_FALSE: { + // FIXME: We could insert for VGPRs if we could replace the original compare + // with a vector one. + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); + assert(MRI.getRegClass(FalseReg) == RC); + + int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; + + // Multiples of 8 can do s_cselect_b64 + if (NumInsts % 2 == 0) + NumInsts /= 2; + + CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 
+ return RI.isSGPRClass(RC); + } + default: + return false; + } +} + +void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, const DebugLoc &DL, + unsigned DstReg, ArrayRef Cond, + unsigned TrueReg, unsigned FalseReg) const { + BranchPredicate Pred = static_cast(Cond[0].getImm()); + if (Pred == VCCZ || Pred == SCC_FALSE) { + Pred = static_cast(-Pred); + std::swap(TrueReg, FalseReg); + } + + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); + unsigned DstSize = DstRC->getSize(); + + if (DstSize == 4) { + unsigned SelOp = Pred == SCC_TRUE ? + AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32; + + // Instruction's operands are backwards from what is expected. + MachineInstr *Select = + BuildMI(MBB, I, DL, get(SelOp), DstReg) + .addReg(FalseReg) + .addReg(TrueReg); + + preserveCondRegFlags(Select->getOperand(3), Cond[1]); + return; + } + + if (DstSize == 8 && Pred == SCC_TRUE) { + MachineInstr *Select = + BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg) + .addReg(FalseReg) + .addReg(TrueReg); + + preserveCondRegFlags(Select->getOperand(3), Cond[1]); + return; + } + + static const int16_t Sub0_15[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, + AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, + AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, + }; + + static const int16_t Sub0_15_64[] = { + AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, + AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, + AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, + AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, + }; + + unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32; + const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass; + const int16_t *SubIndices = Sub0_15; + int NElts = DstSize / 4; + + // 64-bit select is only available for SALU. 
+ if (Pred == SCC_TRUE) { + SelOp = AMDGPU::S_CSELECT_B64; + EltRC = &AMDGPU::SGPR_64RegClass; + SubIndices = Sub0_15_64; + + assert(NElts % 2 == 0); + NElts /= 2; + } + + MachineInstrBuilder MIB = BuildMI( + MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg); + + I = MIB->getIterator(); + + SmallVector Regs; + for (int Idx = 0; Idx != NElts; ++Idx) { + unsigned DstElt = MRI.createVirtualRegister(EltRC); + Regs.push_back(DstElt); + + unsigned SubIdx = SubIndices[Idx]; + + MachineInstr *Select = + BuildMI(MBB, I, DL, get(SelOp), DstElt) + .addReg(FalseReg, 0, SubIdx) + .addReg(TrueReg, 0, SubIdx); + preserveCondRegFlags(Select->getOperand(3), Cond[1]); + + MIB.addReg(DstElt) + .addImm(SubIdx); + } +} + static void removeModOperands(MachineInstr &MI) { unsigned Opc = MI.getOpcode(); int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index 7d31512ad52..5f53fd18917 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -203,6 +203,18 @@ public: bool reverseBranchCondition( SmallVectorImpl &Cond) const override; + + bool canInsertSelect(const MachineBasicBlock &MBB, + ArrayRef Cond, + unsigned TrueReg, unsigned FalseReg, + int &CondCycles, + int &TrueCycles, int &FalseCycles) const override; + + void insertSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, const DebugLoc &DL, + unsigned DstReg, ArrayRef Cond, + unsigned TrueReg, unsigned FalseReg) const override; + bool areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA = nullptr) const override; diff --git a/lib/Target/AMDGPU/SISchedule.td b/lib/Target/AMDGPU/SISchedule.td index be27966fd5f..0f02f5825cb 100644 --- a/lib/Target/AMDGPU/SISchedule.td +++ b/lib/Target/AMDGPU/SISchedule.td @@ -53,6 +53,11 @@ class SISchedMachineModel : SchedMachineModel { let MicroOpBufferSize = 1; let IssueWidth = 1; let PostRAScheduler = 1; + + // FIXME:Approximate 2 * branch cost. 
Try to hack around bad + // early-ifcvt heuristics. These need improvement to avoid the OOE + // heuristics. + int MispredictPenalty = 20; } def SIFullSpeedModel : SISchedMachineModel; diff --git a/test/CodeGen/AMDGPU/early-if-convert-cost.ll b/test/CodeGen/AMDGPU/early-if-convert-cost.ll new file mode 100644 index 00000000000..d1624f86765 --- /dev/null +++ b/test/CodeGen/AMDGPU/early-if-convert-cost.ll @@ -0,0 +1,110 @@ +; RUN: llc -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; FIXME: Most of these cases don't trigger because of broken cost +; heuristics. Should not need -stress-early-ifcvt + +; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle64: +; GCN: buffer_load_dwordx2 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; GCN: v_cmp_neq_f64_e32 vcc, 1.0, v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}} +; GCN: v_add_f64 v{{\[}}[[ADD_LO:[0-9]+]]:[[ADD_HI:[0-9]+]]{{\]}}, v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}, v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}} +; GCN-DAG: v_cndmask_b32_e32 v[[RESULT_LO:[0-9]+]], v[[ADD_LO]], v[[VAL_LO]], vcc +; GCN-DAG: v_cndmask_b32_e32 v[[RESULT_HI:[0-9]+]], v[[ADD_HI]], v[[VAL_HI]], vcc +; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} +define void @test_vccnz_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +entry: + %v = load double, double addrspace(1)* %in + %cc = fcmp oeq double %v, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + %u = fadd double %v, %v + br label %endif + +endif: + %r = phi double [ %v, %entry ], [ %u, %if ] + store double %r, double addrspace(1)* %out + ret void +} + +; vcc branch with SGPR inputs +; GCN-LABEL: {{^}}test_vccnz_sgpr_ifcvt_triangle64: +; GCN: v_cmp_neq_f64 +; GCN: v_add_f64 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +define void @test_vccnz_sgpr_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(2)* %in) #0 { +entry: + %v = load double, double addrspace(2)* %in + 
%cc = fcmp oeq double %v, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + %u = fadd double %v, %v + br label %endif + +endif: + %r = phi double [ %v, %entry ], [ %u, %if ] + store double %r, double addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle96: +; GCN: v_cmp_neq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0 + +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: s_mov_b64 vcc, [[CMP]] + +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc + +; GCN-DAG: buffer_store_dword v +; GCN-DAG: buffer_store_dwordx2 +define void @test_vccnz_ifcvt_triangle96(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in, float %cnd) #0 { +entry: + %v = load <3 x i32>, <3 x i32> addrspace(1)* %in + %cc = fcmp oeq float %cnd, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + %u = add <3 x i32> %v, %v + br label %endif + +endif: + %r = phi <3 x i32> [ %v, %entry ], [ %u, %if ] + store <3 x i32> %r, <3 x i32> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle128: +; GCN: v_cmp_neq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0 + +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: v_add_i32_e32 +; GCN: s_mov_b64 vcc, [[CMP]] + +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc + +; GCN: buffer_store_dwordx4 +define void @test_vccnz_ifcvt_triangle128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, float %cnd) #0 { +entry: + %v = load <4 x i32>, <4 x i32> addrspace(1)* %in + %cc = fcmp oeq float %cnd, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + %u = add 
<4 x i32> %v, %v + br label %endif + +endif: + %r = phi <4 x i32> [ %v, %entry ], [ %u, %if ] + store <4 x i32> %r, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/early-if-convert.ll b/test/CodeGen/AMDGPU/early-if-convert.ll new file mode 100644 index 00000000000..5ae1db8c686 --- /dev/null +++ b/test/CodeGen/AMDGPU/early-if-convert.ll @@ -0,0 +1,454 @@ +; RUN: llc -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; XUN: llc -march=amdgcn -mcpu=tonga -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; FIXME: This leaves behind a now unnecessary and with exec + +; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle: +; GCN: buffer_load_dword [[VAL:v[0-9]+]] +; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]] +; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]] +; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[VAL]], vcc +; GCN: buffer_store_dword [[RESULT]] +define void @test_vccnz_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +entry: + %v = load float, float addrspace(1)* %in + %cc = fcmp oeq float %v, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + %u = fadd float %v, %v + br label %endif + +endif: + %r = phi float [ %v, %entry ], [ %u, %if ] + store float %r, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_vccnz_ifcvt_diamond: +; GCN: buffer_load_dword [[VAL:v[0-9]+]] +; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]] +; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]] +; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VAL]], [[VAL]] +; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[MUL]], vcc +; GCN: buffer_store_dword [[RESULT]] +define void @test_vccnz_ifcvt_diamond(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +entry: + %v = load float, float addrspace(1)* %in + %cc = fcmp oeq float %v, 1.000000e+00 + br i1 %cc, label %if, label %else + +if: + %u0 = fadd float %v, %v + br label 
%endif + +else: + %u1 = fmul float %v, %v + br label %endif + +endif: + %r = phi float [ %u0, %if ], [ %u1, %else ] + store float %r, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_vcc_clobber: +; GCN: ; clobber vcc +; GCN: v_cmp_neq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0 +; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc +; GCN: s_mov_b64 vcc, [[CMP]] +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc +define void @test_vccnz_ifcvt_triangle_vcc_clobber(i32 addrspace(1)* %out, i32 addrspace(1)* %in, float %k) #0 { +entry: + %v = load i32, i32 addrspace(1)* %in + %cc = fcmp oeq float %k, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + call void asm "; clobber $0", "~{VCC}"() #0 + %u = add i32 %v, %v + br label %endif + +endif: + %r = phi i32 [ %v, %entry ], [ %u, %if ] + store i32 %r, i32 addrspace(1)* %out + ret void +} + +; Longest chain of cheap instructions to convert +; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_max_cheap: +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_cndmask_b32_e32 +define void @test_vccnz_ifcvt_triangle_max_cheap(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +entry: + %v = load float, float addrspace(1)* %in + %cc = fcmp oeq float %v, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + %u.0 = fmul float %v, %v + %u.1 = fmul float %v, %u.0 + %u.2 = fmul float %v, %u.1 + %u.3 = fmul float %v, %u.2 + %u.4 = fmul float %v, %u.3 + %u.5 = fmul float %v, %u.4 + %u.6 = fmul float %v, %u.5 + %u.7 = fmul float %v, %u.6 + %u.8 = fmul float %v, %u.7 + br label %endif + +endif: + %r = phi float [ %v, %entry ], [ %u.8, %if ] + store float %r, float addrspace(1)* %out + ret void +} + +; Short chain of cheap instructions to not convert +; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_min_expensive: +; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]] + +; 
GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 +; GCN: v_mul_f32 + +; GCN: [[ENDIF]]: +; GCN: buffer_store_dword +define void @test_vccnz_ifcvt_triangle_min_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +entry: + %v = load float, float addrspace(1)* %in + %cc = fcmp oeq float %v, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + %u.0 = fmul float %v, %v + %u.1 = fmul float %v, %u.0 + %u.2 = fmul float %v, %u.1 + %u.3 = fmul float %v, %u.2 + %u.4 = fmul float %v, %u.3 + %u.5 = fmul float %v, %u.4 + %u.6 = fmul float %v, %u.5 + %u.7 = fmul float %v, %u.6 + %u.8 = fmul float %v, %u.7 + %u.9 = fmul float %v, %u.8 + br label %endif + +endif: + %r = phi float [ %v, %entry ], [ %u.9, %if ] + store float %r, float addrspace(1)* %out + ret void +} + +; Should still branch over fdiv expansion +; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_expensive: +; GCN: v_cmp_neq_f32_e32 +; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]] + +; GCN: v_div_scale_f32 + +; GCN: [[ENDIF]]: +; GCN: buffer_store_dword +define void @test_vccnz_ifcvt_triangle_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +entry: + %v = load float, float addrspace(1)* %in + %cc = fcmp oeq float %v, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + %u = fdiv float %v, %v + br label %endif + +endif: + %r = phi float [ %v, %entry ], [ %u, %if ] + store float %r, float addrspace(1)* %out + ret void +} + +; vcc branch with SGPR inputs +; GCN-LABEL: {{^}}test_vccnz_sgpr_ifcvt_triangle: +; GCN: v_cmp_neq_f32_e64 +; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]] + +; GCN: s_add_i32 + +; GCN: [[ENDIF]]: +; GCN: buffer_store_dword +define void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(2)* %in, float %cnd) #0 { +entry: + %v = load i32, i32 addrspace(2)* %in + %cc = fcmp oeq float %cnd, 1.000000e+00 + br i1 %cc, label %if, label %endif 
+ +if: + %u = add i32 %v, %v + br label %endif + +endif: + %r = phi i32 [ %v, %entry ], [ %u, %if ] + store i32 %r, i32 addrspace(1)* %out + ret void + +} + +; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_constant_load: +; GCN: v_cndmask_b32 +define void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(2)* %in) #0 { +entry: + %v = load float, float addrspace(2)* %in + %cc = fcmp oeq float %v, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + %u = fadd float %v, %v + br label %endif + +endif: + %r = phi float [ %v, %entry ], [ %u, %if ] + store float %r, float addrspace(1)* %out + ret void +} + +; Due to broken cost heuristic, this is not if converted like +; test_vccnz_ifcvt_triangle_constant_load even though it should be. + +; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_argload: +; GCN: v_cndmask_b32 +define void @test_vccnz_ifcvt_triangle_argload(float addrspace(1)* %out, float %v) #0 { +entry: + %cc = fcmp oeq float %v, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + %u = fadd float %v, %v + br label %endif + +endif: + %r = phi float [ %v, %entry ], [ %u, %if ] + store float %r, float addrspace(1)* %out + ret void +} + +; Scalar branch and scalar inputs +; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle: +; GCN: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 +; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], [[VAL]] +; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1 +; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[ADD]], [[VAL]] +define void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(2)* %in, i32 %cond) #0 { +entry: + %v = load i32, i32 addrspace(2)* %in + %cc = icmp eq i32 %cond, 1 + br i1 %cc, label %if, label %endif + +if: + %u = add i32 %v, %v + br label %endif + +endif: + %r = phi i32 [ %v, %entry ], [ %u, %if ] + call void asm sideeffect "; reg use $0", "s"(i32 %r) #0 + ret void +} + +; FIXME: Should be able to use VALU compare and select +; Scalar branch but VGPR select operands +; GCN-LABEL: 
{{^}}test_scc1_vgpr_ifcvt_triangle: +; GCN: s_cmp_lg_u32 +; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]] + +; GCN: v_add_f32_e32 + +; GCN: [[ENDIF]]: +; GCN: buffer_store_dword +define void @test_scc1_vgpr_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in, i32 %cond) #0 { +entry: + %v = load float, float addrspace(1)* %in + %cc = icmp eq i32 %cond, 1 + br i1 %cc, label %if, label %endif + +if: + %u = fadd float %v, %v + br label %endif + +endif: + %r = phi float [ %v, %entry ], [ %u, %if ] + store float %r, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle64: +; GCN: s_add_u32 +; GCN: s_addc_u32 +; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1 +; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +define void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(2)* %in, i32 %cond) #0 { +entry: + %v = load i64, i64 addrspace(2)* %in + %cc = icmp eq i32 %cond, 1 + br i1 %cc, label %if, label %endif + +if: + %u = add i64 %v, %v + br label %endif + +endif: + %r = phi i64 [ %v, %entry ], [ %u, %if ] + call void asm sideeffect "; reg use $0", "s"(i64 %r) #0 + ret void +} + +; TODO: Can do s_cselect_b64; s_cselect_b32 +; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle96: +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1 +; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +define void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(2)* %in, i32 %cond) #0 { +entry: + %v = load <3 x i32>, <3 x i32> addrspace(2)* %in + %cc = icmp eq i32 %cond, 1 + br i1 %cc, label %if, label %endif + +if: + %u = add <3 x i32> %v, %v + br label %endif + +endif: + %r = phi <3 x i32> [ %v, %entry ], [ %u, %if ] + %r.ext = shufflevector <3 x i32> %r, <3 x i32> undef, <4 x i32> + call void asm sideeffect "; reg use 
$0", "s"(<4 x i32> %r.ext) #0 + ret void +} + +; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle128: +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_add_i32 +; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1 +; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +define void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(2)* %in, i32 %cond) #0 { +entry: + %v = load <4 x i32>, <4 x i32> addrspace(2)* %in + %cc = icmp eq i32 %cond, 1 + br i1 %cc, label %if, label %endif + +if: + %u = add <4 x i32> %v, %v + br label %endif + +endif: + %r = phi <4 x i32> [ %v, %entry ], [ %u, %if ] + call void asm sideeffect "; reg use $0", "s"(<4 x i32> %r) #0 + ret void +} + +; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc_constant_select: +; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0 +; GCN: s_cselect_b32 s{{[0-9]+}}, 1, 0{{$}} +define void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, i32 addrspace(1)* %out) { +entry: + %cmp0 = icmp eq i32 %cond, 0 + br i1 %cmp0, label %else, label %if + +if: + br label %done + +else: + br label %done + +done: + %value = phi i32 [0, %if], [1, %else] + store i32 %value, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}ifcvt_undef_scc: +; GCN: {{^}}; BB#0: +; GCN-NEXT: s_load_dwordx2 +; GCN-NEXT: s_cselect_b32 s{{[0-9]+}}, 1, 0 +define void @ifcvt_undef_scc(i32 %cond, i32 addrspace(1)* %out) { +entry: + br i1 undef, label %else, label %if + +if: + br label %done + +else: + br label %done + +done: + %value = phi i32 [0, %if], [1, %else] + store i32 %value, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle256: +; GCN: v_cmp_neq_f32 +; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]] + +; GCN: v_add_i32 +; GCN: v_add_i32 + +; GCN: [[ENDIF]]: +; GCN: buffer_store_dword +define void @test_vccnz_ifcvt_triangle256(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* 
%in, float %cnd) #0 { +entry: + %v = load <8 x i32>, <8 x i32> addrspace(1)* %in + %cc = fcmp oeq float %cnd, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + %u = add <8 x i32> %v, %v + br label %endif + +endif: + %r = phi <8 x i32> [ %v, %entry ], [ %u, %if ] + store <8 x i32> %r, <8 x i32> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle512: +; GCN: v_cmp_neq_f32 +; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]] + +; GCN: v_add_i32 +; GCN: v_add_i32 + +; GCN: [[ENDIF]]: +; GCN: buffer_store_dword +define void @test_vccnz_ifcvt_triangle512(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in, float %cnd) #0 { +entry: + %v = load <16 x i32>, <16 x i32> addrspace(1)* %in + %cc = fcmp oeq float %cnd, 1.000000e+00 + br i1 %cc, label %if, label %endif + +if: + %u = add <16 x i32> %v, %v + br label %endif + +endif: + %r = phi <16 x i32> [ %v, %entry ], [ %u, %if ] + store <16 x i32> %r, <16 x i32> addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll b/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll index 93a2c6998be..eb6007f21c1 100644 --- a/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll +++ b/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll @@ -14,6 +14,7 @@ main_body: if: %u = fadd float %v, %v + call void asm sideeffect "", ""() #0 ; Prevent ifconversion br label %else else: diff --git a/test/CodeGen/AMDGPU/uniform-cfg.ll b/test/CodeGen/AMDGPU/uniform-cfg.ll index 154ac361e79..d3e431d1e35 100644 --- a/test/CodeGen/AMDGPU/uniform-cfg.ll +++ b/test/CodeGen/AMDGPU/uniform-cfg.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=verde -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s 
+; RUN: llc -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=0 -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-early-ifcvt=0 -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}uniform_if_scc: ; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0 -- 2.40.0