From dbe625a3115729f5ca56d20c3a4447c0760ac9d8 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 11 Mar 2017 05:40:40 +0000 Subject: [PATCH] AMDGPU: Keep track of modifiers when converting v_mac to v_mad Since v_max_f32_e64/v_max_f16_e64 can be folded if the target instruction supports the clamp bit, we also need to maintain modifiers when converting v_mac to v_mad. This fixes a rendering issue with Dirt Rally because a v_mac instruction with the clamp bit set was converted to a v_mad but that bit was lost during the conversion. Fixes: e184e01dd79 ("AMDGPU: Fold FP clamp as modifier bit") Patch by Samuel Pitoiset git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@297556 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/SIInstrInfo.cpp | 14 ++++++++++---- test/CodeGen/AMDGPU/clamp-modifier.ll | 17 +++++++++++++++++ test/CodeGen/AMDGPU/omod.ll | 11 +++++++++++ 3 files changed, 38 insertions(+), 4 deletions(-) diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 1833b324915..63524d22a34 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1766,20 +1766,26 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); + const MachineOperand *Src0Mods = + getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); + const MachineOperand *Src1Mods = + getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); + const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); + const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); return BuildMI(*MBB, MI, MI.getDebugLoc(), get(IsF16 ? 
AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32)) .add(*Dst) - .addImm(0) // Src0 mods + .addImm(Src0Mods ? Src0Mods->getImm() : 0) .add(*Src0) - .addImm(0) // Src1 mods + .addImm(Src1Mods ? Src1Mods->getImm() : 0) .add(*Src1) .addImm(0) // Src mods .add(*Src2) - .addImm(0) // clamp - .addImm(0); // omod + .addImm(Clamp ? Clamp->getImm() : 0) + .addImm(Omod ? Omod->getImm() : 0); } // It's not generally safe to move VALU instructions across these since it will diff --git a/test/CodeGen/AMDGPU/clamp-modifier.ll b/test/CodeGen/AMDGPU/clamp-modifier.ll index 186bd349ecc..c3a7d5e14d8 100644 --- a/test/CodeGen/AMDGPU/clamp-modifier.ll +++ b/test/CodeGen/AMDGPU/clamp-modifier.ll @@ -168,6 +168,23 @@ define amdgpu_kernel void @v_clamp_add_src_f64(double addrspace(1)* %out, double ret void } +; GCN-LABEL: {{^}}v_clamp_mac_to_mad: +; GCN: v_mad_f32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} +define amdgpu_kernel void @v_clamp_mac_to_mad(float addrspace(1)* %out, float addrspace(1)* %aptr, float %a) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %b = load float, float addrspace(1)* %gep0 + + %mul = fmul float %a, %a + %add = fadd float %mul, %b + %max = call float @llvm.maxnum.f32(float %add, float 0.0) + %clamp = call float @llvm.minnum.f32(float %max, float 1.0) + %res = fadd float %clamp, %b + store float %res, float addrspace(1)* %out.gep + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #1 declare float @llvm.fabs.f32(float) #1 declare float @llvm.floor.f32(float) #1 diff --git a/test/CodeGen/AMDGPU/omod.ll b/test/CodeGen/AMDGPU/omod.ll index d48956f534e..3fd7b13fcc5 100644 --- a/test/CodeGen/AMDGPU/omod.ll +++ b/test/CodeGen/AMDGPU/omod.ll @@ -250,6 +250,17 @@ define amdgpu_ps void @v_omod_div2_f16_no_denormals(half %a) #3 { ret void } +; GCN-LABEL: {{^}}v_omod_mac_to_mad: +; GCN: v_mad_f32 v{{[0-9]+}}, 
v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} mul:2{{$}} +define amdgpu_ps void @v_omod_mac_to_mad(float %b, float %a) #0 { + %mul = fmul float %a, %a + %add = fadd float %mul, %b + %mad = fmul float %add, 2.0 + %res = fmul float %mad, %b + store float %res, float addrspace(1)* undef + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #1 declare float @llvm.fabs.f32(float) #1 declare float @llvm.floor.f32(float) #1 -- 2.40.0