From 17f22f98eb9820bf1765504ebbaf6c0f91b88c13 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 20 Jun 2016 18:33:56 +0000 Subject: [PATCH] AMDGPU: Fold more custom nodes to undef This will help sneak undefs past GVN into the DAG for some tests. Also add missing intrinsic for rsq_legacy, even though the node was already selected to the instruction. Also start passing the debug location to intrinsic errors. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@273181 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/IntrinsicsAMDGPU.td | 5 ++ lib/Target/AMDGPU/SIISelLowering.cpp | 51 +++++++++++++++---- test/CodeGen/AMDGPU/llvm.amdgcn.class.ll | 14 ++++- test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll | 10 ++++ test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll | 10 +++- test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll | 14 +++-- test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll | 8 +++ test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll | 39 ++++++++++++++ test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll | 10 +++- test/CodeGen/AMDGPU/vi-removed-intrinsics.ll | 24 +++++++++ 10 files changed, 168 insertions(+), 17 deletions(-) create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll create mode 100644 test/CodeGen/AMDGPU/vi-removed-intrinsics.ll diff --git a/include/llvm/IR/IntrinsicsAMDGPU.td b/include/llvm/IR/IntrinsicsAMDGPU.td index f6324297979..d36484735a5 100644 --- a/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/include/llvm/IR/IntrinsicsAMDGPU.td @@ -114,6 +114,11 @@ def int_amdgcn_rsq : Intrinsic< [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] >; +def int_amdgcn_rsq_legacy : GCCBuiltin<"__builtin_amdgcn_rsq_legacy">, + Intrinsic< + [llvm_float_ty], [llvm_float_ty], [IntrNoMem] +>; + def int_amdgcn_rsq_clamp : Intrinsic< [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index f9acf40afc2..2af8e5162d4 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1495,9 +1495,18 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, DAG.getValueType(VT)); } -static SDValue emitNonHSAIntrinsicError(SelectionDAG& DAG, EVT VT) { +static SDValue emitNonHSAIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) { DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(), - "non-hsa intrinsic with hsa target"); + "non-hsa intrinsic with hsa target", + DL.getDebugLoc()); + DAG.getContext()->diagnose(BadIntrin); + return DAG.getUNDEF(VT); +} + +static SDValue emitRemovedIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) { + DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(), + "intrinsic not supported on subtarget", + DL.getDebugLoc()); DAG.getContext()->diagnose(BadIntrin); return DAG.getUNDEF(VT); } @@ -1541,6 +1550,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_rsq: case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + case Intrinsic::amdgcn_rsq_legacy: { + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + return emitRemovedIntrinsicError(DAG, DL, VT); + + return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); + } case Intrinsic::amdgcn_rsq_clamp: case AMDGPUIntrinsic::AMDGPU_rsq_clamped: { // Legacy name if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) @@ -1558,55 +1573,55 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } case Intrinsic::r600_read_ngroups_x: if (Subtarget->isAmdHsaOS()) - return emitNonHSAIntrinsicError(DAG, VT); + return emitNonHSAIntrinsicError(DAG, DL, VT); return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::NGROUPS_X, false); case Intrinsic::r600_read_ngroups_y: if (Subtarget->isAmdHsaOS()) - return emitNonHSAIntrinsicError(DAG, VT); + return emitNonHSAIntrinsicError(DAG, DL, VT); return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::NGROUPS_Y, false); case Intrinsic::r600_read_ngroups_z: if (Subtarget->isAmdHsaOS()) - return emitNonHSAIntrinsicError(DAG, VT); + return emitNonHSAIntrinsicError(DAG, DL, VT); return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::NGROUPS_Z, false); case Intrinsic::r600_read_global_size_x: if (Subtarget->isAmdHsaOS()) - return emitNonHSAIntrinsicError(DAG, VT); + return emitNonHSAIntrinsicError(DAG, DL, VT); return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::GLOBAL_SIZE_X, false); case Intrinsic::r600_read_global_size_y: if (Subtarget->isAmdHsaOS()) - return emitNonHSAIntrinsicError(DAG, VT); + return emitNonHSAIntrinsicError(DAG, DL, VT); return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); case Intrinsic::r600_read_global_size_z: if (Subtarget->isAmdHsaOS()) - return emitNonHSAIntrinsicError(DAG, VT); + return emitNonHSAIntrinsicError(DAG, DL, VT); return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); case Intrinsic::r600_read_local_size_x: if (Subtarget->isAmdHsaOS()) - return emitNonHSAIntrinsicError(DAG, VT); + return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerImplicitZextParam(DAG, Op, MVT::i16, SI::KernelInputOffsets::LOCAL_SIZE_X); case Intrinsic::r600_read_local_size_y: if (Subtarget->isAmdHsaOS()) - return emitNonHSAIntrinsicError(DAG, VT); + return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerImplicitZextParam(DAG, Op, MVT::i16, SI::KernelInputOffsets::LOCAL_SIZE_Y); case Intrinsic::r600_read_local_size_z: if (Subtarget->isAmdHsaOS()) - return emitNonHSAIntrinsicError(DAG, VT); + return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerImplicitZextParam(DAG, Op, MVT::i16, SI::KernelInputOffsets::LOCAL_SIZE_Z); @@ -2564,6 +2579,9 @@ SDValue SITargetLowering::performClassCombine(SDNode *N, return DAG.getConstant(0, SDLoc(N), MVT::i1); } + if (N->getOperand(0).isUndef()) + return DAG.getUNDEF(MVT::i1); + return SDValue(); } @@ -2964,6 +2982,17 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performClassCombine(N, DCI); case ISD::FCANONICALIZE: return performFCanonicalizeCombine(N, DCI); + case AMDGPUISD::FRACT: + case AMDGPUISD::RCP: + case AMDGPUISD::RSQ: + case AMDGPUISD::RSQ_LEGACY: + case AMDGPUISD::RSQ_CLAMP: + case AMDGPUISD::LDEXP: { + SDValue Src = N->getOperand(0); + if (Src.isUndef()) + return Src; + break; + } } return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll index 3eb7f6b11ab..668c669e41e 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s declare i1 @llvm.amdgcn.class.f32(float, i32) #1 declare i1 @llvm.amdgcn.class.f64(double, i32) #1 @@ -495,5 +495,17 @@ define void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 { ret void } +; FIXME: Why is the extension still here? +; SI-LABEL: {{^}}test_class_undef_f32: +; SI-NOT: v_cmp_class +; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, +; SI: buffer_store_dword +define void @test_class_undef_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { + %result = call i1 @llvm.amdgcn.class.f32(float undef, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll index 00b5eb77b99..1cca9eb6a77 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll @@ -20,5 +20,15 @@ define void @v_fract_f64(double addrspace(1)* %out, double %src) #1 { ret void } +; GCN-LABEL: {{^}}v_fract_undef_f32: +; GCN-NOT: v_fract_f32 +; GCN-NOT: v0 +; GCN: buffer_store_dword v0 +define void @v_fract_undef_f32(float addrspace(1)* %out) #1 { + %fract = call float @llvm.amdgcn.fract.f32(float undef) + store float %fract, float addrspace(1)* %out + ret void +} + attributes #0 = { nounwind readnone } attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll index c48d52d150b..511418f8d1e 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s declare float @llvm.amdgcn.ldexp.f32(float, i32) nounwind readnone @@ -42,3 +42,11 @@ define void @test_legacy_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) store double %result, double addrspace(1)* %out, align 8 ret void } + +; SI-LABEL: {{^}}test_ldexp_undef_f32: +; SI-NOT: v_ldexp_f32 +define void @test_ldexp_undef_f32(float addrspace(1)* %out, i32 %b) nounwind { + %result = call float @llvm.amdgcn.ldexp.f32(float undef, i32 %b) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll index 0988e43299c..26b85f0f905 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s -; XUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s +; XUN: llc -march=amdgcn -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s ; XUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s @@ -69,5 +69,13 @@ define void @rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #1 { ret void } +; FUNC-LABEL: {{^}}rcp_undef_f32: +; SI-NOT: v_rcp_f32 +define void @rcp_undef_f32(float addrspace(1)* %out) #1 { + %rcp = call float @llvm.amdgcn.rcp.f32(float undef) #0 + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + attributes #0 = { nounwind readnone } attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll index 6d8513cbbde..dff2f599061 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll @@ -38,5 +38,13 @@ define void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 { ret void } +; FUNC-LABEL: {{^}}rsq_clamp_undef_f32: +; SI-NOT: v_rsq_clamp_f32 +define void @rsq_clamp_undef_f32(float addrspace(1)* %out) #0 { + %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float undef) + store float %rsq_clamp, float addrspace(1)* %out + ret void +} + attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll new file mode 100644 index 00000000000..47bd0d82b83 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare float @llvm.amdgcn.rsq.legacy(float) #0 + +; FUNC-LABEL: {{^}}rsq_legacy_f32: +; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} +define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 { + %rsq = call float @llvm.amdgcn.rsq.legacy(float %src) #0 + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} + +; TODO: Really these should be constant folded +; FUNC-LABEL: {{^}}rsq_legacy_f32_constant_4.0 +; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, 4.0 +define void @rsq_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 { + %rsq = call float @llvm.amdgcn.rsq.legacy(float 4.0) #0 + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rsq_legacy_f32_constant_100.0 +; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, 0x42c80000 +define void @rsq_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 { + %rsq = call float @llvm.amdgcn.rsq.legacy(float 100.0) #0 + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rsq_legacy_undef_f32: +; SI-NOT: v_rsq_legacy_f32 +define void @rsq_legacy_undef_f32(float addrspace(1)* %out) #1 { + %rsq = call float @llvm.amdgcn.rsq.legacy(float undef) + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll index 4c5489a4632..012f6cd8292 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s declare float @llvm.amdgcn.rsq.f32(float) #0 @@ -56,5 +56,13 @@ define void @rsq_f64_constant_100.0(double addrspace(1)* %out) #1 { ret void } +; FUNC-LABEL: {{^}}rsq_undef_f32: +; SI-NOT: v_rsq_f32 +define void @rsq_undef_f32(float addrspace(1)* %out) #1 { + %rsq = call float @llvm.amdgcn.rsq.f32(float undef) + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} + attributes #0 = { nounwind readnone } attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll b/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll new file mode 100644 index 00000000000..ad7521a3da9 --- /dev/null +++ b/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll @@ -0,0 +1,24 @@ +; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck -check-prefix=ERROR %s + +; ERROR: error: :1:42: in function rsq_legacy_f32 void (float addrspace(1)*, float): intrinsic not supported on subtarget + +declare float @llvm.amdgcn.rsq.legacy(float) #0 + +define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 { + %rsq = call float @llvm.amdgcn.rsq.legacy(float %src), !dbg !4 + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_OpenCL, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) +!1 = !DIFile(filename: "foo.cl", directory: "/dev/null") +!2 = !{i32 2, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !DILocation(line: 1, column: 42, scope: !5) +!5 = distinct !DISubprogram(name: "rsq_legacy_f32", scope: null, line: 1, isLocal: false, isDefinition: true, scopeLine: 2, isOptimized: false, unit: !0) -- 2.50.1