From ff4096b8f86cc6d75cc054e9c050024b00d36fb9 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 22 Dec 2016 03:55:35 +0000 Subject: [PATCH] AMDGPU: Form more FMAs if fusion is allowed Extend the existing fadd/fsub->fmad combines to produce FMA if allowed. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@290311 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/SIISelLowering.cpp | 75 ++-- lib/Target/AMDGPU/SIISelLowering.h | 1 + test/CodeGen/AMDGPU/fmuladd.f16.ll | 467 +++++++++++++++++++++ test/CodeGen/AMDGPU/fmuladd.f32.ll | 583 +++++++++++++++++++++++++++ test/CodeGen/AMDGPU/fmuladd.f64.ll | 119 ++++++ test/CodeGen/AMDGPU/fmuladd.ll | 400 ------------------ test/CodeGen/AMDGPU/mad-sub.ll | 420 ------------------- 7 files changed, 1215 insertions(+), 850 deletions(-) create mode 100644 test/CodeGen/AMDGPU/fmuladd.f16.ll create mode 100644 test/CodeGen/AMDGPU/fmuladd.f32.ll create mode 100644 test/CodeGen/AMDGPU/fmuladd.f64.ll delete mode 100644 test/CodeGen/AMDGPU/fmuladd.ll delete mode 100644 test/CodeGen/AMDGPU/mad-sub.ll diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 1572897630e..52cc0428e28 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3871,24 +3871,31 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, return SDValue(); } +unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, EVT VT) const { + // Only do this if we are not trying to support denormals. v_mad_f32 does not + // support denormals ever. + if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) || + (VT == MVT::f16 && !Subtarget->hasFP16Denormals())) + return ISD::FMAD; + + const TargetOptions &Options = DAG.getTarget().Options; + if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) && + isFMAFasterThanFMulAndFAdd(VT)) { + return ISD::FMA; + } + + return 0; +} + SDValue SITargetLowering::performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) return SDValue(); + SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); - if (VT == MVT::f64) - return SDValue(); - - assert(!VT.isVector()); - - // Only do this if we are not trying to support denormals. v_mad_f32 does - // not support denormals ever. 
-  if ((VT == MVT::f32 && Subtarget->hasFP32Denormals()) ||
-      (VT == MVT::f16 && Subtarget->hasFP16Denormals()))
-    return SDValue();
+  assert(!VT.isVector());
 
-  SelectionDAG &DAG = DCI.DAG;
   SDLoc SL(N);
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
@@ -3900,8 +3907,11 @@ SDValue SITargetLowering::performFAddCombine(SDNode *N,
   if (LHS.getOpcode() == ISD::FADD) {
     SDValue A = LHS.getOperand(0);
     if (A == LHS.getOperand(1)) {
-      const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
-      return DAG.getNode(ISD::FMAD, SL, VT, Two, A, RHS);
+      unsigned FusedOp = getFusedOpcode(DAG, VT);
+      if (FusedOp != 0) {
+        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
+        return DAG.getNode(FusedOp, SL, VT, Two, A, RHS);
+      }
     }
   }
 
@@ -3909,8 +3919,11 @@ SDValue SITargetLowering::performFAddCombine(SDNode *N,
   if (RHS.getOpcode() == ISD::FADD) {
     SDValue A = RHS.getOperand(0);
     if (A == RHS.getOperand(1)) {
-      const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
-      return DAG.getNode(ISD::FMAD, SL, VT, Two, A, LHS);
+      unsigned FusedOp = getFusedOpcode(DAG, VT);
+      if (FusedOp != 0) {
+        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
+        return DAG.getNode(FusedOp, SL, VT, Two, A, LHS);
+      }
     }
   }
@@ -3932,29 +3945,31 @@ SDValue SITargetLowering::performFSubCombine(SDNode *N,
   //
   // Only do this if we are not trying to support denormals. v_mad_f32 does
   // not support denormals ever.
-  if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
-      (VT == MVT::f16 && !Subtarget->hasFP16Denormals())) {
-    SDValue LHS = N->getOperand(0);
-    SDValue RHS = N->getOperand(1);
-    if (LHS.getOpcode() == ISD::FADD) {
-      // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
-
-      SDValue A = LHS.getOperand(0);
-      if (A == LHS.getOperand(1)) {
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  if (LHS.getOpcode() == ISD::FADD) {
+    // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
+    SDValue A = LHS.getOperand(0);
+    if (A == LHS.getOperand(1)) {
+      unsigned FusedOp = getFusedOpcode(DAG, VT);
+      if (FusedOp != 0) {
         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
         SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
-        return DAG.getNode(ISD::FMAD, SL, VT, Two, A, NegRHS);
+        return DAG.getNode(FusedOp, SL, VT, Two, A, NegRHS);
       }
     }
+  }
 
-    if (RHS.getOpcode() == ISD::FADD) {
-      // (fsub c, (fadd a, a)) -> mad -2.0, a, c
+  if (RHS.getOpcode() == ISD::FADD) {
+    // (fsub c, (fadd a, a)) -> mad -2.0, a, c
 
-      SDValue A = RHS.getOperand(0);
-      if (A == RHS.getOperand(1)) {
+    SDValue A = RHS.getOperand(0);
+    if (A == RHS.getOperand(1)) {
+      unsigned FusedOp = getFusedOpcode(DAG, VT);
+      if (FusedOp != 0) {
        const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
-        return DAG.getNode(ISD::FMAD, SL, VT, NegTwo, A, LHS);
+        return DAG.getNode(FusedOp, SL, VT, NegTwo, A, LHS);
      }
    }
  }
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 5f3c4ce51bd..d7127193f05 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -83,6 +83,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  unsigned getFusedOpcode(const SelectionDAG &DAG, EVT VT) const;
   SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/test/CodeGen/AMDGPU/fmuladd.f16.ll b/test/CodeGen/AMDGPU/fmuladd.f16.ll
new file mode 100644
index 00000000000..7a245a950b6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fmuladd.f16.ll
@@ -0,0 +1,467 @@
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI-FLUSH,VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI-FLUSH,VI %s
+
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare half @llvm.fmuladd.f16(half, half, half) #1
+declare half @llvm.fabs.f16(half) #1
+
+; GCN-LABEL: {{^}}fmuladd_f16:
+; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+
+; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+define void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
+                         half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
+  %r0 = load half, half addrspace(1)* %in1
+  %r1 = load half, half addrspace(1)* %in2
+  %r2 = load half, half addrspace(1)* %in3
+  %r3 = tail call half @llvm.fmuladd.f16(half %r0, half %r1, half %r2)
+  store half %r3, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f16
+; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
+; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
+; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
+
+; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
+; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid
+
+  %r1 = load volatile half, half addrspace(1)* %gep.0
+  %r2 = load volatile half, half addrspace(1)* %gep.1
+
+  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2)
+  store half %r3, half addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f16
+; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
+; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
+; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
+
+; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
+; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+  %tid =
call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid + + %r1 = load volatile half, half addrspace(1)* %gep.0 + %r2 = load volatile half, half addrspace(1)* %gep.1 + + %r3 = tail call half @llvm.fmuladd.f16(half %r1, half 2.0, half %r2) + store half %r3, half addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fadd_a_a_b_f16: +; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], +; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]] +; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] + +; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] + +; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] + +; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fadd_a_a_b_f16(half addrspace(1)* %out, + half addrspace(1)* %in1, + half addrspace(1)* %in2) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid + + %r0 = load volatile half, half addrspace(1)* %gep.0 + %r1 = load volatile half, half addrspace(1)* %gep.1 + + %add.0 = fadd half %r0, %r0 + %add.1 = fadd half %add.0, %r1 + store half %add.1, half addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fadd_b_a_a_f16: +; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], +; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]] +; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] + +; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] + +; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] + +; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fadd_b_a_a_f16(half addrspace(1)* %out, + half addrspace(1)* %in1, + half addrspace(1)* %in2) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid + + %r0 = load volatile half, half addrspace(1)* %gep.0 + %r1 = load volatile half, half addrspace(1)* %gep.1 + + %add.0 = fadd half %r0, %r0 + %add.1 = fadd half %r1, %add.0 + store half %add.1, half addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f16 +; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], +; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]] +; VI-DENORM: v_fma_f16 [[R2:v[0-9]+]], [[R1]], -2.0, [[R2]] +; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] +define void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid + + %r1 = load volatile half, half addrspace(1)* %gep.0 + %r2 = load volatile half, half addrspace(1)* %gep.1 + + %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1, 
half %r2) + store half %r3, half addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f16 +; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], +; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]] +; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] + +; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], -[[R1]], -2.0, [[R2]] +; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid + + %r1 = load volatile half, half addrspace(1)* %gep.0 + %r2 = load volatile half, half addrspace(1)* %gep.1 + + %r1.fneg = fsub half -0.000000e+00, %r1 + + %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1.fneg, half %r2) + store half %r3, half addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f16 +; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], +; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]] +; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] + +; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], -[[R1]], 2.0, [[R2]] +; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid + + %r1 = load volatile half, half addrspace(1)* %gep.0 + %r2 = load volatile half, half addrspace(1)* %gep.1 + + %r1.fneg = fsub half -0.000000e+00, %r1 + + %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1.fneg, half %r2) + store half %r3, half addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f16 +; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], +; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]] +; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] +; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid + + %r1 = load volatile half, half addrspace(1)* %gep.0 + %r2 = load volatile half, half addrspace(1)* %gep.1 + + %r2.fneg = fsub half -0.000000e+00, %r2 + + %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2.fneg) + store half %r3, half addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}mad_sub_f16: +; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]] + +; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] + +; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] + +; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; VI-DENORM-STRICT: v_subrev_f16_e32 
[[RESULT:v[0-9]+]], [[REGC]], [[TMP]] + +; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext + %a = load volatile half, half addrspace(1)* %gep0, align 2 + %b = load volatile half, half addrspace(1)* %gep1, align 2 + %c = load volatile half, half addrspace(1)* %gep2, align 2 + %mul = fmul half %a, %b + %sub = fsub half %mul, %c + store half %sub, half addrspace(1)* %outgep, align 2 + ret void +} + +; GCN-LABEL: {{^}}mad_sub_inv_f16: +; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]] +; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] + +; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] + +; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] + +; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext + %a = load volatile half, half addrspace(1)* %gep0, align 2 + %b = load volatile half, half addrspace(1)* %gep1, align 2 + %c = load volatile half, half addrspace(1)* %gep2, align 2 + %mul = fmul half %a, %b + %sub = fsub half %c, %mul + store half %sub, half addrspace(1)* %outgep, align 2 + ret void +} + +; GCN-LABEL: {{^}}mad_sub_fabs_f16: +; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]] +; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| + +; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| + +; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]| + +; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext + %a = load volatile half, half addrspace(1)* %gep0, align 2 + %b = 
load volatile half, half addrspace(1)* %gep1, align 2 + %c = load volatile half, half addrspace(1)* %gep2, align 2 + %c.abs = call half @llvm.fabs.f16(half %c) #0 + %mul = fmul half %a, %b + %sub = fsub half %mul, %c.abs + store half %sub, half addrspace(1)* %outgep, align 2 + ret void +} + +; GCN-LABEL: {{^}}mad_sub_fabs_inv_f16: +; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]] + +; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| + +; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| + +; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]] + +; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext + %a = load volatile half, half addrspace(1)* %gep0, align 2 + %b = load volatile half, half addrspace(1)* %gep1, align 2 + %c = load volatile half, half addrspace(1)* %gep2, align 2 + %c.abs = call half @llvm.fabs.f16(half %c) #0 + %mul = fmul half %a, %b + %sub = fsub half %c.abs, %mul + store half %sub, half addrspace(1)* %outgep, align 2 + ret void +} + +; GCN-LABEL: {{^}}neg_neg_mad_f16: +; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]] + +; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGB]], [[REGA]] +; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]] + +; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]] + +; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] +; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext + %a = load volatile half, half addrspace(1)* %gep0, align 2 + %b = load volatile half, half addrspace(1)* %gep1, align 2 + %c = load volatile half, half addrspace(1)* %gep2, align 2 + %nega = fsub half -0.000000e+00, %a + %negb = fsub half -0.000000e+00, %b + %mul = fmul half %nega, %negb + %sub = fadd half %mul, %c + store half %sub, half addrspace(1)* %outgep, align 2 + ret void +} + +; GCN-LABEL: {{^}}mad_fabs_sub_f16: +; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]] + +; VI-FLUSH: v_mad_f16 
[[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
+
+; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
+
+; VI-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
+; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
+
+; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
+  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
+  %tid.ext = sext i32 %tid to i64
+  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
+  %add1 = add i64 %tid.ext, 1
+  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
+  %add2 = add i64 %tid.ext, 2
+  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
+  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile half, half addrspace(1)* %gep0, align 2
+  %b = load volatile half, half addrspace(1)* %gep1, align 2
+  %c = load volatile half, half addrspace(1)* %gep2, align 2
+  %b.abs = call half @llvm.fabs.f16(half %b) #0
+  %mul = fmul half %a, %b.abs
+  %sub = fsub half %mul, %c
+  store half %sub, half addrspace(1)* %outgep, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f16:
+; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
+; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
+; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
+
+; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
+
+; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
+; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+
+; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid
+
+  %r1 = load volatile half, half addrspace(1)* %gep.0
+  %r2 = load volatile half, half addrspace(1)* %gep.1
+
+  %add = fadd half %r1, %r1
+  %r3 = fsub half %r2, %add
+
+  store half %r3, half addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f16:
+; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
+
+; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]]
+
+; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
+
+; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
+; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+
+; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid
+
+  %r1 = load volatile half, half addrspace(1)* %gep.0
+  %r2 = load volatile half, half addrspace(1)* %gep.1
+
+  %add = fadd half %r1, %r1
+  %r3 = fsub half %add, %r2
+
+  store half %r3, half addrspace(1)* %gep.out
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fmuladd.f32.ll b/test/CodeGen/AMDGPU/fmuladd.f32.ll
new
file mode 100644 index 00000000000..ce800837ddc --- /dev/null +++ b/test/CodeGen/AMDGPU/fmuladd.f32.ll @@ -0,0 +1,583 @@ +; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s +; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s +; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s +; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s + +; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s +; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s +; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s +; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s + +; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow. 
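+;
+; Roughly, the fused opcode these permutations exercise is chosen by
+; getFusedOpcode (added to SIISelLowering.cpp above): FMAD when denormals
+; for the type are flushed (v_mad_f32 never supports denormals), FMA when
+; fusion is allowed (-fp-contract=fast or unsafe-fp-math) and
+; isFMAFasterThanFMulAndFAdd holds for the type, and no fused node
+; otherwise. For example, with fusion allowed on a denormal, fast-FMA
+; configuration:
+;   (fadd (fadd a, a), b) -> (fma 2.0, a, b)
+;   (fsub (fadd a, a), c) -> (fma 2.0, a, (fneg c))
+;   (fsub c, (fadd a, a)) -> (fma -2.0, a, c)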
+
+target triple = "amdgcn--"
+
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare float @llvm.fmuladd.f32(float, float, float) #1
+declare half @llvm.fmuladd.f16(half, half, half) #1
+declare float @llvm.fabs.f32(float) #1
+
+; GCN-LABEL: {{^}}fmuladd_f32:
+; GCN-FLUSH: v_mac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+
+; GCN-DENORM-FASTFMA: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+
+; GCN-DENORM-SLOWFMA: v_mul_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+; GCN-DENORM-SLOWFMA: v_add_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+                         float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
+  %r0 = load float, float addrspace(1)* %in1
+  %r1 = load float, float addrspace(1)* %in2
+  %r2 = load float, float addrspace(1)* %in3
+  %r3 = tail call float @llvm.fmuladd.f32(float %r0, float %r1, float %r2)
+  store float %r3, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fmul_fadd_f32:
+; GCN-FLUSH: v_mac_f32
+
+; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32
+
+; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32
+; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32
+
+; GCN-DENORM-STRICT: v_mul_f32_e32
+; GCN-DENORM-STRICT: v_add_f32_e32
+define void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+                           float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
+  %r0 = load volatile float, float addrspace(1)* %in1
+  %r1 = load volatile float, float addrspace(1)* %in2
+  %r2 = load volatile float, float addrspace(1)* %in3
+  %mul = fmul float %r0, %r1
+  %add = fadd float %mul, %r2
+  store float %add, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f32
+; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
+
+; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
+; SI-FLUSH: buffer_store_dword [[R2]]
+; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
+
+; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
+
+; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
+; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+
+; SI-DENORM: buffer_store_dword [[RESULT]]
+; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+  %r1 = load volatile float, float addrspace(1)* %gep.0
+  %r2 = load volatile float, float addrspace(1)* %gep.1
+
+  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2)
+  store float %r3, float addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f32
+; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
+
+; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
+; SI-FLUSH: buffer_store_dword [[R2]]
+; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
+
+; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
+
+; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
+; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+
+; SI-DENORM: buffer_store_dword [[RESULT]]
+; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid
= call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load volatile float, float addrspace(1)* %gep.0 + %r2 = load volatile float, float addrspace(1)* %gep.1 + + %r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2) + store float %r3, float addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fadd_a_a_b_f32: +; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], + +; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]] +; SI-FLUSH: buffer_store_dword [[R2]] +; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] + +; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] + +; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] + +; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] + +; SI-DENORM: buffer_store_dword [[RESULT]] +; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fadd_a_a_b_f32(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r0 = load volatile float, float addrspace(1)* %gep.0 + %r1 = load volatile float, float addrspace(1)* %gep.1 + + %add.0 = fadd float %r0, %r0 + %add.1 = fadd float %add.0, %r1 + store float %add.1, float addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fadd_b_a_a_f32: +; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], + +; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]] +; SI-FLUSH: buffer_store_dword [[R2]] +; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] + +; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] + +; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] + +; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] + +; SI-DENORM: buffer_store_dword [[RESULT]] +; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fadd_b_a_a_f32(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r0 = load volatile float, float addrspace(1)* %gep.0 + %r1 = load volatile float, float addrspace(1)* %gep.1 + + %add.0 = fadd float %r0, %r0 + %add.1 = fadd float %r1, %add.0 + store float %add.1, float addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32 +; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], +; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]] + +; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 
-2.0, [[R2]] + +; GCN-DENORM-SLOWFMA: v_mul_f32_e32 [[TMP:v[0-9]+]], -2.0, [[R1]] +; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] + +; SI-DENORM: buffer_store_dword [[RESULT]] +; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load volatile float, float addrspace(1)* %gep.0 + %r2 = load volatile float, float addrspace(1)* %gep.1 + + %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2) + store float %r3, float addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32 +; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], + +; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]] +; SI-FLUSH: buffer_store_dword [[R2]] +; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] + +; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], -2.0, [[R2]] + +; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] + +; SI-DENORM: buffer_store_dword [[RESULT]] +; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load volatile float, float addrspace(1)* %gep.0 + %r2 = load volatile float, float addrspace(1)* %gep.1 + + %r1.fneg = fsub float -0.000000e+00, %r1 + + %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1.fneg, float %r2) + store float %r3, float addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32: +; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], + +; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]] +; SI-FLUSH: buffer_store_dword [[R2]] +; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] + +; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], 2.0, [[R2]] + +; GCN-DENORM-SLOWFMA: v_mul_f32_e32 [[TMP:v[0-9]+]], -2.0, [[R1]] +; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] + +; SI-DENORM: buffer_store_dword [[RESULT]] +; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load volatile float, float addrspace(1)* %gep.0 + %r2 = load volatile float, float addrspace(1)* %gep.1 + + %r1.fneg = fsub float -0.000000e+00, %r1 + + %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1.fneg, float %r2) + store float %r3, float addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32: +; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], +; GCN: 
{{buffer|flat}}_load_dword [[R2:v[0-9]+]], +; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]] +; SI-FLUSH: buffer_store_dword [[RESULT]] +; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] + +; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] + +; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-SLOWFMA: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] + +; SI-DENORM: buffer_store_dword [[RESULT]] +; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load volatile float, float addrspace(1)* %gep.0 + %r2 = load volatile float, float addrspace(1)* %gep.1 + + %r2.fneg = fsub float -0.000000e+00, %r2 + + %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2.fneg) + store float %r3, float addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}mad_sub_f32: +; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]] +; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] + +; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] + +; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] + +; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] + +; SI: buffer_store_dword [[RESULT]] +; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %gep0, align 4 + %b = load volatile float, float addrspace(1)* %gep1, align 4 + %c = load volatile float, float addrspace(1)* %gep2, align 4 + %mul = fmul float %a, %b + %sub = fsub float %mul, %c + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; GCN-LABEL: {{^}}mad_sub_inv_f32: +; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]] + +; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] + +; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] + +; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] + +; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] + +; SI: 
buffer_store_dword [[RESULT]] +; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %gep0, align 4 + %b = load volatile float, float addrspace(1)* %gep1, align 4 + %c = load volatile float, float addrspace(1)* %gep2, align 4 + %mul = fmul float %a, %b + %sub = fsub float %c, %mul + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; GCN-LABEL: {{^}}mad_sub_fabs_f32: +; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]] +; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| + +; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| + +; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]| + +; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]| + +; SI: buffer_store_dword [[RESULT]] +; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %gep0, align 4 + %b = load volatile float, float addrspace(1)* %gep1, align 4 + %c = load volatile float, float addrspace(1)* %gep2, align 4 + %c.abs = call float @llvm.fabs.f32(float %c) #0 + %mul = fmul float %a, %b + %sub = fsub float %mul, %c.abs + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; GCN-LABEL: {{^}}mad_sub_fabs_inv_f32: +; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]] +; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| + +; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| + +; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]] + +; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]] + +; SI: buffer_store_dword [[RESULT]] +; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float 
addrspace(1)* noalias nocapture readonly %ptr) #0 { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %gep0, align 4 + %b = load volatile float, float addrspace(1)* %gep1, align 4 + %c = load volatile float, float addrspace(1)* %gep2, align 4 + %c.abs = call float @llvm.fabs.f32(float %c) #0 + %mul = fmul float %a, %b + %sub = fsub float %c.abs, %mul + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; GCN-LABEL: {{^}}neg_neg_mad_f32: +; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]] + +; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGB]], [[REGA]] +; SI-FLUSH: buffer_store_dword [[REGC]] +; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REGC]] + +; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]] + +; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] + +; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] + +; SI-DENORM: buffer_store_dword [[RESULT]] +; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %gep0, align 4 + %b = load volatile float, float addrspace(1)* %gep1, align 4 + %c = load volatile float, float addrspace(1)* %gep2, align 4 + %nega = fsub float -0.000000e+00, %a + %negb = fsub float -0.000000e+00, %b + %mul = fmul float %nega, %negb + %sub = fadd float %mul, %c + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; GCN-LABEL: {{^}}mad_fabs_sub_f32: +; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]] +; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] + +; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] + +; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]| +; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] + +; GCN-DENORM-STRICT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]| +; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] + +; SI: buffer_store_dword [[RESULT]] +; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float 
addrspace(1)* noalias nocapture readonly %ptr) #0 { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %gep0, align 4 + %b = load volatile float, float addrspace(1)* %gep1, align 4 + %c = load volatile float, float addrspace(1)* %gep2, align 4 + %b.abs = call float @llvm.fabs.f32(float %b) #0 + %mul = fmul float %a, %b.abs + %sub = fsub float %mul, %c + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f32: +; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], +; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]] +; SI-FLUSH: buffer_store_dword [[R2]] +; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] + +; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] + +; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] + +; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] + +; SI-DENORM: buffer_store_dword [[RESULT]] +; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load volatile float, float addrspace(1)* %gep.0 + %r2 = load volatile float, float addrspace(1)* %gep.1 + + %add = fadd float %r1, %r1 + %r3 = fsub float %r2, %add + + store float %r3, float addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f32: +; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], +; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]] + +; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] + +; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] + +; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] + +; SI: buffer_store_dword [[RESULT]] +; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load volatile float, float addrspace(1)* %gep.0 + %r2 = load volatile float, float addrspace(1)* %gep.1 + + %add = fadd float %r1, %r1 + %r3 = fsub float %add, %r2 + + store float %r3, float addrspace(1)* %gep.out + ret void +} + +attributes 
#0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fmuladd.f64.ll b/test/CodeGen/AMDGPU/fmuladd.f64.ll
new file mode 100644
index 00000000000..fe209e34dd1
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fmuladd.f64.ll
@@ -0,0 +1,119 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
+; RUN: llc -march=amdgcn -mcpu=verde -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
+; RUN: llc -march=amdgcn -mcpu=verde -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI %s
+
+; GCN-LABEL: {{^}}fmuladd_f64:
+; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+                         double addrspace(1)* %in2, double addrspace(1)* %in3) #0 {
+  %r0 = load double, double addrspace(1)* %in1
+  %r1 = load double, double addrspace(1)* %in2
+  %r2 = load double, double addrspace(1)* %in3
+  %r3 = tail call double @llvm.fmuladd.f64(double %r0, double %r1, double %r2)
+  store double %r3, double addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fmul_fadd_f64:
+; GCN-CONTRACT: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+
+; GCN-STRICT: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; GCN-STRICT: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+define void @fmul_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+                           double addrspace(1)* %in2, double addrspace(1)* %in3) #0 {
+  %r0 = load double, double addrspace(1)* %in1
+  %r1 = load double, double addrspace(1)* %in2
+  %r2 = load double, double addrspace(1)* %in3
+  %tmp = fmul double %r0, %r1
+  %r3 = fadd double %tmp, %r2
+  store double %r3, double addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fadd_a_a_b_f64:
+; GCN: {{buffer|flat}}_load_dwordx2 [[R1:v\[[0-9]+:[0-9]+\]]],
+; GCN: {{buffer|flat}}_load_dwordx2 [[R2:v\[[0-9]+:[0-9]+\]]],
+
+; GCN-STRICT: v_add_f64 [[TMP:v\[[0-9]+:[0-9]+\]]], [[R1]], [[R1]]
+; GCN-STRICT: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP]], [[R2]]
+
+; GCN-CONTRACT: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[R1]], 2.0, [[R2]]
+
+; SI: buffer_store_dwordx2 [[RESULT]]
+; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define void @fadd_a_a_b_f64(double addrspace(1)* %out,
+                            double addrspace(1)* %in1,
+                            double addrspace(1)* %in2) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+  %gep.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
+
+  %r0 = load volatile double, double addrspace(1)* %gep.0
+  %r1 = load volatile double, double addrspace(1)* %gep.1
+
+  %add.0 = fadd double %r0, %r0
+  %add.1 = fadd double %add.0, %r1
+  store double %add.1, double addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fadd_b_a_a_f64:
+; GCN: {{buffer|flat}}_load_dwordx2
[[R1:v\[[0-9]+:[0-9]+\]]], +; GCN: {{buffer|flat}}_load_dwordx2 [[R2:v\[[0-9]+:[0-9]+\]]], + +; GCN-STRICT: v_add_f64 [[TMP:v\[[0-9]+:[0-9]+\]]], [[R1]], [[R1]] +; GCN-STRICT: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[R2]], [[TMP]] + +; GCN-CONTRACT: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[R1]], 2.0, [[R2]] + +; SI: buffer_store_dwordx2 [[RESULT]] +; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fadd_b_a_a_f64(double addrspace(1)* %out, + double addrspace(1)* %in1, + double addrspace(1)* %in2) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + %gep.0 = getelementptr double, double addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid + + %r0 = load volatile double, double addrspace(1)* %gep.0 + %r1 = load volatile double, double addrspace(1)* %gep.1 + + %add.0 = fadd double %r0, %r0 + %add.1 = fadd double %r1, %add.0 + store double %add.1, double addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}mad_sub_f64: +; GCN-STRICT: v_mul_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +; GCN-STRICT: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}} + +; GCN-CONTRACT: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}} +define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr double, double addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr double, double addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr double, double addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr double, double addrspace(1)* %out, i64 %tid.ext + %a = load volatile double, double addrspace(1)* %gep0, align 8 + %b = load volatile double, double addrspace(1)* %gep1, align 8 + %c = load volatile double, double addrspace(1)* %gep2, align 8 + %mul = fmul double %a, %b + %sub = fsub double %mul, %c + store double %sub, double addrspace(1)* %outgep, align 8 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare double @llvm.fmuladd.f64(double, double, double) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fmuladd.ll b/test/CodeGen/AMDGPU/fmuladd.ll deleted file mode 100644 index 1913d563bc7..00000000000 --- a/test/CodeGen/AMDGPU/fmuladd.ll +++ /dev/null @@ -1,400 +0,0 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s - -declare double @llvm.fmuladd.f64(double, double, double) #1 -declare i32 @llvm.amdgcn.workitem.id.x() #1 -declare float @llvm.fabs.f32(float) #1 -declare float @llvm.fmuladd.f32(float, float, float) #1 -declare half @llvm.fabs.f16(half) #1 -declare half @llvm.fmuladd.f16(half, half, half) #1 - -; GCN-LABEL: {{^}}fmuladd_f64: -; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2, double addrspace(1)* %in3) #0 { - %r0 = load double, double addrspace(1)* %in1 - %r1 = 
load double, double addrspace(1)* %in2 - %r2 = load double, double addrspace(1)* %in3 - %r3 = tail call double @llvm.fmuladd.f64(double %r0, double %r1, double %r2) - store double %r3, double addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}fmuladd_f32: -; GCN: v_mac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} -define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1, - float addrspace(1)* %in2, float addrspace(1)* %in3) #0 { - %r0 = load float, float addrspace(1)* %in1 - %r1 = load float, float addrspace(1)* %in2 - %r2 = load float, float addrspace(1)* %in3 - %r3 = tail call float @llvm.fmuladd.f32(float %r0, float %r1, float %r2) - store float %r3, float addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f32 -; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], -; GCN: v_mac_f32_e32 [[R2]], 2.0, [[R1]] - -; SI: buffer_store_dword [[R2]] -; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load volatile float, float addrspace(1)* %gep.0 - %r2 = load volatile float, float addrspace(1)* %gep.1 - - %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2) - store float %r3, float addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f32 -; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], -; GCN: v_mac_f32_e32 [[R2]], 2.0, [[R1]] - -; SI: buffer_store_dword [[R2]] -; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load volatile float, float addrspace(1)* %gep.0 - %r2 = load volatile float, float addrspace(1)* %gep.1 - - %r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2) - store float %r3, float addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}fadd_a_a_b_f32: -; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], -; GCN: v_mac_f32_e32 [[R2]], 2.0, [[R1]] - -; SI: buffer_store_dword [[R2]] -; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -define void @fadd_a_a_b_f32(float addrspace(1)* %out, - float addrspace(1)* %in1, - float addrspace(1)* %in2) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r0 = load volatile float, float addrspace(1)* %gep.0 - %r1 = load volatile float, float addrspace(1)* %gep.1 - - %add.0 = fadd float %r0, %r0 - %add.1 = fadd float %add.0, %r1 - store float %add.1, float addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}fadd_b_a_a_f32: -; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], -; GCN: v_mac_f32_e32 [[R2]], 
2.0, [[R1]] - -; SI: buffer_store_dword [[R2]] -; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -define void @fadd_b_a_a_f32(float addrspace(1)* %out, - float addrspace(1)* %in1, - float addrspace(1)* %in2) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r0 = load volatile float, float addrspace(1)* %gep.0 - %r1 = load volatile float, float addrspace(1)* %gep.1 - - %add.0 = fadd float %r0, %r0 - %add.1 = fadd float %r1, %add.0 - store float %add.1, float addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32 -; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], -; GCN: v_mac_f32_e32 [[R2]], -2.0, [[R1]] - -; SI: buffer_store_dword [[R2]] -; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load volatile float, float addrspace(1)* %gep.0 - %r2 = load volatile float, float addrspace(1)* %gep.1 - - %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2) - store float %r3, float addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32 -; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], -; GCN: v_mac_f32_e32 [[R2]], 2.0, [[R1]] - -; SI: buffer_store_dword [[R2]] -; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load volatile float, float addrspace(1)* %gep.0 - %r2 = load volatile float, float addrspace(1)* %gep.1 - - %r1.fneg = fsub float -0.000000e+00, %r1 - - %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1.fneg, float %r2) - store float %r3, float addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32 -; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], -; GCN: v_mac_f32_e32 [[R2]], -2.0, [[R1]] - -; SI: buffer_store_dword [[R2]] -; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load volatile float, float addrspace(1)* %gep.0 - %r2 = load volatile float, float addrspace(1)* %gep.1 - - %r1.fneg = fsub float -0.000000e+00, %r1 - - %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1.fneg, float %r2) - store float %r3, float addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: 
{{^}}fmuladd_2.0_a_neg_b_f32 -; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], -; GCN: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]] - -; SI: buffer_store_dword [[RESULT]] -; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load volatile float, float addrspace(1)* %gep.0 - %r2 = load volatile float, float addrspace(1)* %gep.1 - - %r2.fneg = fsub float -0.000000e+00, %r2 - - %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2.fneg) - store float %r3, float addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}fmuladd_f16: -; VI: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} -define void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1, - half addrspace(1)* %in2, half addrspace(1)* %in3) #0 { - %r0 = load half, half addrspace(1)* %in1 - %r1 = load half, half addrspace(1)* %in2 - %r2 = load half, half addrspace(1)* %in3 - %r3 = tail call half @llvm.fmuladd.f16(half %r0, half %r1, half %r2) - store half %r3, half addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f16 -; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], -; VI: v_mac_f16_e32 [[R2]], 2.0, [[R1]] - -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -define void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid - - %r1 = load volatile half, half addrspace(1)* %gep.0 - %r2 = load volatile half, half addrspace(1)* %gep.1 - - %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2) - store half %r3, half addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f16 -; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], -; VI: v_mac_f16_e32 [[R2]], 2.0, [[R1]] - -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -define void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid - - %r1 = load volatile half, half addrspace(1)* %gep.0 - %r2 = load volatile half, half addrspace(1)* %gep.1 - - %r3 = tail call half @llvm.fmuladd.f16(half %r1, half 2.0, half %r2) - store half %r3, half addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}fadd_a_a_b_f16: -; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], -; VI: v_mac_f16_e32 [[R2]], 2.0, [[R1]] - -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -define void @fadd_a_a_b_f16(half addrspace(1)* %out, - half addrspace(1)* %in1, - half addrspace(1)* %in2) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - %gep.0 = getelementptr half, half 
addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid - - %r0 = load volatile half, half addrspace(1)* %gep.0 - %r1 = load volatile half, half addrspace(1)* %gep.1 - - %add.0 = fadd half %r0, %r0 - %add.1 = fadd half %add.0, %r1 - store half %add.1, half addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}fadd_b_a_a_f16: -; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], -; VI: v_mac_f16_e32 [[R2]], 2.0, [[R1]] - -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -define void @fadd_b_a_a_f16(half addrspace(1)* %out, - half addrspace(1)* %in1, - half addrspace(1)* %in2) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid - - %r0 = load volatile half, half addrspace(1)* %gep.0 - %r1 = load volatile half, half addrspace(1)* %gep.1 - - %add.0 = fadd half %r0, %r0 - %add.1 = fadd half %r1, %add.0 - store half %add.1, half addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f16 -; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], -; VI: v_mac_f16_e32 [[R2]], -2.0, [[R1]] - -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -define void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid - - %r1 = load volatile half, half addrspace(1)* %gep.0 - %r2 = load volatile half, half addrspace(1)* %gep.1 - - %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1, half %r2) - store half %r3, half addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f16 -; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], -; VI: v_mac_f16_e32 [[R2]], 2.0, [[R1]] - -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -define void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid - - %r1 = load volatile half, half addrspace(1)* %gep.0 - %r2 = load volatile half, half addrspace(1)* %gep.1 - - %r1.fneg = fsub half -0.000000e+00, %r1 - - %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1.fneg, half %r2) - store half %r3, half addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f16 -; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], -; VI: v_mac_f16_e32 [[R2]], -2.0, [[R1]] - -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -define void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 - %gep.out = 
getelementptr half, half addrspace(1)* %out, i32 %tid - - %r1 = load volatile half, half addrspace(1)* %gep.0 - %r2 = load volatile half, half addrspace(1)* %gep.1 - - %r1.fneg = fsub half -0.000000e+00, %r1 - - %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1.fneg, half %r2) - store half %r3, half addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f16 -; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], -; VI: v_mad_f16 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]] - -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid - - %r1 = load volatile half, half addrspace(1)* %gep.0 - %r2 = load volatile half, half addrspace(1)* %gep.1 - - %r2.fneg = fsub half -0.000000e+00, %r2 - - %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2.fneg) - store half %r3, half addrspace(1)* %gep.out - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/mad-sub.ll b/test/CodeGen/AMDGPU/mad-sub.ll deleted file mode 100644 index 40027bff10a..00000000000 --- a/test/CodeGen/AMDGPU/mad-sub.ll +++ /dev/null @@ -1,420 +0,0 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s - -declare i32 @llvm.amdgcn.workitem.id.x() #0 -declare float @llvm.fabs.f32(float) #0 -declare half @llvm.fabs.f16(half) #0 - -; GCN-LABEL: {{^}}mad_sub_f32: -; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]] -; GCN: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] - -; SI: buffer_store_dword [[RESULT]] -; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext - %a = load volatile float, float addrspace(1)* %gep0, align 4 - %b = load volatile float, float addrspace(1)* %gep1, align 4 - %c = load volatile float, float addrspace(1)* %gep2, align 4 - %mul = fmul float %a, %b - %sub = fsub float %mul, %c - store float %sub, float addrspace(1)* %outgep, align 4 - ret void -} - -; GCN-LABEL: {{^}}mad_sub_inv_f32: -; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]] -; GCN: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] - -; SI: buffer_store_dword [[RESULT]] -; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture 
%out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext - %a = load volatile float, float addrspace(1)* %gep0, align 4 - %b = load volatile float, float addrspace(1)* %gep1, align 4 - %c = load volatile float, float addrspace(1)* %gep2, align 4 - %mul = fmul float %a, %b - %sub = fsub float %c, %mul - store float %sub, float addrspace(1)* %outgep, align 4 - ret void -} - -; GCN-LABEL: {{^}}mad_sub_f64: -; GCN: v_mul_f64 -; GCN: v_add_f64 -define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr double, double addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr double, double addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr double, double addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr double, double addrspace(1)* %out, i64 %tid.ext - %a = load volatile double, double addrspace(1)* %gep0, align 8 - %b = load volatile double, double addrspace(1)* %gep1, align 8 - %c = load volatile double, double addrspace(1)* %gep2, align 8 - %mul = fmul double %a, %b - %sub = fsub double %mul, %c - store double %sub, double addrspace(1)* %outgep, align 8 - ret void -} - -; GCN-LABEL: {{^}}mad_sub_fabs_f32: -; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]] -; GCN: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| -; SI: buffer_store_dword [[RESULT]] -; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext - %a = load volatile float, float addrspace(1)* %gep0, align 4 - %b = load volatile float, float addrspace(1)* %gep1, align 4 - %c = load volatile float, float addrspace(1)* %gep2, align 4 - %c.abs = call float @llvm.fabs.f32(float %c) #0 - %mul = fmul float %a, %b - %sub = fsub float %mul, %c.abs - store float %sub, float addrspace(1)* %outgep, align 4 - ret void -} - -; GCN-LABEL: {{^}}mad_sub_fabs_inv_f32: -; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]] -; GCN: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| -; SI: buffer_store_dword [[RESULT]] -; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { 
- %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext - %a = load volatile float, float addrspace(1)* %gep0, align 4 - %b = load volatile float, float addrspace(1)* %gep1, align 4 - %c = load volatile float, float addrspace(1)* %gep2, align 4 - %c.abs = call float @llvm.fabs.f32(float %c) #0 - %mul = fmul float %a, %b - %sub = fsub float %c.abs, %mul - store float %sub, float addrspace(1)* %outgep, align 4 - ret void -} - -; GCN-LABEL: {{^}}neg_neg_mad_f32: -; GCN: v_mac_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext - %a = load volatile float, float addrspace(1)* %gep0, align 4 - %b = load volatile float, float addrspace(1)* %gep1, align 4 - %c = load volatile float, float addrspace(1)* %gep2, align 4 - %nega = fsub float -0.000000e+00, %a - %negb = fsub float -0.000000e+00, %b - %mul = fmul float %nega, %negb - %sub = fadd float %mul, %c - store float %sub, float addrspace(1)* %outgep, align 4 - ret void -} - -; GCN-LABEL: {{^}}mad_fabs_sub_f32: -; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]] -; GCN: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] -; SI: buffer_store_dword [[RESULT]] -; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext - %a = load volatile float, float addrspace(1)* %gep0, align 4 - %b = load volatile float, float addrspace(1)* %gep1, align 4 - %c = load volatile float, float addrspace(1)* %gep2, align 4 - %b.abs = call float @llvm.fabs.f32(float %b) #0 - %mul = fmul float %a, %b.abs - %sub = fsub float %mul, %c - store float %sub, float addrspace(1)* %outgep, align 4 - ret void -} - -; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f32: -; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], -; GCN: v_mac_f32_e32 [[R2]], -2.0, [[R1]] - -; SI: buffer_store_dword [[R2]] -; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -define void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.amdgcn.workitem.id.x() 
nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load volatile float, float addrspace(1)* %gep.0 - %r2 = load volatile float, float addrspace(1)* %gep.1 - - %add = fadd float %r1, %r1 - %r3 = fsub float %r2, %add - - store float %r3, float addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f32: -; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], -; GCN: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]] - -; SI: buffer_store_dword [[RESULT]] -; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load volatile float, float addrspace(1)* %gep.0 - %r2 = load volatile float, float addrspace(1)* %gep.1 - - %add = fadd float %r1, %r1 - %r3 = fsub float %add, %r2 - - store float %r3, float addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}mad_sub_f16: -; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]] - -; VI: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext - %a = load volatile half, half addrspace(1)* %gep0, align 2 - %b = load volatile half, half addrspace(1)* %gep1, align 2 - %c = load volatile half, half addrspace(1)* %gep2, align 2 - %mul = fmul half %a, %b - %sub = fsub half %mul, %c - store half %sub, half addrspace(1)* %outgep, align 2 - ret void -} - -; GCN-LABEL: {{^}}mad_sub_inv_f16: -; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]] -; VI: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext - %a = load volatile half, half addrspace(1)* %gep0, align 2 - %b = load volatile half, half addrspace(1)* %gep1, align 2 - %c = load volatile half, half 
addrspace(1)* %gep2, align 2 - %mul = fmul half %a, %b - %sub = fsub half %c, %mul - store half %sub, half addrspace(1)* %outgep, align 2 - ret void -} - -; GCN-LABEL: {{^}}mad_sub_fabs_f16: -; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]] -; VI: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext - %a = load volatile half, half addrspace(1)* %gep0, align 2 - %b = load volatile half, half addrspace(1)* %gep1, align 2 - %c = load volatile half, half addrspace(1)* %gep2, align 2 - %c.abs = call half @llvm.fabs.f16(half %c) #0 - %mul = fmul half %a, %b - %sub = fsub half %mul, %c.abs - store half %sub, half addrspace(1)* %outgep, align 2 - ret void -} - -; GCN-LABEL: {{^}}mad_sub_fabs_inv_f16: -; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]] - -; VI: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext - %a = load volatile half, half addrspace(1)* %gep0, align 2 - %b = load volatile half, half addrspace(1)* %gep1, align 2 - %c = load volatile half, half addrspace(1)* %gep2, align 2 - %c.abs = call half @llvm.fabs.f16(half %c) #0 - %mul = fmul half %a, %b - %sub = fsub half %c.abs, %mul - store half %sub, half addrspace(1)* %outgep, align 2 - ret void -} - -; GCN-LABEL: {{^}}neg_neg_mad_f16: -; VI: v_mac_f16_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext - %a = load volatile half, half addrspace(1)* %gep0, align 2 - %b = load volatile half, half addrspace(1)* %gep1, align 2 - %c = load volatile half, half addrspace(1)* %gep2, align 2 - %nega = fsub half -0.000000e+00, %a - %negb = fsub half -0.000000e+00, %b - %mul = fmul half %nega, 
%negb - %sub = fadd half %mul, %c - store half %sub, half addrspace(1)* %outgep, align 2 - ret void -} - -; GCN-LABEL: {{^}}mad_fabs_sub_f16: -; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]] - -; VI: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext - %a = load volatile half, half addrspace(1)* %gep0, align 2 - %b = load volatile half, half addrspace(1)* %gep1, align 2 - %c = load volatile half, half addrspace(1)* %gep2, align 2 - %b.abs = call half @llvm.fabs.f16(half %b) #0 - %mul = fmul half %a, %b.abs - %sub = fsub half %mul, %c - store half %sub, half addrspace(1)* %outgep, align 2 - ret void -} - -; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f16: -; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], -; VI: v_mac_f16_e32 [[R2]], -2.0, [[R1]] - -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -define void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) { - %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid - - %r1 = load volatile half, half addrspace(1)* %gep.0 - %r2 = load volatile half, half addrspace(1)* %gep.1 - - %add = fadd half %r1, %r1 - %r3 = fsub half %r2, %add - - store half %r3, half addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f16: -; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], - -; VI: v_mad_f16 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]] -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) { - %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid - - %r1 = load volatile half, half addrspace(1)* %gep.0 - %r2 = load volatile half, half addrspace(1)* %gep.1 - - %add = fadd half %r1, %r1 - %r3 = fsub half %add, %r2 - - store half %r3, half addrspace(1)* %gep.out - ret void -} - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } -- 2.50.0