From: Matt Arsenault
Date: Thu, 2 Feb 2017 02:27:04 +0000 (+0000)
Subject: AMDGPU: Use source modifiers with f16->f32 conversions
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9f7e91552b6bf09c699d0dfd7853fba4c7e3b45e;p=llvm

AMDGPU: Use source modifiers with f16->f32 conversions

The operand types were defined to fit the fp16_to_fp node, which has the half
as an integer type. v_cvt_f32_f16 does support source modifiers, so change
this to have an FP type and modifiers.

For targets without legal f16, this requires recognizing the bit operations
and trying to produce them.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@293857 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 70eb3e1a82c..3350654daad 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -479,6 +479,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::FADD);
   setTargetDAGCombine(ISD::FSUB);
   setTargetDAGCombine(ISD::FNEG);
+  setTargetDAGCombine(ISD::FABS);
 }
 
 //===----------------------------------------------------------------------===//
@@ -2968,6 +2969,45 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
     return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
   }
+  case ISD::FP16_TO_FP: {
+    // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
+    // f16, but legalization of f16 fneg ends up pulling it out of the source.
+    // Put the fneg back as a legal source operation that can be matched later.
+    SDLoc SL(N);
+
+    SDValue Src = N0.getOperand(0);
+    EVT SrcVT = Src.getValueType();
+
+    // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
+    SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
+                                  DAG.getConstant(0x8000, SL, SrcVT));
+    return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
+  }
+  default:
+    return SDValue();
+  }
+}
+
+SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
+                                                 DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue N0 = N->getOperand(0);
+
+  if (!N0.hasOneUse())
+    return SDValue();
+
+  switch (N0.getOpcode()) {
+  case ISD::FP16_TO_FP: {
+    assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
+    SDLoc SL(N);
+    SDValue Src = N0.getOperand(0);
+    EVT SrcVT = Src.getValueType();
+
+    // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
+    SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
+                                  DAG.getConstant(0x7fff, SL, SrcVT));
+    return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
+  }
   default:
     return SDValue();
   }
@@ -3080,6 +3120,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
     return performSelectCombine(N, DCI);
   case ISD::FNEG:
     return performFNegCombine(N, DCI);
+  case ISD::FABS:
+    return performFAbsCombine(N, DCI);
   case AMDGPUISD::BFE_I32:
   case AMDGPUISD::BFE_U32: {
     assert(!N->getValueType(0).isVector() &&
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 880d571078b..58ac09f2698 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -85,6 +85,7 @@ protected:
                              SDValue RHS, DAGCombinerInfo &DCI) const;
   SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI)
const; static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td index 004b10f5d20..7d2a52bccc8 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -637,6 +637,9 @@ def umax_oneuse : HasOneUseBinOp; def umin_oneuse : HasOneUseBinOp; def fminnum_oneuse : HasOneUseBinOp; def fmaxnum_oneuse : HasOneUseBinOp; +def and_oneuse : HasOneUseBinOp; +def or_oneuse : HasOneUseBinOp; +def xor_oneuse : HasOneUseBinOp; } // Properties = [SDNPCommutative, SDNPAssociative] def sub_oneuse : HasOneUseBinOp; diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index 0c5bb0648a1..bdaa5aef83d 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -226,6 +226,8 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, case 8: return getLit64Encoding(static_cast(Imm), STI); case 2: + // FIXME Is this correct? What do inline immediates do on SI for f16 src + // which does not have f16 support? return getLit16Encoding(static_cast(Imm), STI); default: llvm_unreachable("invalid operand size"); diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index e718b459ec3..51006589b5c 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1824,7 +1824,8 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(), ST.hasInv2PiInlineImm()); case 16: - return AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), + return ST.has16BitInsts() && + AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), ST.hasInv2PiInlineImm()); default: llvm_unreachable("invalid bitwidth"); @@ -1854,8 +1855,13 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, } case 16: { if (isInt<16>(Imm) || isUInt<16>(Imm)) { + // A few special case instructions have 16-bit operands on subtargets + // where 16-bit instructions are not legal. + // TODO: Do the 32-bit immediates work? 
We shouldn't really need to handle + // constants in these cases int16_t Trunc = static_cast(Imm); - return AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); + return ST.has16BitInsts() && + AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); } return false; diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index cf9b9c5a145..a691ef11a86 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -619,6 +619,8 @@ def SIOperand { def SRCMODS { int NONE = 0; int NEG = 1; + int ABS = 2; + int NEG_ABS = 3; } def DSTCLAMP { @@ -1132,6 +1134,8 @@ def VOP_F64_I32 : VOPProfile <[f64, i32, untyped, untyped]>; def VOP_I32_F32 : VOPProfile <[i32, f32, untyped, untyped]>; def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>; def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>; +def VOP_F16_F32 : VOPProfile <[f16, f32, untyped, untyped]>; +def VOP_F32_F16 : VOPProfile <[f32, f16, untyped, untyped]>; def VOP_F32_F32_F16 : VOPProfile <[f32, f32, f16, untyped]>; def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>; diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 5cdd139a83f..ed0609da9be 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -430,9 +430,26 @@ def : Pat < } // End Predicates = [UnsafeFPMath] + +// f16_to_fp patterns +def : Pat < + (f32 (f16_to_fp i32:$src0)), + (V_CVT_F32_F16_e64 SRCMODS.NONE, $src0, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +def : Pat < + (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))), + (V_CVT_F32_F16_e64 SRCMODS.ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +def : Pat < + (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))), + (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + def : Pat < - (f32 (fpextend f16:$src)), - (V_CVT_F32_F16_e32 $src) + (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))), + (V_CVT_F32_F16_e64 SRCMODS.NEG, $src0, DSTCLAMP.NONE, DSTOMOD.NONE) >; def : Pat < @@ -440,9 +457,10 @@ def : Pat < (V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src)) >; +// fp_to_fp16 patterns def : Pat < - (f16 (fpround f32:$src)), - (V_CVT_F16_F32_e32 $src) + (i32 (fp_to_f16 (f32 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)))), + (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0, $clamp, $omod) >; def : Pat < diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td index a15b9ceff2f..15abed4a76b 100644 --- a/lib/Target/AMDGPU/VOP1Instructions.td +++ b/lib/Target/AMDGPU/VOP1Instructions.td @@ -147,8 +147,8 @@ defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP_F32_I32, sint_to_fp>; defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP_F32_I32, uint_to_fp>; defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>; defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>; -defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_I32_F32, fp_to_f16>; -defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_I32, f16_to_fp>; +defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, fpround>; +defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>; defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>; defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>; defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP_F32_I32>; diff --git a/test/CodeGen/AMDGPU/fabs.f16.ll b/test/CodeGen/AMDGPU/fabs.f16.ll index c64aa6228c7..77c941356b9 
100644 --- a/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/test/CodeGen/AMDGPU/fabs.f16.ll @@ -19,8 +19,7 @@ define void @fabs_free_f16(half addrspace(1)* %out, i16 %in) { ; GCN-LABEL: {{^}}fabs_f16: ; CI: flat_load_ushort [[VAL:v[0-9]+]], -; CI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[VAL]] -; CI: v_cvt_f16_f32_e64 [[RESULT:v[0-9]+]], |[[CVT0]]| +; CI: v_and_b32_e32 [[CVT0:v[0-9]+]], 0x7fff, [[VAL]] ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define void @fabs_f16(half addrspace(1)* %out, half %in) { %fabs = call half @llvm.fabs.f16(half %in) @@ -30,10 +29,10 @@ define void @fabs_f16(half addrspace(1)* %out, half %in) { ; FIXME: Should be able to use single and ; GCN-LABEL: {{^}}fabs_v2f16: -; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, -; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}| -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}| + +; CI: s_movk_i32 [[MASK:s[0-9]+]], 0x7fff +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} ; VI: flat_load_ushort [[LO:v[0-9]+]] ; VI: flat_load_ushort [[HI:v[0-9]+]] @@ -51,10 +50,11 @@ define void @fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) { } ; GCN-LABEL: {{^}}fabs_v4f16: -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}| -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}| -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}| -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}| +; CI: s_movk_i32 [[MASK:s[0-9]+]], 0x7fff +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} ; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}} ; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} @@ -72,9 +72,10 @@ define void @fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { ; GCN-LABEL: {{^}}fabs_fold_f16: ; GCN: flat_load_ushort [[IN0:v[0-9]+]] ; GCN: flat_load_ushort [[IN1:v[0-9]+]] + ; CI-DAG: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[IN0]] -; CI-DAG: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], [[IN1]] -; CI: v_mul_f32_e64 [[RESULT:v[0-9]+]], |[[CVT1]]|, [[CVT0]] +; CI-DAG: v_cvt_f32_f16_e64 [[ABS_CVT1:v[0-9]+]], |[[IN1]]| +; CI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[CVT0]], [[ABS_CVT1]] ; CI: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]] ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVTRESULT]] diff --git a/test/CodeGen/AMDGPU/fcmp.f16.ll b/test/CodeGen/AMDGPU/fcmp.f16.ll index a62726f7f06..8a01ea2fe43 100644 --- a/test/CodeGen/AMDGPU/fcmp.f16.ll +++ b/test/CodeGen/AMDGPU/fcmp.f16.ll @@ -28,10 +28,10 @@ entry: ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_cvt_f32_f16_e64 v[[A_F32:[0-9]+]], |v[[A_F16]]| +; SI: v_cvt_f32_f16_e64 v[[B_F32:[0-9]+]], |v[[B_F16]]| -; SI: v_cmp_lt_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, |v[[A_F32]]|, |v[[B_F32]]| +; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]] ; VI: v_cmp_lt_f16_e64 s{{\[[0-9]+:[0-9]+\]}}, |v[[A_F16]]|, |v[[B_F16]]| ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] diff --git a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index d7d21311c1b..240fd071000 100644 --- a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -3,8 +3,8 @@ ; GCN-LABEL: {{^}}fneg_fabs_fadd_f16: ; CI: v_cvt_f32_f16_e32 -; CI: v_cvt_f32_f16_e32 -; CI: 
v_sub_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, |v{{[0-9]+}}| +; CI: v_cvt_f32_f16_e64 [[CVT_ABS_X:v[0-9]+]], |v{{[0-9]+}}| +; CI: v_subrev_f32_e32 v{{[0-9]+}}, [[CVT_ABS_X]], v{{[0-9]+}} ; VI-NOT: and ; VI: v_sub_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}| @@ -17,14 +17,15 @@ define void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x, half %y) { } ; GCN-LABEL: {{^}}fneg_fabs_fmul_f16: -; CI: v_cvt_f32_f16_e32 -; CI: v_cvt_f32_f16_e32 -; CI: v_mul_f32_e64 {{v[0-9]+}}, {{v[0-9]+}}, -|{{v[0-9]+}}| +; CI-DAG: v_cvt_f32_f16_e32 +; CI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG_ABS_X:v[0-9]+]], -|{{v[0-9]+}}| +; CI: v_mul_f32_e32 {{v[0-9]+}}, [[CVT_NEG_ABS_X]], {{v[0-9]+}} ; CI: v_cvt_f16_f32_e32 ; VI-NOT: and -; VI: v_mul_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, -|{{v[0-9]+}}| -; VI-NOT: and +; VI: v_mul_f16_e64 [[MUL:v[0-9]+]], {{v[0-9]+}}, -|{{v[0-9]+}}| +; VI-NOT: [[MUL]] +; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] define void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) { %fabs = call half @llvm.fabs.f16(half %x) %fsub = fsub half -0.000000e+00, %fabs @@ -49,10 +50,7 @@ define void @fneg_fabs_free_f16(half addrspace(1)* %out, i16 %in) { ; FIXME: Should use or ; GCN-LABEL: {{^}}fneg_fabs_f16: -; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}| - -; VI: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}} +; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}} define void @fneg_fabs_f16(half addrspace(1)* %out, half %in) { %fabs = call half @llvm.fabs.f16(half %in) %fsub = fsub half -0.000000e+00, %fabs @@ -61,10 +59,7 @@ define void @fneg_fabs_f16(half addrspace(1)* %out, half %in) { } ; GCN-LABEL: {{^}}v_fneg_fabs_f16: -; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}| - -; VI: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}} +; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}} define void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspace(1)* %in) { %val = load half, half addrspace(1)* %in, align 2 %fabs = call half @llvm.fabs.f16(half %val) @@ -75,13 +70,10 @@ define void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspace(1)* %in) { ; FIXME: single bit op ; GCN-LABEL: {{^}}fneg_fabs_v2f16: -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}| -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}| - -; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}} -; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], -; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], -; VI: flat_store_dword +; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}} +; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], +; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], +; GCN: store_dword define void @fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) { %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) %fsub = fsub <2 x half> , %fabs @@ -90,17 +82,12 @@ define void @fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) { } ; GCN-LABEL: {{^}}fneg_fabs_v4f16: -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}| -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}| -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}| -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}| - -; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}} -; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], -; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], -; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], -; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], -; VI: flat_store_dwordx2 +; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}} +; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], +; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], +; GCN: 
v_or_b32_e32 v{{[0-9]+}}, [[MASK]], +; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], +; GCN: store_dwordx2 define void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in) %fsub = fsub <4 x half> , %fabs diff --git a/test/CodeGen/AMDGPU/fneg.f16.ll b/test/CodeGen/AMDGPU/fneg.f16.ll index e3dfd9201a2..d545cc789d8 100644 --- a/test/CodeGen/AMDGPU/fneg.f16.ll +++ b/test/CodeGen/AMDGPU/fneg.f16.ll @@ -15,13 +15,9 @@ define void @s_fneg_f16(half addrspace(1)* %out, half %in) { ; FUNC-LABEL: {{^}}v_fneg_f16: ; GCN: flat_load_ushort [[VAL:v[0-9]+]], - -; CI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[VAL]] -; CI: v_cvt_f16_f32_e64 [[CVT1:v[0-9]+]], -[[CVT0]] -; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVT1]] - -; VI: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[VAL]] +; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[VAL]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]] +; SI: buffer_store_short [[XOR]] define void @v_fneg_f16(half addrspace(1)* %out, half addrspace(1)* %in) { %val = load half, half addrspace(1)* %in, align 2 %fneg = fsub half -0.000000e+00, %val @@ -45,8 +41,9 @@ define void @fneg_free_f16(half addrspace(1)* %out, i16 %in) { ; FUNC-LABEL: {{^}}v_fneg_fold_f16: ; GCN: flat_load_ushort [[NEG_VALUE:v[0-9]+]] -; CI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[CVT0]] -; CI: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[CVT0]], [[CVT0]] +; CI-DAG: v_cvt_f32_f16_e32 [[CVT_VAL:v[0-9]+]], [[NEG_VALUE]] +; CI-DAG: v_cvt_f32_f16_e64 [[NEG_CVT0:v[0-9]+]], -[[NEG_VALUE]] +; CI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_VAL]], [[NEG_CVT0]] ; CI: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], [[MUL]] ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVT1]] diff --git a/test/CodeGen/AMDGPU/fpext.f16.ll b/test/CodeGen/AMDGPU/fpext.f16.ll index c4f5d7cdfb5..433fdf1e075 100644 --- a/test/CodeGen/AMDGPU/fpext.f16.ll +++ b/test/CodeGen/AMDGPU/fpext.f16.ll @@ -68,3 +68,202 @@ entry: store <2 x double> %r.val, <2 x double> addrspace(1)* %r ret void } + +; GCN-LABEL: {{^}}s_fneg_fpext_f16_to_f32: +; GCN: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}} +define void @s_fneg_fpext_f16_to_f32(float addrspace(1)* %r, i32 %a) { +entry: + %a.trunc = trunc i32 %a to i16 + %a.val = bitcast i16 %a.trunc to half + %r.val = fpext half %a.val to float + store float %r.val, float addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}fneg_fpext_f16_to_f32: +; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] +; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, -[[A]] +define void @fneg_fpext_f16_to_f32( + float addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %a.neg = fsub half -0.0, %a.val + %r.val = fpext half %a.neg to float + store float %r.val, float addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}fabs_fpext_f16_to_f32: +; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] +; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, |[[A]]| +define void @fabs_fpext_f16_to_f32( + float addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %a.fabs = call half @llvm.fabs.f16(half %a.val) + %r.val = fpext half %a.fabs to float + store float %r.val, float addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}fneg_fabs_fpext_f16_to_f32: +; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] +; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|[[A]]| +define void @fneg_fabs_fpext_f16_to_f32( + float addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %a.fabs = call half @llvm.fabs.f16(half %a.val) + %a.fneg.fabs 
= fsub half -0.0, %a.fabs + %r.val = fpext half %a.fneg.fabs to float + store float %r.val, float addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}fneg_multi_use_fpext_f16_to_f32: +; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] +; GCN-DAG: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[A]] + +; FIXME: Using the source modifier here only wastes code size +; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]] +; VI-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]] + +; GCN: store_dword [[CVT]] +; GCN: store_short [[XOR]] +define void @fneg_multi_use_fpext_f16_to_f32( + float addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %a.neg = fsub half -0.0, %a.val + %r.val = fpext half %a.neg to float + store volatile float %r.val, float addrspace(1)* %r + store volatile half %a.neg, half addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}fneg_multi_foldable_use_fpext_f16_to_f32: +; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] +; GCN-DAG: v_cvt_f32_f16_e64 [[CVTA_NEG:v[0-9]+]], -[[A]] +; SI-DAG: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]] +; SI: v_mul_f32_e32 [[MUL_F32:v[0-9]+]], [[CVTA]], [[CVTA_NEG]] +; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]] + +; VI-DAG: v_cvt_f32_f16_e64 [[CVT_NEGA:v[0-9]+]], -[[A]] +; VI: v_mul_f16_e64 [[MUL:v[0-9]+]], -[[A]], [[A]] + +; GCN: buffer_store_dword [[CVTA_NEG]] +; GCN: buffer_store_short [[MUL]] +define void @fneg_multi_foldable_use_fpext_f16_to_f32( + float addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %a.neg = fsub half -0.0, %a.val + %r.val = fpext half %a.neg to float + %mul = fmul half %a.neg, %a.val + store volatile float %r.val, float addrspace(1)* %r + store volatile half %mul, half addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}fabs_multi_use_fpext_f16_to_f32: +; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] +; GCN-DAG: v_and_b32_e32 [[XOR:v[0-9]+]], 0x7fff, [[A]] + +; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]] +; VI-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], |[[A]]| + +; GCN: store_dword [[CVT]] +; GCN: store_short [[XOR]] +define void @fabs_multi_use_fpext_f16_to_f32( + float addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %a.fabs = call half @llvm.fabs.f16(half %a.val) + %r.val = fpext half %a.fabs to float + store volatile float %r.val, float addrspace(1)* %r + store volatile half %a.fabs, half addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}fabs_multi_foldable_use_fpext_f16_to_f32: +; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] +; SI: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]] +; SI: v_mul_f32_e64 [[MUL_F32:v[0-9]+]], |[[CVTA]]|, [[CVTA]] +; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]] +; SI: v_and_b32_e32 [[ABS_A:v[0-9]+]], 0x7fffffff, [[CVTA]] + +; VI-DAG: v_cvt_f32_f16_e64 [[ABS_A:v[0-9]+]], |[[A]]| +; VI: v_mul_f16_e64 [[MUL:v[0-9]+]], |[[A]]|, [[A]] + +; GCN: buffer_store_dword [[ABS_A]] +; GCN: buffer_store_short [[MUL]] +define void @fabs_multi_foldable_use_fpext_f16_to_f32( + float addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %a.fabs = call half @llvm.fabs.f16(half %a.val) + %r.val = fpext half %a.fabs to float + %mul = fmul half %a.fabs, %a.val + store volatile float %r.val, float addrspace(1)* %r + store volatile half %mul, half addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}fabs_fneg_multi_use_fpext_f16_to_f32: +; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] +; GCN-DAG: v_or_b32_e32 
[[OR:v[0-9]+]], 0x8000, [[A]] + +; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[OR]] +; VI-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[OR]]| + +; GCN: buffer_store_dword [[CVT]] +; GCN: buffer_store_short [[OR]] +define void @fabs_fneg_multi_use_fpext_f16_to_f32( + float addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %a.fabs = call half @llvm.fabs.f16(half %a.val) + %a.fneg.fabs = fsub half -0.0, %a.fabs + %r.val = fpext half %a.fneg.fabs to float + store volatile float %r.val, float addrspace(1)* %r + store volatile half %a.fneg.fabs, half addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}fabs_fneg_multi_foldable_use_fpext_f16_to_f32: +; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] +; SI: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]] +; SI: v_mul_f32_e64 [[MUL_F32:v[0-9]+]], -|[[CVTA]]|, [[CVTA]] +; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]] +; SI: v_or_b32_e32 [[FABS_FNEG:v[0-9]+]], 0x80000000, [[CVTA]] + +; VI-DAG: v_cvt_f32_f16_e64 [[FABS_FNEG:v[0-9]+]], -|[[A]]| +; VI-DAG: v_mul_f16_e64 [[MUL:v[0-9]+]], -|[[A]]|, [[A]] + +; GCN: buffer_store_dword [[FABS_FNEG]] +; GCN: buffer_store_short [[MUL]] +define void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( + float addrspace(1)* %r, + half addrspace(1)* %a) { +entry: + %a.val = load half, half addrspace(1)* %a + %a.fabs = call half @llvm.fabs.f16(half %a.val) + %a.fneg.fabs = fsub half -0.0, %a.fabs + %r.val = fpext half %a.fneg.fabs to float + %mul = fmul half %a.fneg.fabs, %a.val + store volatile float %r.val, float addrspace(1)* %r + store volatile half %mul, half addrspace(1)* undef + ret void +} + +declare half @llvm.fabs.f16(half) #1 + +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fptrunc.f16.ll b/test/CodeGen/AMDGPU/fptrunc.f16.ll index 284fc53c824..c9905d5f7ff 100644 --- a/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; GCN-LABEL: {{^}}fptrunc_f32_to_f16 +; GCN-LABEL: {{^}}fptrunc_f32_to_f16: ; GCN: buffer_load_dword v[[A_F32:[0-9]+]] ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_short v[[R_F16]] @@ -16,7 +16,7 @@ entry: ret void } -; GCN-LABEL: {{^}}fptrunc_f64_to_f16 +; GCN-LABEL: {{^}}fptrunc_f64_to_f16: ; GCN: buffer_load_dwordx2 v{{\[}}[[A_F64_0:[0-9]+]]:[[A_F64_1:[0-9]+]]{{\]}} ; GCN: v_cvt_f32_f64_e32 v[[A_F32:[0-9]+]], v{{\[}}[[A_F64_0]]:[[A_F64_1]]{{\]}} ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] @@ -32,7 +32,7 @@ entry: ret void } -; GCN-LABEL: {{^}}fptrunc_v2f32_to_v2f16 +; GCN-LABEL: {{^}}fptrunc_v2f32_to_v2f16: ; GCN: buffer_load_dwordx2 v{{\[}}[[A_F32_0:[0-9]+]]:[[A_F32_1:[0-9]+]]{{\]}} ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]] ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] @@ -51,7 +51,7 @@ entry: ret void } -; GCN-LABEL: {{^}}fptrunc_v2f64_to_v2f16 +; GCN-LABEL: {{^}}fptrunc_v2f64_to_v2f16: ; GCN: buffer_load_dwordx4 v{{\[}}[[A_F64_0:[0-9]+]]:[[A_F64_3:[0-9]+]]{{\]}} ; GCN: v_cvt_f32_f64_e32 v[[A_F32_0:[0-9]+]], v{{\[}}[[A_F64_0]]:{{[0-9]+}}{{\]}} ; GCN: v_cvt_f32_f64_e32 v[[A_F32_1:[0-9]+]], v{{\[}}{{[0-9]+}}:[[A_F64_3]]{{\]}} @@ -70,3 +70,56 @@ entry: store <2 x half> %r.val, <2 x half> addrspace(1)* %r ret void } + +; 
GCN-LABEL: {{^}}fneg_fptrunc_f32_to_f16: +; GCN: buffer_load_dword v[[A_F32:[0-9]+]] +; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], -v[[A_F32]] +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @fneg_fptrunc_f32_to_f16( + half addrspace(1)* %r, + float addrspace(1)* %a) { +entry: + %a.val = load float, float addrspace(1)* %a + %a.fneg = fsub float -0.0, %a.val + %r.val = fptrunc float %a.fneg to half + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}fabs_fptrunc_f32_to_f16: +; GCN: buffer_load_dword v[[A_F32:[0-9]+]] +; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], |v[[A_F32]]| +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @fabs_fptrunc_f32_to_f16( + half addrspace(1)* %r, + float addrspace(1)* %a) { +entry: + %a.val = load float, float addrspace(1)* %a + %a.fabs = call float @llvm.fabs.f32(float %a.val) + %r.val = fptrunc float %a.fabs to half + store half %r.val, half addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}fneg_fabs_fptrunc_f32_to_f16: +; GCN: buffer_load_dword v[[A_F32:[0-9]+]] +; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], -|v[[A_F32]]| +; GCN: buffer_store_short v[[R_F16]] +; GCN: s_endpgm +define void @fneg_fabs_fptrunc_f32_to_f16( + half addrspace(1)* %r, + float addrspace(1)* %a) { +entry: + %a.val = load float, float addrspace(1)* %a + %a.fabs = call float @llvm.fabs.f32(float %a.val) + %a.fneg.fabs = fsub float -0.0, %a.fabs + %r.val = fptrunc float %a.fneg.fabs to half + store half %r.val, half addrspace(1)* %r + ret void +} + +declare float @llvm.fabs.f32(float) #1 + +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/v_mac_f16.ll b/test/CodeGen/AMDGPU/v_mac_f16.ll index 20c1d2310d3..413f3f337d3 100644 --- a/test/CodeGen/AMDGPU/v_mac_f16.ll +++ b/test/CodeGen/AMDGPU/v_mac_f16.ll @@ -31,9 +31,10 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_f16_same_add +; GCN-LABEL: {{^}}mac_f16_same_add: ; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]] ; SI: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}} + ; VI: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]] ; VI: v_mac_f16_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm @@ -63,9 +64,11 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_f16_neg_a -; SI-NOT: v_mac_f32 -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN-LABEL: {{^}}mac_f16_neg_a: +; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}} +; SI-DAG: v_cvt_f32_f16_e32 [[CVT_OTHER:v[0-9]+]], v{{[0-9]+}} +; SI: v_mac_f32_e32 v{{[0-9]+}}, [[CVT_OTHER]], [[CVT_NEG]] + ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm @@ -87,9 +90,10 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_f16_neg_b -; SI-NOT: v_mac_f32 -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN-LABEL: {{^}}mac_f16_neg_b: +; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}} +; SI-DAG: v_cvt_f32_f16_e32 [[CVT_OTHER:v[0-9]+]], v{{[0-9]+}} +; SI: v_mac_f32_e32 v{{[0-9]+}}, [[CVT_OTHER]], [[CVT_NEG]] ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm @@ -111,9 +115,12 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_f16_neg_c -; SI-NOT: v_mac_f32 -; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} +; GCN-LABEL: {{^}}mac_f16_neg_c: +; SI: v_cvt_f32_f16_e32 +; SI-DAG: v_cvt_f32_f16_e32 +; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}} +; SI: v_mac_f32_e32 [[CVT_NEG]], v{{[0-9]+}}, v{{[0-9]+}} + ; VI-NOT: 
v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} ; GCN: s_endpgm @@ -207,9 +214,11 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_f16_neg_a_unsafe_fp_math -; SI-NOT: v_mac_f32 -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}} +; GCN-LABEL: {{^}}mac_f16_neg_a_unsafe_fp_math: +; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}} +; SI-DAG: v_cvt_f32_f16_e32 [[CVT_OTHER:v[0-9]+]], v{{[0-9]+}} +; SI: v_mac_f32_e32 v{{[0-9]+}}, [[CVT_OTHER]], [[CVT_NEG]] + ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}} ; GCN: s_endpgm @@ -231,9 +240,11 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_f16_neg_b_unsafe_fp_math -; SI-NOT: v_mac_f32 -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}} +; GCN-LABEL: {{^}}mac_f16_neg_b_unsafe_fp_math: +; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}} +; SI-DAG: v_cvt_f32_f16_e32 [[CVT_OTHER:v[0-9]+]], v{{[0-9]+}} +; SI: v_mac_f32_e32 v{{[0-9]+}}, [[CVT_OTHER]], [[CVT_NEG]] + ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}} ; GCN: s_endpgm @@ -255,9 +266,12 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_f16_neg_c_unsafe_fp_math -; SI-NOT: v_mac_f32 -; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]}} +; GCN-LABEL: {{^}}mac_f16_neg_c_unsafe_fp_math: +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}} +; SI: v_mac_f32_e32 [[CVT_NEG]], v{{[0-9]+}}, v{{[0-9]+}} + ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]}} ; GCN: s_endpgm @@ -279,7 +293,7 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_v2f16 +; GCN-LABEL: {{^}}mac_v2f16: ; GCN: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]] ; GCN: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]] ; GCN: {{buffer|flat}}_load_dword v[[C_V2_F16:[0-9]+]] @@ -322,7 +336,7 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_v2f16_same_add +; GCN-LABEL: {{^}}mac_v2f16_same_add: ; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD0:v[0-9]+]] ; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD1:v[0-9]+]] ; SI: v_mac_f32_e32 [[ADD0]], v{{[0-9]+}}, v{{[0-9]+}} @@ -358,10 +372,13 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_a -; SI-NOT: v_mac_f32 -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN-LABEL: {{^}}mac_v2f16_neg_a: +; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}} +; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}} + +; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG0]] +; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG1]] + ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} @@ -385,9 +402,12 @@ entry: } ; GCN-LABEL: {{^}}mac_v2f16_neg_b -; SI-NOT: v_mac_f32 -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}} +; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}} +; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG0]] +; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG1]] + + ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} @@ -410,10 +430,13 @@ entry: ret void } 
-; GCN-LABEL: {{^}}mac_v2f16_neg_c -; SI-NOT: v_mac_f32 -; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} +; GCN-LABEL: {{^}}mac_v2f16_neg_c: +; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}} +; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}} + +; SI-DAG: v_mac_f32_e32 [[CVT_NEG0]], v{{[0-9]+}}, v{{[0-9]+}} +; SI-DAG: v_mac_f32_e32 [[CVT_NEG1]], v{{[0-9]+}}, v{{[0-9]+}} + ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} @@ -464,7 +487,7 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_b_safe_fp_math +; GCN-LABEL: {{^}}mac_v2f16_neg_b_safe_fp_math: ; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} ; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} ; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} @@ -492,7 +515,7 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_c_safe_fp_math +; GCN-LABEL: {{^}}mac_v2f16_neg_c_safe_fp_math: ; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} ; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} ; SI: v_mac_f32_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}} @@ -520,10 +543,13 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_a_unsafe_fp_math -; SI-NOT: v_mac_f32 -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} +; GCN-LABEL: {{^}}mac_v2f16_neg_a_unsafe_fp_math: +; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}} +; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}} + +; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG0]] +; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG1]] + ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} @@ -546,10 +572,13 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_b_unsafe_fp_math -; SI-NOT: v_mac_f32 -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} +; GCN-LABEL: {{^}}mac_v2f16_neg_b_unsafe_fp_math: +; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}} +; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}} + +; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG0]] +; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG1]] + ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} @@ -572,10 +601,13 @@ entry: ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_c_unsafe_fp_math -; SI-NOT: v_mac_f32 -; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}} -; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}} +; GCN-LABEL: {{^}}mac_v2f16_neg_c_unsafe_fp_math: +; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}} +; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}} + +; SI-DAG: v_mac_f32_e32 [[CVT_NEG0]], v{{[0-9]+}}, v{{[0-9]+}} +; SI-DAG: v_mac_f32_e32 [[CVT_NEG1]], v{{[0-9]+}}, v{{[0-9]+}} + ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}} ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}} diff --git a/test/MC/AMDGPU/gfx8_asm_all.s b/test/MC/AMDGPU/gfx8_asm_all.s index 30e7eeeae12..fcf6e29acfd 100644 --- a/test/MC/AMDGPU/gfx8_asm_all.s +++ b/test/MC/AMDGPU/gfx8_asm_all.s 
@@ -24330,14 +24330,16 @@ v_cvt_f32_f16_e64 v0, exec_hi
 v_cvt_f32_f16_e64 v0, 0
 // CHECK: [0x00,0x00,0x4b,0xd1,0x80,0x00,0x00,0x00]
 
+// FIXME: Parsing source modifiers
 v_cvt_f32_f16_e64 v0, -1
-// CHECK: [0x00,0x00,0x4b,0xd1,0xc1,0x00,0x00,0x00]
+// CHECK: [0x00,0x00,0x4b,0xd1,0x81,0x00,0x00,0x20]
 
 v_cvt_f32_f16_e64 v0, 0.5
 // CHECK: [0x00,0x00,0x4b,0xd1,0xf0,0x00,0x00,0x00]
 
+// FIXME: Parsing source modifiers
 v_cvt_f32_f16_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x4b,0xd1,0xf7,0x00,0x00,0x00]
+// CHECK: [0x00,0x00,0x4b,0xd1,0xf6,0x00,0x00,0x20]
 
 v_cvt_f32_f16_e64 v0, v0
 // CHECK: [0x00,0x00,0x4b,0xd1,0x00,0x01,0x00,0x00]
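
The combines and patterns in this patch all lean on the same IEEE half-precision bit layout: for an f16 value kept in an integer register, the sign bit is 0x8000, so fneg is an xor with 0x8000, fabs is an and with 0x7fff, and fneg(fabs x) is an or with 0x8000. The new SIInstructions.td patterns then fold those bit operations into the SRCMODS.NEG, SRCMODS.ABS, and SRCMODS.NEG_ABS source-modifier encodings of v_cvt_f32_f16_e64. The following is a minimal standalone C++ sketch, not part of the patch; the function names and the example values in main are illustrative only, while the SrcMods values mirror the SRCMODS definition added to SIInstrInfo.td.

// Standalone sketch: the f16 bit manipulations that the new FNEG/FABS DAG
// combines emit when f16 is not a legal type, and the VOP3 source-modifier
// encodings the f16_to_fp patterns fold them into.
#include <cstdio>

enum SrcMods { NONE = 0, NEG = 1, ABS = 2, NEG_ABS = 3 };

// fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000): flip the f16 sign bit.
unsigned fnegF16Bits(unsigned Bits) { return Bits ^ 0x8000; }

// fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff): clear the f16 sign bit.
unsigned fabsF16Bits(unsigned Bits) { return Bits & 0x7fff; }

// fneg (fabs x) -> or x, 0x8000: set the f16 sign bit unconditionally.
unsigned fnegFabsF16Bits(unsigned Bits) { return Bits | 0x8000; }

int main() {
  unsigned One = 0x3c00; // 1.0 in IEEE half precision
  printf("xor 0x8000: 0x%04x -> matched as SRCMODS %d (NEG)\n",
         fnegF16Bits(One), NEG);               // 0xbc00 == -1.0
  printf("and 0x7fff: 0x%04x -> matched as SRCMODS %d (ABS)\n",
         fabsF16Bits(0xbc00), ABS);            // 0x3c00 == 1.0
  printf("or  0x8000: 0x%04x -> matched as SRCMODS %d (NEG_ABS)\n",
         fnegFabsF16Bits(One), NEG_ABS);       // 0xbc00 == -1.0
  return 0;
}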