From: Sjoerd Meijer Date: Thu, 24 Aug 2017 09:21:10 +0000 (+0000) Subject: [AArch64] Custom lowering of copysign f16 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9a6d31e0cadea820af7d9f3adf0169d0b760dd0e;p=llvm [AArch64] Custom lowering of copysign f16 This is a follow up patch of r311154 and introduces custom lowering of copysign f16 to avoid promotions to single precision types when the subtarget supports fullfp16. Differential Revision: https://reviews.llvm.org/D36893 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@311646 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 03b4edfd333..d674062276e 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -321,6 +321,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FPOW, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); + if (Subtarget->hasFullFP16()) + setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom); + else + setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); setOperationAction(ISD::FREM, MVT::f16, Promote); setOperationAction(ISD::FPOW, MVT::f16, Promote); @@ -333,7 +337,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FLOG, MVT::f16, Promote); setOperationAction(ISD::FLOG2, MVT::f16, Promote); setOperationAction(ISD::FLOG10, MVT::f16, Promote); - setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); if (!Subtarget->hasFullFP16()) { setOperationAction(ISD::SELECT, MVT::f16, Promote); @@ -4084,25 +4087,26 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL)); EVT VecVT; - EVT EltVT; uint64_t EltMask; SDValue VecVal1, VecVal2; - if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { - EltVT = MVT::i32; - VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32); - EltMask = 0x80000000ULL; + auto setVecVal = [&] (int Idx) { if (!VT.isVector()) { - VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, + VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1); - VecVal2 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, + VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2); } else { VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); } + }; + + if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { + VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32); + EltMask = 0x80000000ULL; + setVecVal(AArch64::ssub); } else if (VT == MVT::f64 || VT == MVT::v2f64) { - EltVT = MVT::i64; VecVT = MVT::v2i64; // We want to materialize a mask with the high bit set, but the AdvSIMD @@ -4110,15 +4114,11 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, // 64-bit elements. Instead, materialize zero and then negate it. EltMask = 0; - if (!VT.isVector()) { - VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, - DAG.getUNDEF(VecVT), In1); - VecVal2 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, - DAG.getUNDEF(VecVT), In2); - } else { - VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); - VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); - } + setVecVal(AArch64::dsub); + } else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) { + VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16); + EltMask = 0x8000ULL; + setVecVal(AArch64::hsub); } else { llvm_unreachable("Invalid type for copysign!"); } @@ -4136,6 +4136,8 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, SDValue Sel = DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); + if (VT == MVT::f16) + return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Sel); if (VT == MVT::f32) return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel); else if (VT == MVT::f64) diff --git a/test/CodeGen/AArch64/f16-instructions.ll b/test/CodeGen/AArch64/f16-instructions.ll index 22102bbc057..1bec17f78ad 100644 --- a/test/CodeGen/AArch64/f16-instructions.ll +++ b/test/CodeGen/AArch64/f16-instructions.ll @@ -934,37 +934,57 @@ define half @test_maxnum(half %a, half %b) #0 { ret half %r } -; CHECK-COMMON-LABEL: test_copysign: -; CHECK-COMMON-NEXT: fcvt s1, h1 -; CHECK-COMMON-NEXT: fcvt s0, h0 -; CHECK-COMMON-NEXT: movi.4s v2, #128, lsl #24 -; CHECK-COMMON-NEXT: bit.16b v0, v1, v2 -; CHECK-COMMON-NEXT: fcvt h0, s0 -; CHECK-COMMON-NEXT: ret +; CHECK-CVT-LABEL: test_copysign: +; CHECK-CVT-NEXT: fcvt s1, h1 +; CHECK-CVT-NEXT: fcvt s0, h0 +; CHECK-CVT-NEXT: movi.4s v2, #128, lsl #24 +; CHECK-CVT-NEXT: bit.16b v0, v1, v2 +; CHECK-CVT-NEXT: fcvt h0, s0 +; CHECK-CVT-NEXT: ret + +; CHECK-FP16-LABEL: test_copysign: +; CHECK-FP16-NEXT: movi.8h v2, #128, lsl #8 +; CHECK-FP16-NEXT: bit.16b v0, v1, v2 +; CHECK-FP16-NEXT: ret + define half @test_copysign(half %a, half %b) #0 { %r = call half @llvm.copysign.f16(half %a, half %b) ret half %r } -; CHECK-COMMON-LABEL: test_copysign_f32: -; CHECK-COMMON-NEXT: fcvt s0, h0 -; CHECK-COMMON-NEXT: movi.4s v2, #128, lsl #24 -; CHECK-COMMON-NEXT: bit.16b v0, v1, v2 -; CHECK-COMMON-NEXT: fcvt h0, s0 -; CHECK-COMMON-NEXT: ret +; CHECK-CVT-LABEL: test_copysign_f32: +; CHECK-CVT-NEXT: fcvt s0, h0 +; CHECK-CVT-NEXT: movi.4s v2, #128, lsl #24 +; CHECK-CVT-NEXT: bit.16b v0, v1, v2 +; CHECK-CVT-NEXT: fcvt h0, s0 +; CHECK-CVT-NEXT: ret + +; CHECK-FP16-LABEL: test_copysign_f32: +; CHECK-FP16-NEXT: fcvt h1, s1 +; CHECK-FP16-NEXT: movi.8h v2, #128, lsl #8 +; CHECK-FP16-NEXT: bit.16b v0, v1, v2 +; CHECK-FP16-NEXT: ret + define half @test_copysign_f32(half %a, float %b) #0 { %tb = fptrunc float %b to half %r = call half @llvm.copysign.f16(half %a, half %tb) ret half %r } -; CHECK-COMMON-LABEL: test_copysign_f64: -; CHECK-COMMON-NEXT: fcvt s1, d1 -; CHECK-COMMON-NEXT: fcvt s0, h0 -; CHECK-COMMON-NEXT: movi.4s v2, #128, lsl #24 -; CHECK-COMMON-NEXT: bit.16b v0, v1, v2 -; CHECK-COMMON-NEXT: fcvt h0, s0 -; CHECK-COMMON-NEXT: ret +; CHECK-CVT-LABEL: test_copysign_f64: +; CHECK-CVT-NEXT: fcvt s1, d1 +; CHECK-CVT-NEXT: fcvt s0, h0 +; CHECK-CVT-NEXT: movi.4s v2, #128, lsl #24 +; CHECK-CVT-NEXT: bit.16b v0, v1, v2 +; CHECK-CVT-NEXT: fcvt h0, s0 +; CHECK-CVT-NEXT: ret + +; CHECK-FP16-LABEL: test_copysign_f64: +; CHECK-FP16-NEXT: fcvt h1, d1 +; CHECK-FP16-NEXT: movi.8h v2, #128, lsl #8 +; CHECK-FP16-NEXT: bit.16b v0, v1, v2 +; CHECK-FP16-NEXT: ret + define half @test_copysign_f64(half %a, double %b) #0 { %tb = fptrunc double %b to half %r = call half @llvm.copysign.f16(half %a, half %tb) @@ -974,12 +994,19 @@ define half @test_copysign_f64(half %a, double %b) #0 { ; Check that the FP promotion will use a truncating FP_ROUND, so we can fold ; away the (fpext (fp_round )) here. -; CHECK-COMMON-LABEL: test_copysign_extended: -; CHECK-COMMON-NEXT: fcvt s1, h1 -; CHECK-COMMON-NEXT: fcvt s0, h0 -; CHECK-COMMON-NEXT: movi.4s v2, #128, lsl #24 -; CHECK-COMMON-NEXT: bit.16b v0, v1, v2 -; CHECK-COMMON-NEXT: ret +; CHECK-CVT-LABEL: test_copysign_extended: +; CHECK-CVT-NEXT: fcvt s1, h1 +; CHECK-CVT-NEXT: fcvt s0, h0 +; CHECK-CVT-NEXT: movi.4s v2, #128, lsl #24 +; CHECK-CVT-NEXT: bit.16b v0, v1, v2 +; CHECK-CVT-NEXT: ret + +; CHECK-FP16-LABEL: test_copysign_extended: +; CHECK-FP16-NEXT: movi.8h v2, #128, lsl #8 +; CHECK-FP16-NEXT: bit.16b v0, v1, v2 +; CHECK-FP16-NEXT: fcvt s0, h0 +; CHECK-FP16-NEXT: ret + define float @test_copysign_extended(half %a, half %b) #0 { %r = call half @llvm.copysign.f16(half %a, half %b) %xr = fpext half %r to float