From: Oliver Stannard Date: Tue, 13 Jun 2017 13:04:32 +0000 (+0000) Subject: [ARM] Add scheduling classes for VFNM[AS] X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=8475234d5e24860ad8e29c20410f82271c117b11;p=llvm [ARM] Add scheduling classes for VFNM[AS] The VFNM[AS] instructions did not have scheduling information attached, which was causing assertion failures with the Cortex-A57 scheduling model and -fp-contract=fast, because the Cortex-A57 sched model claims to be complete. Differential Revision: https://reviews.llvm.org/D34139 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@305288 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index 817b567db76..5d887c4fcbf 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -2010,7 +2010,8 @@ def VFNMAD : ADbI<0b11101, 0b01, 1, 0, [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>; + Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>, + Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; def VFNMAS : ASbI<0b11101, 0b01, 1, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), @@ -2018,7 +2019,8 @@ def VFNMAS : ASbI<0b11101, 0b01, 1, 0, [(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> { + Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>, + Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines. } @@ -2028,7 +2030,8 @@ def VFNMAH : AHbI<0b11101, 0b01, 1, 0, IIC_fpFMAC16, "vfnma", ".f16\t$Sd, $Sn, $Sm", []>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasFullFP16,UseFusedMAC]>; + Requires<[HasFullFP16,UseFusedMAC]>, + Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin), (VFNMAD DPR:$dstin, DPR:$a, DPR:$b)>, @@ -2059,14 +2062,16 @@ def VFNMSD : ADbI<0b11101, 0b01, 0, 0, [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>; + Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>, + Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; def VFNMSS : ASbI<0b11101, 0b01, 0, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), IIC_fpFMAC32, "vfnms", ".f32\t$Sd, $Sn, $Sm", [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> { + Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>, + Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines. } @@ -2076,7 +2081,8 @@ def VFNMSH : AHbI<0b11101, 0b01, 0, 0, IIC_fpFMAC16, "vfnms", ".f16\t$Sd, $Sn, $Sm", []>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasFullFP16,UseFusedMAC]>; + Requires<[HasFullFP16,UseFusedMAC]>, + Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin), (VFNMSD DPR:$dstin, DPR:$a, DPR:$b)>, diff --git a/test/CodeGen/ARM/cortex-a57-misched-vfma.ll b/test/CodeGen/ARM/cortex-a57-misched-vfma.ll index 5f914323861..e234e179ed0 100644 --- a/test/CodeGen/ARM/cortex-a57-misched-vfma.ll +++ b/test/CodeGen/ARM/cortex-a57-misched-vfma.ll @@ -156,3 +156,41 @@ define <2 x float> @Test4(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2 %sub2 = fsub <2 x float> %sub1, %mul3 ret <2 x float> %sub2 } + +define float @Test5(float %f1, float %f2, float %f3) { +; CHECK: ********** MI Scheduling ********** +; CHECK: Test5:BB#0 + +; CHECK-DEFAULT: VNMLS +; CHECK-FAST: VFNMS +; CHECK: Latency : 9 +; CHECK: Successors: +; CHECK: data +; > VMLAS not-optimized latency to VMOVRS = 9 +; CHECK-SAME: Latency=9 + +; f1 * f2 - f3 ==> VNMLS/VFNMS + %mul = fmul float %f1, %f2 + %sub = fsub float %mul, %f3 + ret float %sub +} + + +define float @Test6(float %f1, float %f2, float %f3) { +; CHECK: ********** MI Scheduling ********** +; CHECK: Test6:BB#0 + +; CHECK-DEFAULT: VNMLA +; CHECK-FAST: VFNMA +; CHECK: Latency : 9 +; CHECK: Successors: +; CHECK: data +; > VMLAS not-optimized latency to VMOVRS = 9 +; CHECK-SAME: Latency=9 + +; f1 * f2 - f3 ==> VNMLA/VFNMA + %mul = fmul float %f1, %f2 + %sub1 = fsub float -0.0, %mul + %sub2 = fsub float %sub1, %f2 + ret float %sub2 +}