From: Evandro Menezes Date: Fri, 20 Jul 2018 16:49:28 +0000 (+0000) Subject: [ARM] Add new feature to enable optimizing the VFP registers X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=6649de34e98afa7ecf997cfbbb38e094ab5fe1b2;p=llvm [ARM] Add new feature to enable optimizing the VFP registers Enable the optimization of operations on DPR and SPR via a feature instead of checking the target. Differential revision: https://reviews.llvm.org/D49463 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@337575 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/ARM/A15SDOptimizer.cpp b/lib/Target/ARM/A15SDOptimizer.cpp index b7f7cb20315..be88fe4ddb1 100644 --- a/lib/Target/ARM/A15SDOptimizer.cpp +++ b/lib/Target/ARM/A15SDOptimizer.cpp @@ -660,8 +660,9 @@ bool A15SDOptimizer::runOnMachineFunction(MachineFunction &Fn) { const ARMSubtarget &STI = Fn.getSubtarget<ARMSubtarget>(); // Since the A15SDOptimizer pass can insert VDUP instructions, it can only be // enabled when NEON is available. - if (!(STI.isCortexA15() && STI.hasNEON())) + if (!(STI.useSplatVFPToNeon() && STI.hasNEON())) return false; + TII = STI.getInstrInfo(); TRI = STI.getRegisterInfo(); MRI = &Fn.getRegInfo(); diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index 3b2136aaed7..742b3551889 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -195,6 +195,13 @@ def FeatureDontWidenVMOVS : SubtargetFeature<"dont-widen-vmovs", "DontWidenVMOVS", "true", "Don't widen VMOVS to VMOVD">; +// Some targets (e.g. Cortex-A15) prefer to avoid mixing operations on different +// VFP register widths. +def FeatureSplatVFPToNeon : SubtargetFeature<"splat-vfp-neon", + "SplatVFPToNeon", "true", + "Splat register from VFP to NEON", + [FeatureDontWidenVMOVS]>; + // Whether or not it is profitable to expand VFP/NEON MLA/MLS instructions. 
def FeatureExpandMLx : SubtargetFeature<"expand-fp-mlx", "ExpandMLx", "true", @@ -819,6 +826,7 @@ def : ProcessorModel<"cortex-a12", CortexA9Model, [ARMv7a, ProcA12, def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15, FeatureDontWidenVMOVS, + FeatureSplatVFPToNeon, FeatureHasRetAddrStack, FeatureMuxedUnits, FeatureTrustZone, diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 93856e3dc38..165077926c8 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -352,9 +352,12 @@ protected: /// If true, the AGU and NEON/FPU units are multiplexed. bool HasMuxedUnits = false; - /// If true, VMOVS will never be widened to VMOVD + /// If true, VMOVS will never be widened to VMOVD. bool DontWidenVMOVS = false; + /// If true, splat a register between VFP and NEON instructions. + bool SplatVFPToNeon = false; + /// If true, run the MLx expansion pass. bool ExpandMLx = false; @@ -591,6 +594,7 @@ public: bool hasSlowLoadDSubregister() const { return SlowLoadDSubregister; } bool hasMuxedUnits() const { return HasMuxedUnits; } bool dontWidenVMOVS() const { return DontWidenVMOVS; } + bool useSplatVFPToNeon() const { return SplatVFPToNeon; } bool useNEONForFPMovs() const { return UseNEONForFPMovs; } bool checkVLDnAccessAlignment() const { return CheckVLDnAlign; } bool nonpipelinedVFP() const { return NonpipelinedVFP; } diff --git a/test/CodeGen/ARM/a15-SD-dep.ll b/test/CodeGen/ARM/a15-SD-dep.ll index 625c40eb416..d0edccbf433 100644 --- a/test/CodeGen/ARM/a15-SD-dep.ll +++ b/test/CodeGen/ARM/a15-SD-dep.ll @@ -1,8 +1,8 @@ -; RUN: llc -O1 -mcpu=cortex-a15 -mtriple=armv7-linux-gnueabi -disable-a15-sd-optimization -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-DISABLED %s -; RUN: llc -O1 -mcpu=cortex-a15 -mtriple=armv7-linux-gnueabi -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-ENABLED %s +; RUN: llc -O1 -mattr=+splat-vfp-neon -mtriple=armv7-linux-gnueabi -verify-machineinstrs 
-disable-a15-sd-optimization < %s | FileCheck -check-prefixes=CHECK,CHECK-DISABLED %s +; RUN: llc -O1 -mattr=-splat-vfp-neon -mtriple=armv7-linux-gnueabi -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,CHECK-DISABLED %s +; RUN: llc -O1 -mattr=+splat-vfp-neon -mtriple=armv7-linux-gnueabi -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,CHECK-ENABLED %s -; CHECK-ENABLED-LABEL: t1: -; CHECK-DISABLED-LABEL: t1: +; CHECK-LABEL: t1: define <2 x float> @t1(float %f) { ; CHECK-ENABLED: vdup.32 d{{[0-9]*}}, d0[0] ; CHECK-DISABLED-NOT: vdup.32 d{{[0-9]*}}, d0[0] @@ -11,8 +11,7 @@ define <2 x float> @t1(float %f) { ret <2 x float> %i2 } -; CHECK-ENABLED-LABEL: t2: -; CHECK-DISABLED-LABEL: t2: +; CHECK-LABEL: t2: define <4 x float> @t2(float %g, float %f) { ; CHECK-ENABLED: vdup.32 q{{[0-9]*}}, d0[0] ; CHECK-DISABLED-NOT: vdup.32 d{{[0-9]*}}, d0[0] @@ -21,8 +20,7 @@ define <4 x float> @t2(float %g, float %f) { ret <4 x float> %i2 } -; CHECK-ENABLED-LABEL: t3: -; CHECK-DISABLED-LABEL: t3: +; CHECK-LABEL: t3: define arm_aapcs_vfpcc <2 x float> @t3(float %f) { ; CHECK-ENABLED: vdup.32 d{{[0-9]*}}, d0[0] ; CHECK-DISABLED-NOT: vdup.32 d{{[0-9]*}}, d0[0] @@ -31,8 +29,7 @@ define arm_aapcs_vfpcc <2 x float> @t3(float %f) { ret <2 x float> %i2 } -; CHECK-ENABLED-LABEL: t4: -; CHECK-DISABLED-LABEL: t4: +; CHECK-LABEL: t4: define <2 x float> @t4(float %f) { ; CHECK-ENABLED: vdup.32 d{{[0-9]*}}, d0[0] ; CHECK-DISABLED-NOT: vdup @@ -45,8 +42,7 @@ b: ret <2 x float> %i2 } -; CHECK-ENABLED-LABEL: t5: -; CHECK-DISABLED-LABEL: t5: +; CHECK-LABEL: t5: define arm_aapcs_vfpcc <4 x float> @t5(<4 x float> %q, float %f) { ; CHECK-ENABLED: vdup.32 d{{[0-9]*}}, d{{[0-9]*}}[0] ; CHECK-ENABLED: vadd.f32 @@ -58,8 +54,7 @@ define arm_aapcs_vfpcc <4 x float> @t5(<4 x float> %q, float %f) { } ; Test that DPair can be successfully passed as QPR. 
-; CHECK-ENABLED-LABEL: test_DPair1: -; CHECK-DISABLED-LABEL: test_DPair1: +; CHECK-LABEL: test_DPair1: define void @test_DPair1(i32 %vsout, i8* nocapture %out, float %x, float %y) { entry: %0 = insertelement <4 x float> undef, float %x, i32 1 @@ -89,8 +84,7 @@ sw.epilog: ; preds = %entry ret void } -; CHECK-ENABLED-LABEL: test_DPair2: -; CHECK-DISABLED-LABEL: test_DPair2: +; CHECK-LABEL: test_DPair2: define void @test_DPair2(i32 %vsout, i8* nocapture %out, float %x) { entry: %0 = insertelement <4 x float> undef, float %x, i32 0