From 5ae42c7d63acab17352921404d3e9c35f38829d3 Mon Sep 17 00:00:00 2001 From: John Brawn Date: Wed, 28 Jun 2017 14:11:15 +0000 Subject: [PATCH] [ARM] Improve if-conversion for M-class CPUs without branch predictors The current heuristic in isProfitableToIfCvt assumes we have a branch predictor, and so gives the wrong answer in some cases when we don't. This patch adds a subtarget feature to indicate that a subtarget has no branch predictor, and changes the heuristic in isProfitableToIfCvt when it's present. This gives a slight overall improvement in a set of embedded benchmarks on Cortex-M4 and Cortex-M33. Differential Revision: https://reviews.llvm.org/D34398 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@306547 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARM.td | 26 ++- lib/Target/ARM/ARMBaseInstrInfo.cpp | 45 ++++- lib/Target/ARM/ARMSchedule.td | 1 + lib/Target/ARM/ARMScheduleM3.td | 21 +++ lib/Target/ARM/ARMSubtarget.h | 6 + .../Thumb2/ifcvt-no-branch-predictor.ll | 154 ++++++++++++++++++ 6 files changed, 239 insertions(+), 14 deletions(-) create mode 100644 lib/Target/ARM/ARMScheduleM3.td create mode 100644 test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index 840600914b1..c52a1d7611d 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -222,6 +222,13 @@ def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop", def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack", "HasRetAddrStack", "true", "Has return address stack">; +// Some processors have no branch predictor, which changes the expected cost of +// taking a branch which affects the choice of whether to use predicated +// instructions. +def FeatureHasNoBranchPredictor : SubtargetFeature<"no-branch-predictor", + "HasBranchPredictor", "false", + "Has no branch predictor">; + /// DSP extension.
def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", "Supports DSP instructions in ARM and/or Thumb2">; @@ -756,13 +763,19 @@ def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r, FeatureHasSlowFPVMLx, FeatureAvoidPartialCPSR]>; -def : ProcNoItin<"cortex-m3", [ARMv7m, ProcM3]>; -def : ProcNoItin<"sc300", [ARMv7m, ProcM3]>; +def : ProcessorModel<"cortex-m3", CortexM3Model, [ARMv7m, + ProcM3, + FeatureHasNoBranchPredictor]>; + +def : ProcessorModel<"sc300", CortexM3Model, [ARMv7m, + ProcM3, + FeatureHasNoBranchPredictor]>; -def : ProcNoItin<"cortex-m4", [ARMv7em, +def : ProcessorModel<"cortex-m4", CortexM3Model, [ARMv7em, FeatureVFP4, FeatureVFPOnlySP, - FeatureD16]>; + FeatureD16, + FeatureHasNoBranchPredictor]>; def : ProcNoItin<"cortex-m7", [ARMv7em, FeatureFPARMv8, @@ -771,11 +784,12 @@ def : ProcNoItin<"cortex-m7", [ARMv7em, def : ProcNoItin<"cortex-m23", [ARMv8mBaseline, FeatureNoMovt]>; -def : ProcNoItin<"cortex-m33", [ARMv8mMainline, +def : ProcessorModel<"cortex-m33", CortexM3Model, [ARMv8mMainline, FeatureDSP, FeatureFPARMv8, FeatureD16, - FeatureVFPOnlySP]>; + FeatureVFPOnlySP, + FeatureHasNoBranchPredictor]>; def : ProcNoItin<"cortex-a32", [ARMv8a, FeatureHWDivThumb, diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index e0810c358f2..1ec6b24b2ed 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -1851,9 +1851,9 @@ isProfitableToIfCvt(MachineBasicBlock &MBB, } bool ARMBaseInstrInfo:: -isProfitableToIfCvt(MachineBasicBlock &, +isProfitableToIfCvt(MachineBasicBlock &TBB, unsigned TCycles, unsigned TExtra, - MachineBasicBlock &, + MachineBasicBlock &FBB, unsigned FCycles, unsigned FExtra, BranchProbability Probability) const { if (!TCycles) @@ -1863,14 +1863,43 @@ isProfitableToIfCvt(MachineBasicBlock &, // Here we scale up each component of UnpredCost to avoid precision issue when // scaling TCycles/FCycles by Probability. 
const unsigned ScalingUpFactor = 1024; - unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor); - unsigned FUnpredCost = + + unsigned PredCost = (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor; + unsigned UnpredCost; + if (!Subtarget.hasBranchPredictor()) { + // When we don't have a branch predictor it's always cheaper to not take a + // branch than take it, so we have to take that into account. + unsigned NotTakenBranchCost = 1; + unsigned TakenBranchCost = Subtarget.getMispredictionPenalty(); + unsigned TUnpredCycles, FUnpredCycles; + if (!FCycles) { + // Triangle: TBB is the fallthrough + TUnpredCycles = TCycles + NotTakenBranchCost; + FUnpredCycles = TakenBranchCost; + } else { + // Diamond: TBB is the block that is branched to, FBB is the fallthrough + TUnpredCycles = TCycles + TakenBranchCost; + FUnpredCycles = FCycles + NotTakenBranchCost; + } + // The total cost is the cost of each path scaled by their probabilites + unsigned TUnpredCost = Probability.scale(TUnpredCycles * ScalingUpFactor); + unsigned FUnpredCost = Probability.getCompl().scale(FUnpredCycles * ScalingUpFactor); + UnpredCost = TUnpredCost + FUnpredCost; + // When predicating assume that the first IT can be folded away but later + // ones cost one cycle each + if (Subtarget.isThumb2() && TCycles + FCycles > 4) { + PredCost += ((TCycles + FCycles - 4) / 4) * ScalingUpFactor; + } + } else { + unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor); + unsigned FUnpredCost = Probability.getCompl().scale(FCycles * ScalingUpFactor); - unsigned UnpredCost = TUnpredCost + FUnpredCost; - UnpredCost += 1 * ScalingUpFactor; // The branch itself - UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10; + UnpredCost = TUnpredCost + FUnpredCost; + UnpredCost += 1 * ScalingUpFactor; // The branch itself + UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10; + } - return (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor <= 
UnpredCost; + return PredCost <= UnpredCost; } bool diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td index 1c7902520f2..53e012f13ee 100644 --- a/lib/Target/ARM/ARMSchedule.td +++ b/lib/Target/ARM/ARMSchedule.td @@ -424,3 +424,4 @@ include "ARMScheduleA9.td" include "ARMScheduleSwift.td" include "ARMScheduleR52.td" include "ARMScheduleA57.td" +include "ARMScheduleM3.td" diff --git a/lib/Target/ARM/ARMScheduleM3.td b/lib/Target/ARM/ARMScheduleM3.td new file mode 100644 index 00000000000..93f8299f9bd --- /dev/null +++ b/lib/Target/ARM/ARMScheduleM3.td @@ -0,0 +1,21 @@ +//=- ARMScheduleM3.td - ARM Cortex-M3 Scheduling Definitions -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for the ARM Cortex-M3 processor. +// +//===----------------------------------------------------------------------===// + +def CortexM3Model : SchedMachineModel { + let IssueWidth = 1; // Only IT can be dual-issued, so assume single-issue + let MicroOpBufferSize = 0; // In-order + let LoadLatency = 2; // Latency when not pipelined, not pc-relative + let MispredictPenalty = 2; // Best case branch taken cost + + let CompleteModel = 0; +} diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index d890d0fa777..e15b17512c9 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -246,6 +246,11 @@ protected: /// avoid issue "normal" call instructions to callees which do not return. bool HasRetAddrStack = false; + /// HasBranchPredictor - True if the subtarget has a branch predictor. Having + /// a branch predictor or not changes the expected cost of taking a branch + /// which affects the choice of whether to use predicated instructions. 
+ bool HasBranchPredictor = true; + /// HasMPExtension - True if the subtarget supports Multiprocessing /// extension (ARMv7 only). bool HasMPExtension = false; @@ -554,6 +559,7 @@ public: bool cheapPredicableCPSRDef() const { return CheapPredicableCPSRDef; } bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; } bool hasRetAddrStack() const { return HasRetAddrStack; } + bool hasBranchPredictor() const { return HasBranchPredictor; } bool hasMPExtension() const { return HasMPExtension; } bool hasDSP() const { return HasDSP; } bool useNaClTrap() const { return UseNaClTrap; } diff --git a/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll b/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll new file mode 100644 index 00000000000..9fcc0f5d617 --- /dev/null +++ b/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll @@ -0,0 +1,154 @@ +; RUN: llc < %s -mtriple=thumbv7m -mcpu=cortex-m7 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BP +; RUN: llc < %s -mtriple=thumbv7m -mcpu=cortex-m3 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOBP + +declare void @otherfn() + +; CHECK-LABEL: triangle1: +; CHECK: itt ne +; CHECK: movne +; CHECK: strne +define i32 @triangle1(i32 %n, i32* %p) { +entry: + %tobool = icmp eq i32 %n, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: + store i32 1, i32* %p, align 4 + br label %if.end + +if.end: + tail call void @otherfn() + ret i32 0 +} + +; CHECK-LABEL: triangle2: +; CHECK-BP: itttt ne +; CHECK-BP: movne +; CHECK-BP: strne +; CHECK-BP: movne +; CHECK-BP: strne +; CHECK-NOBP: cbz +; CHECK-NOBP: movs +; CHECK-NOBP: str +; CHECK-NOBP: movs +; CHECK-NOBP: str +define i32 @triangle2(i32 %n, i32* %p, i32* %q) { +entry: + %tobool = icmp eq i32 %n, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: + store i32 1, i32* %p, align 4 + store i32 2, i32* %q, align 4 + br label %if.end + +if.end: + tail call void @otherfn() + ret i32 0 +} + +; CHECK-LABEL: triangle3: +; CHECK: cbz +; CHECK: movs +; CHECK: 
str +; CHECK: movs +; CHECK: str +; CHECK: movs +; CHECK: str +define i32 @triangle3(i32 %n, i32* %p, i32* %q, i32* %r) { +entry: + %tobool = icmp eq i32 %n, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: + store i32 1, i32* %p, align 4 + store i32 2, i32* %q, align 4 + store i32 3, i32* %r, align 4 + br label %if.end + +if.end: + tail call void @otherfn() + ret i32 0 +} + +; CHECK-LABEL: diamond1: +; CHECK: ite eq +; CHECK: ldreq +; CHECK: strne +define i32 @diamond1(i32 %n, i32* %p) { +entry: + %tobool = icmp eq i32 %n, 0 + br i1 %tobool, label %if.else, label %if.then + +if.then: + store i32 %n, i32* %p, align 4 + br label %if.end + +if.else: + %0 = load i32, i32* %p, align 4 + br label %if.end + +if.end: + %n.addr.0 = phi i32 [ %n, %if.then ], [ %0, %if.else ] + tail call void @otherfn() + ret i32 %n.addr.0 +} + +; CHECK-LABEL: diamond2: +; CHECK-BP: itte +; CHECK-BP: streq +; CHECK-BP: ldreq +; CHECK-BP: strne +; CHECK-NOBP: cbz +; CHECK-NOBP: str +; CHECK-NOBP: b +; CHECK-NOBP: str +; CHECK-NOBP: ldr +define i32 @diamond2(i32 %n, i32 %m, i32* %p, i32* %q) { +entry: + %tobool = icmp eq i32 %n, 0 + br i1 %tobool, label %if.else, label %if.then + +if.then: + store i32 %n, i32* %p, align 4 + br label %if.end + +if.else: + store i32 %m, i32* %q, align 4 + %0 = load i32, i32* %p, align 4 + br label %if.end + +if.end: + %n.addr.0 = phi i32 [ %n, %if.then ], [ %0, %if.else ] + tail call void @otherfn() + ret i32 %n.addr.0 +} + +; CHECK-LABEL: diamond3: +; CHECK: cbz +; CHECK: movs +; CHECK: str +; CHECK: b +; CHECK: ldr +; CHECK: ldr +; CHECK: adds +define i32 @diamond3(i32 %n, i32* %p, i32* %q) { +entry: + %tobool = icmp eq i32 %n, 0 + br i1 %tobool, label %if.else, label %if.then + +if.then: + store i32 1, i32* %p, align 4 + br label %if.end + +if.else: + %0 = load i32, i32* %p, align 4 + %1 = load i32, i32* %q, align 4 + %add = add nsw i32 %1, %0 + br label %if.end + +if.end: + %n.addr.0 = phi i32 [ %n, %if.then ], [ %add, %if.else ] + tail call void 
@otherfn() + ret i32 %n.addr.0 +} -- 2.40.0