From: Matthias Braun Date: Tue, 4 Oct 2016 19:28:21 +0000 (+0000) Subject: AArch64: Macrofusion: Split features, add missing combinations. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=627f6167c25474c686ac3b734138a781da588194;p=llvm AArch64: Macrofusion: Split features, add missing combinations. AArch64InstrInfo::shouldScheduleAdjacent() determines whether two instructions can benefit from macroop fusion on Apple CPUs. The list turned out to be incomplete: - the "rr" variants of the instructions were missing - even the "rs" variants can have shift value == 0 and behave like the "rr" variants This also splits the MacroOpFusion target feature into ArithmeticBccFusion and ArithmeticCbzFusion. Differential Revision: https://reviews.llvm.org/D25142 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@283243 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index 5c66748cee6..2ff3cf45a84 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -94,9 +94,13 @@ def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature< "alternate-sextload-cvt-f32-pattern", "UseAlternateSExtLoadCVTF32Pattern", "true", "Use alternative pattern for sextload convert to f32">; -def FeatureMacroOpFusion : SubtargetFeature< - "macroop-fusion", "HasMacroOpFusion", "true", - "CPU supports macro op fusion">; +def FeatureArithmeticBccFusion : SubtargetFeature< + "arith-bcc-fusion", "HasArithmeticBccFusion", "true", + "CPU fuses arithmetic+bcc operations">; + +def FeatureArithmeticCbzFusion : SubtargetFeature< + "arith-cbz-fusion", "HasArithmeticCbzFusion", "true", + "CPU fuses arithmetic + cbz/cbnz operations">; def FeatureDisableLatencySchedHeuristic : SubtargetFeature< "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true", @@ -204,7 +208,8 @@ def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone", FeatureCrypto, 
FeatureDisableLatencySchedHeuristic, FeatureFPARMv8, - FeatureMacroOpFusion, + FeatureArithmeticBccFusion, + FeatureArithmeticCbzFusion, FeatureNEON, FeaturePerfMon, FeatureSlowMisaligned128Store, @@ -244,7 +249,7 @@ def ProcVulcan : SubtargetFeature<"vulcan", "ARMProcFamily", "Vulcan", FeatureCRC, FeatureCrypto, FeatureFPARMv8, - FeatureMacroOpFusion, + FeatureArithmeticBccFusion, FeatureNEON, FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive, diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index 859f7828901..b26dbce1875 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1876,39 +1876,80 @@ bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr &First, MachineInstr &Second) const { - if (Subtarget.hasMacroOpFusion()) { + if (Subtarget.hasArithmeticBccFusion()) { // Fuse CMN, CMP, TST followed by Bcc. unsigned SecondOpcode = Second.getOpcode(); if (SecondOpcode == AArch64::Bcc) { switch (First.getOpcode()) { default: return false; - case AArch64::SUBSWri: case AArch64::ADDSWri: - case AArch64::ANDSWri: - case AArch64::SUBSXri: + case AArch64::ADDSWrr: case AArch64::ADDSXri: + case AArch64::ADDSXrr: + case AArch64::ANDSWri: + case AArch64::ANDSWrr: case AArch64::ANDSXri: + case AArch64::ANDSXrr: + case AArch64::SUBSWri: + case AArch64::SUBSWrr: + case AArch64::SUBSXri: + case AArch64::SUBSXrr: + case AArch64::BICSWrr: + case AArch64::BICSXrr: return true; + case AArch64::ADDSWrs: + case AArch64::ADDSXrs: + case AArch64::ANDSWrs: + case AArch64::ANDSXrs: + case AArch64::SUBSWrs: + case AArch64::SUBSXrs: + case AArch64::BICSWrs: + case AArch64::BICSXrs: + // Shift value can be 0 making these behave like the "rr" variant... + return !hasShiftedReg(Second); } } + } + if (Subtarget.hasArithmeticCbzFusion()) { // Fuse ALU operations followed by CBZ/CBNZ. 
+ unsigned SecondOpcode = Second.getOpcode(); if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX || SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) { switch (First.getOpcode()) { default: return false; case AArch64::ADDWri: + case AArch64::ADDWrr: case AArch64::ADDXri: + case AArch64::ADDXrr: case AArch64::ANDWri: + case AArch64::ANDWrr: case AArch64::ANDXri: + case AArch64::ANDXrr: case AArch64::EORWri: + case AArch64::EORWrr: case AArch64::EORXri: + case AArch64::EORXrr: case AArch64::ORRWri: + case AArch64::ORRWrr: case AArch64::ORRXri: + case AArch64::ORRXrr: case AArch64::SUBWri: + case AArch64::SUBWrr: case AArch64::SUBXri: + case AArch64::SUBXrr: return true; + case AArch64::ADDWrs: + case AArch64::ADDXrs: + case AArch64::ANDWrs: + case AArch64::ANDXrs: + case AArch64::SUBWrs: + case AArch64::SUBXrs: + case AArch64::BICWrs: + case AArch64::BICXrs: + // Shift value can be 0 making these behave like the "rr" variant... + return !hasShiftedReg(Second); } } } diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index 9f51c6be635..a21dbd8322f 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -80,7 +80,8 @@ protected: bool Misaligned128StoreIsSlow = false; bool AvoidQuadLdStPairs = false; bool UseAlternateSExtLoadCVTF32Pattern = false; - bool HasMacroOpFusion = false; + bool HasArithmeticBccFusion = false; + bool HasArithmeticCbzFusion = false; bool DisableLatencySchedHeuristic = false; uint8_t MaxInterleaveFactor = 2; uint8_t VectorInsertExtractBaseCost = 3; @@ -188,7 +189,8 @@ public: bool useAlternateSExtLoadCVTF32Pattern() const { return UseAlternateSExtLoadCVTF32Pattern; } - bool hasMacroOpFusion() const { return HasMacroOpFusion; } + bool hasArithmeticBccFusion() const { return HasArithmeticBccFusion; } + bool hasArithmeticCbzFusion() const { return HasArithmeticCbzFusion; } unsigned getMaxInterleaveFactor() const { return 
MaxInterleaveFactor; } unsigned getVectorInsertExtractBaseCost() const { return VectorInsertExtractBaseCost; diff --git a/test/CodeGen/AArch64/misched-fusion.ll b/test/CodeGen/AArch64/misched-fusion.ll index 0f4c0ac84ce..d5140c62f66 100644 --- a/test/CodeGen/AArch64/misched-fusion.ll +++ b/test/CodeGen/AArch64/misched-fusion.ll @@ -1,4 +1,4 @@ -; RUN: llc -o - %s -mattr=+macroop-fusion,+use-postra-scheduler | FileCheck %s +; RUN: llc -o - %s -mattr=+arith-cbz-fusion,+use-postra-scheduler | FileCheck %s ; RUN: llc -o - %s -mcpu=cyclone | FileCheck %s target triple = "arm64-apple-ios"