"true",
"Use NEON for single precision FP">;
+// On some processors, VLDn instructions that access unaligned data take one
+// extra cycle. Take that into account when computing operand latencies.
+def FeatureCheckVLDnAlign : SubtargetFeature<"vldn-align", "CheckVLDnAlign",
+ "true",
+ "Check for VLDn unaligned access">;
+
+// Some processors have a nonpipelined VFP coprocessor.
+def FeatureNonpipelinedVFP : SubtargetFeature<"nonpipelined-vfp",
+ "NonpipelinedVFP", "true",
+ "VFP instructions are not pipelined">;
+
// Some processors have FP multiply-accumulate instructions that don't
// play nicely with other VFP / NEON instructions, and it's generally better
// to just not use them.
def : ProcessorModel<"cortex-a8", CortexA8Model, [ARMv7a, ProcA8,
FeatureHasRetAddrStack,
+ FeatureNonpipelinedVFP,
FeatureTrustZone,
FeatureSlowFPBrcc,
FeatureHasSlowFPVMLx,
FeatureAvoidPartialCPSR,
FeaturePreferVMOVSR,
FeatureNEONForFPMovs,
+ FeatureCheckVLDnAlign,
FeatureMP]>;
// FIXME: A12 currently has the same schedule model as A9
FeatureT2XtPk,
FeatureVFP4,
FeatureMP,
+ FeatureCheckVLDnAlign,
FeatureHWDiv,
FeatureHWDivARM,
FeatureAvoidPartialCPSR,
// division features.
def : ProcessorModel<"krait", CortexA9Model, [ARMv7a, ProcKrait,
FeatureHasRetAddrStack,
+ FeatureCheckVLDnAlign,
FeatureVMLxForwarding,
FeatureT2XtPk,
FeatureFP16,
return Size / 4;
}
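+// Return the number of micro-ops for a load/store multiple on a subtarget
+// with SingleIssuePlusExtras timing: one per register transferred, one for
+// address computation, and extras for base register writeback and pc writes.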
+static unsigned getNumMicroOpsSingleIssuePlusExtras(unsigned Opc,
+ unsigned NumRegs) {
+  unsigned UOps = 1 + NumRegs; // 1 for address computation, 1 per register.
+ switch (Opc) {
+ default:
+ break;
+ case ARM::VLDMDIA_UPD:
+ case ARM::VLDMDDB_UPD:
+ case ARM::VLDMSIA_UPD:
+ case ARM::VLDMSDB_UPD:
+ case ARM::VSTMDIA_UPD:
+ case ARM::VSTMDDB_UPD:
+ case ARM::VSTMSIA_UPD:
+ case ARM::VSTMSDB_UPD:
+ case ARM::LDMIA_UPD:
+ case ARM::LDMDA_UPD:
+ case ARM::LDMDB_UPD:
+ case ARM::LDMIB_UPD:
+ case ARM::STMIA_UPD:
+ case ARM::STMDA_UPD:
+ case ARM::STMDB_UPD:
+ case ARM::STMIB_UPD:
+ case ARM::tLDMIA_UPD:
+ case ARM::tSTMIA_UPD:
+ case ARM::t2LDMIA_UPD:
+ case ARM::t2LDMDB_UPD:
+ case ARM::t2STMIA_UPD:
+ case ARM::t2STMDB_UPD:
+ ++UOps; // One for base register writeback.
+ break;
+ case ARM::LDMIA_RET:
+ case ARM::tPOP_RET:
+ case ARM::t2LDMIA_RET:
+ UOps += 2; // One for base reg wb, one for write to pc.
+ break;
+ }
+ return UOps;
+}
+
unsigned
ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
const MachineInstr *MI) const {
case ARM::t2STMIA_UPD:
case ARM::t2STMDB_UPD: {
unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands() + 1;
- if (Subtarget.isSwift()) {
- int UOps = 1 + NumRegs; // One for address computation, one for each ld / st.
- switch (Opc) {
- default: break;
- case ARM::VLDMDIA_UPD:
- case ARM::VLDMDDB_UPD:
- case ARM::VLDMSIA_UPD:
- case ARM::VLDMSDB_UPD:
- case ARM::VSTMDIA_UPD:
- case ARM::VSTMDDB_UPD:
- case ARM::VSTMSIA_UPD:
- case ARM::VSTMSDB_UPD:
- case ARM::LDMIA_UPD:
- case ARM::LDMDA_UPD:
- case ARM::LDMDB_UPD:
- case ARM::LDMIB_UPD:
- case ARM::STMIA_UPD:
- case ARM::STMDA_UPD:
- case ARM::STMDB_UPD:
- case ARM::STMIB_UPD:
- case ARM::tLDMIA_UPD:
- case ARM::tSTMIA_UPD:
- case ARM::t2LDMIA_UPD:
- case ARM::t2LDMDB_UPD:
- case ARM::t2STMIA_UPD:
- case ARM::t2STMDB_UPD:
- ++UOps; // One for base register writeback.
- break;
- case ARM::LDMIA_RET:
- case ARM::tPOP_RET:
- case ARM::t2LDMIA_RET:
- UOps += 2; // One for base reg wb, one for write to pc.
- break;
- }
- return UOps;
- } else if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
+ switch (Subtarget.getLdStMultipleTiming()) {
+ case ARMSubtarget::SingleIssuePlusExtras:
+ return getNumMicroOpsSingleIssuePlusExtras(Opc, NumRegs);
+ case ARMSubtarget::SingleIssue:
+ // Assume the worst.
+ return NumRegs;
+ case ARMSubtarget::DoubleIssue: {
if (NumRegs < 4)
return 2;
// 4 registers would be issued: 2, 2.
// 5 registers would be issued: 2, 2, 1.
- int A8UOps = (NumRegs / 2);
+ unsigned UOps = (NumRegs / 2);
if (NumRegs % 2)
- ++A8UOps;
- return A8UOps;
- } else if (Subtarget.isLikeA9()) {
- int A9UOps = (NumRegs / 2);
+ ++UOps;
+ return UOps;
+ }
+ case ARMSubtarget::DoubleIssueCheckUnalignedAccess: {
+ unsigned UOps = (NumRegs / 2);
        // If there is an odd number of registers or if it's not 64-bit aligned,
// then it takes an extra AGU (Address Generation Unit) cycle.
- if ((NumRegs % 2) ||
- !MI->hasOneMemOperand() ||
+ if ((NumRegs % 2) || !MI->hasOneMemOperand() ||
(*MI->memoperands_begin())->getAlignment() < 8)
- ++A9UOps;
- return A9UOps;
- } else {
- // Assume the worst.
- return NumRegs;
+ ++UOps;
+ return UOps;
+ }
}
}
}
+ llvm_unreachable("Didn't find the number of microops");
}
int
}
}
- if (DefAlign < 8 && Subtarget.isLikeA9()) {
+ if (DefAlign < 8 && Subtarget.checkVLDnAccessAlignment()) {
switch (DefMCID->getOpcode()) {
default: break;
case ARM::VLD1q8:
if (!UseNode->isMachineOpcode()) {
int Latency = ItinData->getOperandCycle(DefMCID.getSchedClass(), DefIdx);
- if (Subtarget.isLikeA9() || Subtarget.isSwift())
- return Latency <= 2 ? 1 : Latency - 1;
- else
- return Latency <= 3 ? 1 : Latency - 2;
+ int Adj = Subtarget.getPreISelOperandLatencyAdjustment();
+ int Threshold = 1 + Adj;
+ return Latency <= Threshold ? 1 : Latency - Adj;
}
const MCInstrDesc &UseMCID = get(UseNode->getMachineOpcode());
}
}
- if (DefAlign < 8 && Subtarget.isLikeA9())
+ if (DefAlign < 8 && Subtarget.checkVLDnAccessAlignment())
switch (DefMCID.getOpcode()) {
default: break;
case ARM::VLD1q8:
const MachineInstr *UseMI, unsigned UseIdx) const {
unsigned DDomain = DefMI->getDesc().TSFlags & ARMII::DomainMask;
unsigned UDomain = UseMI->getDesc().TSFlags & ARMII::DomainMask;
- if (Subtarget.isCortexA8() &&
+ if (Subtarget.nonpipelinedVFP() &&
(DDomain == ARMII::DomainVFP || UDomain == ARMII::DomainVFP))
- // CortexA8 VFP instructions are not pipelined.
return true;
// Hoist VFP / NEON instructions with 4 or higher latency.
Has8MSecExt(false), HasCrypto(false), HasCRC(false), HasRAS(false),
HasZeroCycleZeroing(false), IsProfitableToUnpredicate(false),
HasSlowVGETLNi32(false), HasSlowVDUP32(false), PreferVMOVSR(false),
- PreferISHST(false), UseNEONForFPMovs(false), StrictAlign(false),
- RestrictIT(false), HasDSP(false), UseNaClTrap(false), GenLongCalls(false),
+ PreferISHST(false), UseNEONForFPMovs(false), CheckVLDnAlign(false),
+ NonpipelinedVFP(false), StrictAlign(false), RestrictIT(false),
+ HasDSP(false), UseNaClTrap(false), GenLongCalls(false),
UnsafeFPMath(false), UseSjLjEH(false), stackAlignment(4), CPUString(CPU),
- IsLittle(IsLittle), TargetTriple(TT), Options(TM.Options), TM(TM),
+ MaxInterleaveFactor(1), LdStMultipleTiming(SingleIssue),
+ PreISelOperandLatencyAdjustment(2), IsLittle(IsLittle), TargetTriple(TT),
+ Options(TM.Options), TM(TM),
FrameLowering(initializeFrameLowering(CPU, FS)),
// At this point initializeSubtargetDependencies has been called so
// we can query directly.
if ((Bits[ARM::ProcA5] || Bits[ARM::ProcA8]) && // Where this matters
(Options.UnsafeFPMath || isTargetDarwin()))
UseNEONForSinglePrecisionFP = true;
+
+ // FIXME: Teach TableGen to deal with these instead of doing it manually here.
+ switch (ARMProcFamily) {
+ case Others:
+ case CortexA5:
+ break;
+ case CortexA7:
+ LdStMultipleTiming = DoubleIssue;
+ break;
+ case CortexA8:
+ LdStMultipleTiming = DoubleIssue;
+ break;
+ case CortexA9:
+ LdStMultipleTiming = DoubleIssueCheckUnalignedAccess;
+ PreISelOperandLatencyAdjustment = 1;
+ break;
+ case CortexA12:
+ break;
+ case CortexA15:
+ MaxInterleaveFactor = 2;
+ PreISelOperandLatencyAdjustment = 1;
+ break;
+ case CortexA17:
+ case CortexA32:
+ case CortexA35:
+ case CortexA53:
+ case CortexA57:
+ case CortexA72:
+ case CortexA73:
+ case CortexR4:
+ case CortexR4F:
+ case CortexR5:
+ case CortexR7:
+ case CortexM3:
+ case ExynosM1:
+ break;
+ case Krait:
+ PreISelOperandLatencyAdjustment = 1;
+ break;
+ case Swift:
+ MaxInterleaveFactor = 2;
+ LdStMultipleTiming = SingleIssuePlusExtras;
+ PreISelOperandLatencyAdjustment = 1;
+ break;
+ }
}
bool ARMSubtarget::isAPCS_ABI() const {
ARMv7m, ARMv7em, ARMv8a, ARMv81a, ARMv82a, ARMv8mMainline, ARMv8mBaseline
};
+public:
+  /// Possible timings of load multiple / store multiple instructions.
+ enum ARMLdStMultipleTiming {
+ /// Can load/store 2 registers/cycle.
+ DoubleIssue,
+ /// Can load/store 2 registers/cycle, but needs an extra cycle if the access
+ /// is not 64-bit aligned.
+ DoubleIssueCheckUnalignedAccess,
+ /// Can load/store 1 register/cycle.
+ SingleIssue,
+ /// Can load/store 1 register/cycle, but needs an extra cycle for address
+ /// computation and potentially also for register writeback.
+ SingleIssuePlusExtras,
+ };
+
+protected:
/// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others.
ARMProcFamilyEnum ARMProcFamily;
/// If true, VMOVRS, VMOVSR and VMOVS will be converted from VFP to NEON.
bool UseNEONForFPMovs;
+ /// If true, VLDn instructions take an extra cycle for unaligned accesses.
+ bool CheckVLDnAlign;
+
+ /// If true, VFP instructions are not pipelined.
+ bool NonpipelinedVFP;
+
/// StrictAlign - If true, the subtarget disallows unaligned memory
/// accesses for some types. For details, see
/// ARMTargetLowering::allowsMisalignedMemoryAccesses().
/// CPUString - String name of used CPU.
std::string CPUString;
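+
+  /// The maximum interleave factor that is profitable on this subtarget,
+  /// reported to the vectorizer via ARMTTIImpl::getMaxInterleaveFactor.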
+ unsigned MaxInterleaveFactor;
+
+  /// The timing of load multiple / store multiple instructions on this
+  /// subtarget (double issue, single issue, etc.).
+ ARMLdStMultipleTiming LdStMultipleTiming;
+
+ /// The adjustment that we need to apply to get the operand latency from the
+ /// operand cycle returned by the itinerary data for pre-ISel operands.
+ int PreISelOperandLatencyAdjustment;
+
/// IsLittle - The target is Little Endian
bool IsLittle;
bool preferVMOVSR() const { return PreferVMOVSR; }
bool preferISHSTBarriers() const { return PreferISHST; }
bool useNEONForFPMovs() const { return UseNEONForFPMovs; }
+ bool checkVLDnAccessAlignment() const { return CheckVLDnAlign; }
+ bool nonpipelinedVFP() const { return NonpipelinedVFP; }
bool prefers32BitThumb() const { return Pref32BitThumb; }
bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; }
bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; }
/// function for this subtarget.
unsigned getStackAlignment() const { return stackAlignment; }
+ unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
+
+ ARMLdStMultipleTiming getLdStMultipleTiming() const {
+ return LdStMultipleTiming;
+ }
+
+ int getPreISelOperandLatencyAdjustment() const {
+ return PreISelOperandLatencyAdjustment;
+ }
+
/// GVIsIndirectSymbol - true if the GV will be accessed via an indirect
/// symbol.
bool GVIsIndirectSymbol(const GlobalValue *GV, Reloc::Model RelocM) const;
}
unsigned getMaxInterleaveFactor(unsigned VF) {
- // These are out of order CPUs:
- if (ST->isCortexA15() || ST->isSwift())
- return 2;
- return 1;
+ return ST->getMaxInterleaveFactor();
}
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);