#ifndef LLVM_MC_MCSCHEDULE_H
#define LLVM_MC_MCSCHEDULE_H
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Optional.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/Support/DataTypes.h"
getReciprocalThroughput(const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
const MCInst &Inst) const;
+  /// Returns the maximum forwarding delay for register reads dependent on
+  /// writes of scheduling class WriteResourceID.
+  ///
+  /// A ReadAdvance entry with negative cycles models a bypass/forwarding
+  /// delay; this helper reports the worst such delay as a positive count.
+  static unsigned getForwardingDelayCycles(ArrayRef<MCReadAdvanceEntry> Entries,
+                                           unsigned WriteResourceID = 0);
+
/// Returns the default initialized model.
static const MCSchedModel &GetDefaultSchedModel() { return Default; }
static const MCSchedModel Default;
return 0;
}
+  /// Return the set of ReadAdvance entries declared by the scheduling class
+  /// descriptor in input.
+  ///
+  /// NOTE: the returned ArrayRef is a non-owning view into ReadAdvanceTable;
+  /// it stays valid only as long as the owning object's scheduling tables do.
+  ArrayRef<MCReadAdvanceEntry>
+  getReadAdvanceEntries(const MCSchedClassDesc &SC) const {
+    // Scheduling classes with no ReadAdvance annotations yield an empty view.
+    if (!SC.NumReadAdvanceEntries)
+      return ArrayRef<MCReadAdvanceEntry>();
+    return ArrayRef<MCReadAdvanceEntry>(&ReadAdvanceTable[SC.ReadAdvanceIdx],
+                                        SC.NumReadAdvanceEntries);
+  }
+
/// Get scheduling itinerary of a CPU.
InstrItineraryData getInstrItineraryForCPU(StringRef CPU) const;
unsigned MaxLatency;
// Number of MicroOps for this instruction.
unsigned NumMicroOps;
+ // SchedClassID used to construct this InstrDesc.
+ // This information is currently used by views to do fast queries on the
+ // subtarget when computing the reciprocal throughput.
+ unsigned SchedClassID;
bool MayLoad;
bool MayStore;
TargetSchedModel TSchedModel;
TSchedModel.init(this);
unsigned Latency = TSchedModel.computeInstrLatency(&MI);
+
+ // Add extra latency due to forwarding delays.
+ const MCSchedClassDesc &SCDesc = *TSchedModel.resolveSchedClass(&MI);
+ Latency +=
+ MCSchedModel::getForwardingDelayCycles(getReadAdvanceEntries(SCDesc));
+
double RThroughput = TSchedModel.computeReciprocalThroughput(&MI);
return createSchedInfoStr(Latency, RThroughput);
}
TargetSchedModel TSchedModel;
TSchedModel.init(this);
unsigned Latency;
- if (TSchedModel.hasInstrSchedModel())
+ if (TSchedModel.hasInstrSchedModel()) {
Latency = TSchedModel.computeInstrLatency(MCI);
- else if (TSchedModel.hasInstrItineraries()) {
+ // Add extra latency due to forwarding delays.
+ const MCSchedModel &SM = *TSchedModel.getMCSchedModel();
+ unsigned SClassID = getInstrInfo()->get(MCI.getOpcode()).getSchedClass();
+ while (SM.getSchedClassDesc(SClassID)->isVariant())
+ SClassID = resolveVariantSchedClass(SClassID, &MCI, SM.ProcID);
+ const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SClassID);
+ Latency +=
+ MCSchedModel::getForwardingDelayCycles(getReadAdvanceEntries(SCDesc));
+ } else if (TSchedModel.hasInstrItineraries()) {
auto *ItinData = TSchedModel.getInstrItineraries();
Latency = ItinData->getStageLatency(
getInstrInfo()->get(MCI.getOpcode()).getSchedClass());
// that it can execute at the maximum default issue width.
return 1.0 / DefaultIssueWidth;
}
+
+
+/// Compute the worst-case forwarding (bypass) delay, in cycles, implied by
+/// the given ReadAdvance entries for the write resource WriteResourceID.
+unsigned
+MCSchedModel::getForwardingDelayCycles(ArrayRef<MCReadAdvanceEntry> Entries,
+                                       unsigned WriteResourceID) {
+  if (Entries.empty())
+    return 0;
+
+  // ReadAdvance cycles are negative when the read must be delayed (e.g.
+  // ReadAdvance<ReadInt2Fpu, -6>) and non-negative when the operand can be
+  // read early.  Starting from 0 and taking the minimum keeps only the most
+  // negative (worst) delay; accelerating entries contribute nothing.
+  int DelayCycles = 0;
+  for (const MCReadAdvanceEntry &E : Entries) {
+    if (E.WriteResourceID != WriteResourceID)
+      continue;
+    DelayCycles = std::min(DelayCycles, E.Cycles);
+  }
+
+  // Report the delay as a non-negative cycle count.
+  return std::abs(DelayCycles);
+}
// Create a new empty descriptor.
std::unique_ptr<InstrDesc> ID = llvm::make_unique<InstrDesc>();
ID->NumMicroOps = SCDesc.NumMicroOps;
+ ID->SchedClassID = SchedClassID;
if (MCDesc.isCall() && FirstCallInst) {
// We don't correctly model calls.
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
GR32orGR64:$src2, imm:$src3))]>,
- Sched<[WriteVecInsert]>;
+ Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
def MMX_PINSRWrm : MMXIi8<0xC4, MRMSrcMem,
(outs VR64:$dst),
"vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
- Sched<[WriteVecInsert]>;
+ Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
def rm : Ii8<0xC4, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1,
i16mem:$src2, u8imm:$src3),
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
- Sched<[WriteVecInsert]>;
+ Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i8mem:$src2, u8imm:$src3),
!if(Is2Addr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
- Sched<[WriteVecInsert]>;
+ Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i32mem:$src2, u8imm:$src3),
!if(Is2Addr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
(v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
- Sched<[WriteVecInsert]>;
+ Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i64mem:$src2, u8imm:$src3),
!if(Is2Addr,
def : ReadAdvance<ReadAfterVecXLd, 5>;
def : ReadAdvance<ReadAfterVecYLd, 6>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when queued in the reservation station.
def : ReadAdvance<ReadAfterVecXLd, 6>;
def : ReadAdvance<ReadAfterVecYLd, 7>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when queued in the reservation station.
def : ReadAdvance<ReadAfterVecXLd, 6>;
def : ReadAdvance<ReadAfterVecYLd, 7>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when queued in the reservation station.
def : ReadAdvance<ReadAfterVecXLd, 6>;
def : ReadAdvance<ReadAfterVecYLd, 7>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when queued in the reservation station.
def : ReadAdvance<ReadAfterVecXLd, 6>;
def : ReadAdvance<ReadAfterVecYLd, 7>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when queued in the reservation station.
def ReadAfterVecXLd : SchedRead;
def ReadAfterVecYLd : SchedRead;
+// Instructions that move data between general purpose registers and vector
+// registers may be subject to extra latency due to data bypass delays.
+// This SchedRead describes a bypass delay caused by data being moved from the
+// integer unit to the floating point unit.
+def ReadInt2Fpu : SchedRead;
+
// Instructions with both a load and a store folded are modeled as a folded
// load + WriteRMW.
def WriteRMW : SchedWrite;
def : ReadAdvance<ReadAfterVecXLd, 3>;
def : ReadAdvance<ReadAfterVecYLd, 3>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when dispatched by the schedulers.
def : ReadAdvance<ReadAfterVecXLd, 5>;
def : ReadAdvance<ReadAfterVecYLd, 5>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// A folded store needs a cycle on the PdStore for the store data.
def : WriteRes<WriteRMW, [PdStore]>;
def : ReadAdvance<ReadAfterVecXLd, 5>;
def : ReadAdvance<ReadAfterVecYLd, 5>;
+/// "Additional 6 cycle transfer operation which moves a floating point
+/// operation input value from the integer unit to the floating point unit."
+/// Reference: AMD Fam16h SOG (Appendix A "Instruction Latencies", Section A.2).
+def : ReadAdvance<ReadInt2Fpu, -6>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when dispatched by the schedulers.
// Vector insert/extract operations.
////////////////////////////////////////////////////////////////////////////////
-defm : X86WriteRes<WriteVecInsert, [JFPU01, JVALU], 7, [1,1], 2>;
+defm : X86WriteRes<WriteVecInsert, [JFPU01, JVALU], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecInsertLd, [JFPU01, JVALU, JLAGU], 4, [1,1,1], 1>;
defm : X86WriteRes<WriteVecExtract, [JFPU0, JFPA, JALU0], 3, [1,1,1], 1>;
defm : X86WriteRes<WriteVecExtractSt, [JFPU1, JSTC, JSAGU], 3, [1,1,1], 1>;
def : ReadAdvance<ReadAfterVecXLd, 3>;
def : ReadAdvance<ReadAfterVecYLd, 3>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when queued in the reservation station.
def : ReadAdvance<ReadAfterVecXLd, 8>;
def : ReadAdvance<ReadAfterVecYLd, 8>;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
// The Integer PRF for Zen is 168 entries, and it holds the architectural and
// speculative version of the 64-bit integer registers.
// Reference: "Software Optimization Guide for AMD Family 17h Processors"
;
; BTVER2-LABEL: test_pinsrw:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: pinsrw $0, %edi, %mm0 # sched: [7:0.50]
; BTVER2-NEXT: movswl (%rsi), %eax # sched: [4:1.00]
+; BTVER2-NEXT: pinsrw $0, %edi, %mm0 # sched: [7:0.50]
; BTVER2-NEXT: pinsrw $1, %eax, %mm0 # sched: [7:0.50]
; BTVER2-NEXT: movq %mm0, %rax # sched: [4:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; BTVER2-SSE-LABEL: test_pinsrq:
; BTVER2-SSE: # %bb.0:
-; BTVER2-SSE-NEXT: pinsrq $1, %rdi, %xmm0 # sched: [7:0.50]
; BTVER2-SSE-NEXT: pinsrq $1, (%rsi), %xmm1 # sched: [4:1.00]
+; BTVER2-SSE-NEXT: pinsrq $1, %rdi, %xmm0 # sched: [7:0.50]
; BTVER2-SSE-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
; BTVER2-SSE-NEXT: retq # sched: [4:1.00]
;
; BTVER2-LABEL: test_pinsrq:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [7:0.50]
; BTVER2-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [4:1.00]
+; BTVER2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [7:0.50]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
# CHECK: Iterations: 500
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 7003
+# CHECK-NEXT: Total Cycles: 1003
# CHECK-NEXT: Total uOps: 2000
# CHECK: Dispatch Width: 2
-# CHECK-NEXT: uOps Per Cycle: 0.29
-# CHECK-NEXT: IPC: 0.14
+# CHECK-NEXT: uOps Per Cycle: 1.99
+# CHECK-NEXT: IPC: 1.00
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Instruction Info:
# CHECK: Iterations: 500
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 7003
+# CHECK-NEXT: Total Cycles: 1003
# CHECK-NEXT: Total uOps: 2000
# CHECK: Dispatch Width: 2
-# CHECK-NEXT: uOps Per Cycle: 0.29
-# CHECK-NEXT: IPC: 0.14
+# CHECK-NEXT: uOps Per Cycle: 1.99
+# CHECK-NEXT: IPC: 1.00
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Instruction Info:
# CHECK: Iterations: 500
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 7003
+# CHECK-NEXT: Total Cycles: 1003
# CHECK-NEXT: Total uOps: 2000
# CHECK: Dispatch Width: 2
-# CHECK-NEXT: uOps Per Cycle: 0.29
-# CHECK-NEXT: IPC: 0.14
+# CHECK-NEXT: uOps Per Cycle: 1.99
+# CHECK-NEXT: IPC: 1.00
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Instruction Info:
# CHECK: Iterations: 500
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 7003
+# CHECK-NEXT: Total Cycles: 1003
# CHECK-NEXT: Total uOps: 2000
# CHECK: Dispatch Width: 2
-# CHECK-NEXT: uOps Per Cycle: 0.29
-# CHECK-NEXT: IPC: 0.14
+# CHECK-NEXT: uOps Per Cycle: 1.99
+# CHECK-NEXT: IPC: 1.00
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Instruction Info:
# CHECK: Iterations: 500
# CHECK-NEXT: Instructions: 1500
-# CHECK-NEXT: Total Cycles: 7004
+# CHECK-NEXT: Total Cycles: 1509
# CHECK-NEXT: Total uOps: 2500
# CHECK: Dispatch Width: 2
-# CHECK-NEXT: uOps Per Cycle: 0.36
-# CHECK-NEXT: IPC: 0.21
+# CHECK-NEXT: uOps Per Cycle: 1.66
+# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Instruction Info:
# CHECK-NEXT: - - - - - 1.00 - - - - - 1.00 - - vpinsrb $1, %eax, %xmm0, %xmm0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 012345
+# CHECK-NEXT: 01234567
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeER . . . . . . . . . addl %eax, %eax
-# CHECK-NEXT: [0,1] .DeeeeeeeER . . . . . . . vpinsrb $0, %eax, %xmm0, %xmm0
-# CHECK-NEXT: [0,2] . D======eeeeeeeER . . . . . . vpinsrb $1, %eax, %xmm0, %xmm0
-# CHECK-NEXT: [1,0] . DeE-----------R . . . . . . addl %eax, %eax
-# CHECK-NEXT: [1,1] . D===========eeeeeeeER. . . . . vpinsrb $0, %eax, %xmm0, %xmm0
-# CHECK-NEXT: [1,2] . D=================eeeeeeeER . . . vpinsrb $1, %eax, %xmm0, %xmm0
-# CHECK-NEXT: [2,0] . .DeE----------------------R . . . addl %eax, %eax
-# CHECK-NEXT: [2,1] . . D======================eeeeeeeER . . vpinsrb $0, %eax, %xmm0, %xmm0
-# CHECK-NEXT: [2,2] . . D============================eeeeeeeER vpinsrb $1, %eax, %xmm0, %xmm0
+# CHECK: [0,0] DeER . . . . addl %eax, %eax
+# CHECK-NEXT: [0,1] .D======eER . . vpinsrb $0, %eax, %xmm0, %xmm0
+# CHECK-NEXT: [0,2] . D======eER . . vpinsrb $1, %eax, %xmm0, %xmm0
+# CHECK-NEXT: [1,0] . DeE-----R . . addl %eax, %eax
+# CHECK-NEXT: [1,1] . D======eER . . vpinsrb $0, %eax, %xmm0, %xmm0
+# CHECK-NEXT: [1,2] . D======eER. . vpinsrb $1, %eax, %xmm0, %xmm0
+# CHECK-NEXT: [2,0] . .DeE-----R. . addl %eax, %eax
+# CHECK-NEXT: [2,1] . . D======eER. vpinsrb $0, %eax, %xmm0, %xmm0
+# CHECK-NEXT: [2,2] . . D======eER vpinsrb $1, %eax, %xmm0, %xmm0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 3 1.0 1.0 11.0 addl %eax, %eax
-# CHECK-NEXT: 1. 3 12.0 0.0 0.0 vpinsrb $0, %eax, %xmm0, %xmm0
-# CHECK-NEXT: 2. 3 18.0 0.0 0.0 vpinsrb $1, %eax, %xmm0, %xmm0
+# CHECK-NEXT: 0. 3 1.0 1.0 3.3 addl %eax, %eax
+# CHECK-NEXT: 1. 3 7.0 0.0 0.0 vpinsrb $0, %eax, %xmm0, %xmm0
+# CHECK-NEXT: 2. 3 7.0 0.0 0.0 vpinsrb $1, %eax, %xmm0, %xmm0
const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID);
unsigned NumMicroOpcodes = SCDesc.NumMicroOps;
unsigned Latency = MCSchedModel::computeInstrLatency(STI, SCDesc);
+ // Add extra latency due to delays in the forwarding data paths.
+ Latency += MCSchedModel::getForwardingDelayCycles(
+ STI.getReadAdvanceEntries(SCDesc));
Optional<double> RThroughput =
MCSchedModel::getReciprocalThroughput(STI, SCDesc);