From 6b6a90fb8512840679035c82d0be6aa9cd777b2c Mon Sep 17 00:00:00 2001 From: Andrea Di Biagio Date: Wed, 23 Jan 2019 16:35:07 +0000 Subject: [PATCH] [MC][X86] Correctly model additional operand latency caused by transfer delays from the integer to the floating point unit. This patch adds a new ReadAdvance definition named ReadInt2Fpu. ReadInt2Fpu allows x86 scheduling models to accurately describe delays caused by data transfers from the integer unit to the floating point unit. ReadInt2Fpu currently defaults to a delay of zero cycles (i.e. no delay) for all x86 models excluding BtVer2. That means this patch is a functional change for the Jaguar cpu model only. Tablegen definitions for instructions (V)PINSR* have been updated to account for the new ReadInt2Fpu. That read is mapped to the GPR input operand. On Jaguar, int-to-fpu transfers are modeled as a +6cy delay. Before this patch, that extra delay was added to the opcode latency. In practice, the insert opcode only executes for 1cy. Most of the actual latency is contributed by the so-called operand-latency. According to the AMD SOG for family 16h, (V)PINSR* latency is defined by expression f+1, where f is defined as a forwarding delay from the integer unit to the fpu. When printing instruction latency from MCA (see InstructionInfoView.cpp) and LLC (only when flag -print-schedule is specified), we now need to account for any extra forwarding delays. We do this by checking if scheduling classes declare any negative ReadAdvance entries. Quoting a code comment in TargetSchedule.td: "A negative advance effectively increases latency, which may be used for cross-domain stalls". When computing the instruction latency for the purpose of our scheduling tests, we now add any extra delay to the formula. This avoids regressing existing codegen and mca schedule tests. It comes at the cost of an extra (but very simple) hook in MCSchedModel. 
Differential Revision: https://reviews.llvm.org/D57056 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@351965 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/MC/MCSchedule.h | 6 ++++ include/llvm/MC/MCSubtargetInfo.h | 10 ++++++ include/llvm/MCA/Instruction.h | 4 +++ lib/CodeGen/TargetSubtargetInfo.cpp | 18 ++++++++-- lib/MC/MCSchedule.cpp | 16 +++++++++ lib/MCA/InstrBuilder.cpp | 1 + lib/Target/X86/X86InstrMMX.td | 2 +- lib/Target/X86/X86InstrSSE.td | 8 ++--- lib/Target/X86/X86SchedBroadwell.td | 2 ++ lib/Target/X86/X86SchedHaswell.td | 2 ++ lib/Target/X86/X86SchedSandyBridge.td | 2 ++ lib/Target/X86/X86SchedSkylakeClient.td | 2 ++ lib/Target/X86/X86SchedSkylakeServer.td | 2 ++ lib/Target/X86/X86Schedule.td | 6 ++++ lib/Target/X86/X86ScheduleAtom.td | 2 ++ lib/Target/X86/X86ScheduleBdVer2.td | 2 ++ lib/Target/X86/X86ScheduleBtVer2.td | 7 +++- lib/Target/X86/X86ScheduleSLM.td | 2 ++ lib/Target/X86/X86ScheduleZnver1.td | 2 ++ test/CodeGen/X86/mmx-schedule.ll | 2 +- test/CodeGen/X86/sse41-schedule.ll | 4 +-- .../X86/BtVer2/int-to-fpu-forwarding-1.s | 24 ++++++------- .../X86/BtVer2/int-to-fpu-forwarding-3.s | 34 +++++++++---------- tools/llvm-mca/Views/InstructionInfoView.cpp | 3 ++ 24 files changed, 123 insertions(+), 40 deletions(-) diff --git a/include/llvm/MC/MCSchedule.h b/include/llvm/MC/MCSchedule.h index 25a3a9cdb8f..df3248ee6e8 100644 --- a/include/llvm/MC/MCSchedule.h +++ b/include/llvm/MC/MCSchedule.h @@ -14,6 +14,7 @@ #ifndef LLVM_MC_MCSCHEDULE_H #define LLVM_MC_MCSCHEDULE_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Optional.h" #include "llvm/Config/llvm-config.h" #include "llvm/Support/DataTypes.h" @@ -369,6 +370,11 @@ struct MCSchedModel { getReciprocalThroughput(const MCSubtargetInfo &STI, const MCInstrInfo &MCII, const MCInst &Inst) const; + /// Returns the maximum forwarding delay for register reads dependent on + /// writes of scheduling class WriteResourceIdx. 
+ static unsigned getForwardingDelayCycles(ArrayRef Entries, + unsigned WriteResourceIdx = 0); + /// Returns the default initialized model. static const MCSchedModel &GetDefaultSchedModel() { return Default; } static const MCSchedModel Default; diff --git a/include/llvm/MC/MCSubtargetInfo.h b/include/llvm/MC/MCSubtargetInfo.h index 03eea1e8dce..2ad72c3c325 100644 --- a/include/llvm/MC/MCSubtargetInfo.h +++ b/include/llvm/MC/MCSubtargetInfo.h @@ -152,6 +152,16 @@ public: return 0; } + /// Return the set of ReadAdvance entries declared by the scheduling class + /// descriptor in input. + ArrayRef + getReadAdvanceEntries(const MCSchedClassDesc &SC) const { + if (!SC.NumReadAdvanceEntries) + return ArrayRef(); + return ArrayRef(&ReadAdvanceTable[SC.ReadAdvanceIdx], + SC.NumReadAdvanceEntries); + } + /// Get scheduling itinerary of a CPU. InstrItineraryData getInstrItineraryForCPU(StringRef CPU) const; diff --git a/include/llvm/MCA/Instruction.h b/include/llvm/MCA/Instruction.h index 27d135e3510..3effa2b7654 100644 --- a/include/llvm/MCA/Instruction.h +++ b/include/llvm/MCA/Instruction.h @@ -332,6 +332,10 @@ struct InstrDesc { unsigned MaxLatency; // Number of MicroOps for this instruction. unsigned NumMicroOps; + // SchedClassID used to construct this InstrDesc. + // This information is currently used by views to do fast queries on the + // subtarget when computing the reciprocal throughput. + unsigned SchedClassID; bool MayLoad; bool MayStore; diff --git a/lib/CodeGen/TargetSubtargetInfo.cpp b/lib/CodeGen/TargetSubtargetInfo.cpp index c9f90b88cac..e34f9a1579d 100644 --- a/lib/CodeGen/TargetSubtargetInfo.cpp +++ b/lib/CodeGen/TargetSubtargetInfo.cpp @@ -88,6 +88,12 @@ std::string TargetSubtargetInfo::getSchedInfoStr(const MachineInstr &MI) const { TargetSchedModel TSchedModel; TSchedModel.init(this); unsigned Latency = TSchedModel.computeInstrLatency(&MI); + + // Add extra latency due to forwarding delays. 
+ const MCSchedClassDesc &SCDesc = *TSchedModel.resolveSchedClass(&MI); + Latency += + MCSchedModel::getForwardingDelayCycles(getReadAdvanceEntries(SCDesc)); + double RThroughput = TSchedModel.computeReciprocalThroughput(&MI); return createSchedInfoStr(Latency, RThroughput); } @@ -99,9 +105,17 @@ std::string TargetSubtargetInfo::getSchedInfoStr(MCInst const &MCI) const { TargetSchedModel TSchedModel; TSchedModel.init(this); unsigned Latency; - if (TSchedModel.hasInstrSchedModel()) + if (TSchedModel.hasInstrSchedModel()) { Latency = TSchedModel.computeInstrLatency(MCI); - else if (TSchedModel.hasInstrItineraries()) { + // Add extra latency due to forwarding delays. + const MCSchedModel &SM = *TSchedModel.getMCSchedModel(); + unsigned SClassID = getInstrInfo()->get(MCI.getOpcode()).getSchedClass(); + while (SM.getSchedClassDesc(SClassID)->isVariant()) + SClassID = resolveVariantSchedClass(SClassID, &MCI, SM.ProcID); + const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SClassID); + Latency += + MCSchedModel::getForwardingDelayCycles(getReadAdvanceEntries(SCDesc)); + } else if (TSchedModel.hasInstrItineraries()) { auto *ItinData = TSchedModel.getInstrItineraries(); Latency = ItinData->getStageLatency( getInstrInfo()->get(MCI.getOpcode()).getSchedClass()); diff --git a/lib/MC/MCSchedule.cpp b/lib/MC/MCSchedule.cpp index 6797a47c75a..1fc5ec5e975 100644 --- a/lib/MC/MCSchedule.cpp +++ b/lib/MC/MCSchedule.cpp @@ -149,3 +149,19 @@ MCSchedModel::getReciprocalThroughput(unsigned SchedClass, // that it can execute at the maximum default issue width. 
return 1.0 / DefaultIssueWidth; } + +unsigned +MCSchedModel::getForwardingDelayCycles(ArrayRef Entries, + unsigned WriteResourceID) { + if (Entries.empty()) + return 0; + + int DelayCycles = 0; + for (const MCReadAdvanceEntry &E : Entries) { + if (E.WriteResourceID != WriteResourceID) + continue; + DelayCycles = std::min(DelayCycles, E.Cycles); + } + + return std::abs(DelayCycles); +} diff --git a/lib/MCA/InstrBuilder.cpp b/lib/MCA/InstrBuilder.cpp index 4b0ec329f9e..1e08f898523 100644 --- a/lib/MCA/InstrBuilder.cpp +++ b/lib/MCA/InstrBuilder.cpp @@ -532,6 +532,7 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI) { // Create a new empty descriptor. std::unique_ptr ID = llvm::make_unique(); ID->NumMicroOps = SCDesc.NumMicroOps; + ID->SchedClassID = SchedClassID; if (MCDesc.isCall() && FirstCallInst) { // We don't correctly model calls. diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index c00c4f4ca09..8e2a45b1bed 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -543,7 +543,7 @@ let Predicates = [HasMMX, HasSSE1] in { "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, GR32orGR64:$src2, imm:$src3))]>, - Sched<[WriteVecInsert]>; + Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; def MMX_PINSRWrm : MMXIi8<0xC4, MRMSrcMem, (outs VR64:$dst), diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 807af7f4808..e6427b1764a 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -4122,7 +4122,7 @@ multiclass sse2_pinsrw { "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, - Sched<[WriteVecInsert]>; + Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; def rm : Ii8<0xC4, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i16mem:$src2, u8imm:$src3), @@ -5577,7 +5577,7 @@ multiclass SS41I_insert8 opc, string asm, bit Is2Addr = 
1> { "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, - Sched<[WriteVecInsert]>; + Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; def rm : SS4AIi8 opc, string asm, bit Is2Addr = 1> { "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>, - Sched<[WriteVecInsert]>; + Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; def rm : SS4AIi8 opc, string asm, bit Is2Addr = 1> { "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>, - Sched<[WriteVecInsert]>; + Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; def rm : SS4AIi8; def : ReadAdvance; def : ReadAdvance; +def : ReadAdvance; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when queued in the reservation station. diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td index 155d77224a6..dc040e0e7d3 100644 --- a/lib/Target/X86/X86SchedHaswell.td +++ b/lib/Target/X86/X86SchedHaswell.td @@ -86,6 +86,8 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +def : ReadAdvance; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when queued in the reservation station. diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td index 4f47acde7cf..503b905842b 100644 --- a/lib/Target/X86/X86SchedSandyBridge.td +++ b/lib/Target/X86/X86SchedSandyBridge.td @@ -76,6 +76,8 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +def : ReadAdvance; + // Many SchedWrites are defined in pairs with and without a folded load. 
// Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when queued in the reservation station. diff --git a/lib/Target/X86/X86SchedSkylakeClient.td b/lib/Target/X86/X86SchedSkylakeClient.td index 7d84d814d94..71045897376 100644 --- a/lib/Target/X86/X86SchedSkylakeClient.td +++ b/lib/Target/X86/X86SchedSkylakeClient.td @@ -80,6 +80,8 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +def : ReadAdvance; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when queued in the reservation station. diff --git a/lib/Target/X86/X86SchedSkylakeServer.td b/lib/Target/X86/X86SchedSkylakeServer.td index 0d452f9beb3..8bd80078891 100644 --- a/lib/Target/X86/X86SchedSkylakeServer.td +++ b/lib/Target/X86/X86SchedSkylakeServer.td @@ -80,6 +80,8 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +def : ReadAdvance; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when queued in the reservation station. diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index 886c3aef3f5..f50cb621046 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -17,6 +17,12 @@ def ReadAfterVecLd : SchedRead; def ReadAfterVecXLd : SchedRead; def ReadAfterVecYLd : SchedRead; +// Instructions that move data between general purpose registers and vector +// registers may be subject to extra latency due to data bypass delays. +// This SchedRead describes a bypass delay caused by data being moved from the +// integer unit to the floating point unit. +def ReadInt2Fpu : SchedRead; + // Instructions with both a load and a store folded are modeled as a folded // load + WriteRMW. 
def WriteRMW : SchedWrite; diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td index 779692b7da6..bf50aeee1df 100644 --- a/lib/Target/X86/X86ScheduleAtom.td +++ b/lib/Target/X86/X86ScheduleAtom.td @@ -46,6 +46,8 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +def : ReadAdvance; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when dispatched by the schedulers. diff --git a/lib/Target/X86/X86ScheduleBdVer2.td b/lib/Target/X86/X86ScheduleBdVer2.td index ca14ed478f0..90ca79915fa 100644 --- a/lib/Target/X86/X86ScheduleBdVer2.td +++ b/lib/Target/X86/X86ScheduleBdVer2.td @@ -250,6 +250,8 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +def : ReadAdvance; + // A folded store needs a cycle on the PdStore for the store data. def : WriteRes; diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td index 8d8de3e8e15..3a2ed733f56 100644 --- a/lib/Target/X86/X86ScheduleBtVer2.td +++ b/lib/Target/X86/X86ScheduleBtVer2.td @@ -108,6 +108,11 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +/// "Additional 6 cycle transfer operation which moves a floating point +/// operation input value from the integer unit to the floating point unit. +/// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2). +def : ReadAdvance; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when dispatched by the schedulers. @@ -540,7 +545,7 @@ defm : X86WriteResPairUnsupported; // Vector insert/extract operations. 
//////////////////////////////////////////////////////////////////////////////// -defm : X86WriteRes; +defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td index 5dca0ff7019..fc150fca545 100644 --- a/lib/Target/X86/X86ScheduleSLM.td +++ b/lib/Target/X86/X86ScheduleSLM.td @@ -52,6 +52,8 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +def : ReadAdvance; + // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops when queued in the reservation station. diff --git a/lib/Target/X86/X86ScheduleZnver1.td b/lib/Target/X86/X86ScheduleZnver1.td index 0407afc4203..1a75281cf0c 100644 --- a/lib/Target/X86/X86ScheduleZnver1.td +++ b/lib/Target/X86/X86ScheduleZnver1.td @@ -94,6 +94,8 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +def : ReadAdvance; + // The Integer PRF for Zen is 168 entries, and it holds the architectural and // speculative version of the 64-bit integer registers. 
// Reference: "Software Optimization Guide for AMD Family 17h Processors" diff --git a/test/CodeGen/X86/mmx-schedule.ll b/test/CodeGen/X86/mmx-schedule.ll index 51dc5e102ff..d423b9a2a90 100644 --- a/test/CodeGen/X86/mmx-schedule.ll +++ b/test/CodeGen/X86/mmx-schedule.ll @@ -3887,8 +3887,8 @@ define i64 @test_pinsrw(x86_mmx %a0, i32 %a1, i16* %a2) optsize { ; ; BTVER2-LABEL: test_pinsrw: ; BTVER2: # %bb.0: -; BTVER2-NEXT: pinsrw $0, %edi, %mm0 # sched: [7:0.50] ; BTVER2-NEXT: movswl (%rsi), %eax # sched: [4:1.00] +; BTVER2-NEXT: pinsrw $0, %edi, %mm0 # sched: [7:0.50] ; BTVER2-NEXT: pinsrw $1, %eax, %mm0 # sched: [7:0.50] ; BTVER2-NEXT: movq %mm0, %rax # sched: [4:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] diff --git a/test/CodeGen/X86/sse41-schedule.ll b/test/CodeGen/X86/sse41-schedule.ll index ea606463fc1..4870434a8ae 100644 --- a/test/CodeGen/X86/sse41-schedule.ll +++ b/test/CodeGen/X86/sse41-schedule.ll @@ -2679,15 +2679,15 @@ define <2 x i64> @test_pinsrq(<2 x i64> %a0, <2 x i64> %a1, i64 %a2, i64 *%a3) { ; ; BTVER2-SSE-LABEL: test_pinsrq: ; BTVER2-SSE: # %bb.0: -; BTVER2-SSE-NEXT: pinsrq $1, %rdi, %xmm0 # sched: [7:0.50] ; BTVER2-SSE-NEXT: pinsrq $1, (%rsi), %xmm1 # sched: [4:1.00] +; BTVER2-SSE-NEXT: pinsrq $1, %rdi, %xmm0 # sched: [7:0.50] ; BTVER2-SSE-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-SSE-NEXT: retq # sched: [4:1.00] ; ; BTVER2-LABEL: test_pinsrq: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [7:0.50] ; BTVER2-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [4:1.00] +; BTVER2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [7:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] ; diff --git a/test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-1.s b/test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-1.s index 9e44702ae79..398a52a8479 100644 --- a/test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-1.s +++ 
b/test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-1.s @@ -27,12 +27,12 @@ vpinsrq $1, %rax, %xmm0, %xmm0 # CHECK: Iterations: 500 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 7003 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 2000 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.29 -# CHECK-NEXT: IPC: 0.14 +# CHECK-NEXT: uOps Per Cycle: 1.99 +# CHECK-NEXT: IPC: 1.00 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: @@ -76,12 +76,12 @@ vpinsrq $1, %rax, %xmm0, %xmm0 # CHECK: Iterations: 500 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 7003 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 2000 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.29 -# CHECK-NEXT: IPC: 0.14 +# CHECK-NEXT: uOps Per Cycle: 1.99 +# CHECK-NEXT: IPC: 1.00 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: @@ -125,12 +125,12 @@ vpinsrq $1, %rax, %xmm0, %xmm0 # CHECK: Iterations: 500 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 7003 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 2000 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.29 -# CHECK-NEXT: IPC: 0.14 +# CHECK-NEXT: uOps Per Cycle: 1.99 +# CHECK-NEXT: IPC: 1.00 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: @@ -174,12 +174,12 @@ vpinsrq $1, %rax, %xmm0, %xmm0 # CHECK: Iterations: 500 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 7003 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 2000 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.29 -# CHECK-NEXT: IPC: 0.14 +# CHECK-NEXT: uOps Per Cycle: 1.99 +# CHECK-NEXT: IPC: 1.00 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: diff --git a/test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-3.s b/test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-3.s index 4e130be8597..00c13f9ef59 100644 --- a/test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-3.s +++ 
b/test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-3.s @@ -9,12 +9,12 @@ vpinsrb $1, %eax, %xmm0, %xmm0 # CHECK: Iterations: 500 # CHECK-NEXT: Instructions: 1500 -# CHECK-NEXT: Total Cycles: 7004 +# CHECK-NEXT: Total Cycles: 1509 # CHECK-NEXT: Total uOps: 2500 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.36 -# CHECK-NEXT: IPC: 0.21 +# CHECK-NEXT: uOps Per Cycle: 1.66 +# CHECK-NEXT: IPC: 0.99 # CHECK-NEXT: Block RThroughput: 2.5 # CHECK: Instruction Info: @@ -57,18 +57,18 @@ vpinsrb $1, %eax, %xmm0, %xmm0 # CHECK-NEXT: - - - - - 1.00 - - - - - 1.00 - - vpinsrb $1, %eax, %xmm0, %xmm0 # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 0123456789 -# CHECK-NEXT: Index 0123456789 0123456789 012345 +# CHECK-NEXT: 01234567 +# CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeER . . . . . . . . . addl %eax, %eax -# CHECK-NEXT: [0,1] .DeeeeeeeER . . . . . . . vpinsrb $0, %eax, %xmm0, %xmm0 -# CHECK-NEXT: [0,2] . D======eeeeeeeER . . . . . . vpinsrb $1, %eax, %xmm0, %xmm0 -# CHECK-NEXT: [1,0] . DeE-----------R . . . . . . addl %eax, %eax -# CHECK-NEXT: [1,1] . D===========eeeeeeeER. . . . . vpinsrb $0, %eax, %xmm0, %xmm0 -# CHECK-NEXT: [1,2] . D=================eeeeeeeER . . . vpinsrb $1, %eax, %xmm0, %xmm0 -# CHECK-NEXT: [2,0] . .DeE----------------------R . . . addl %eax, %eax -# CHECK-NEXT: [2,1] . . D======================eeeeeeeER . . vpinsrb $0, %eax, %xmm0, %xmm0 -# CHECK-NEXT: [2,2] . . D============================eeeeeeeER vpinsrb $1, %eax, %xmm0, %xmm0 +# CHECK: [0,0] DeER . . . . addl %eax, %eax +# CHECK-NEXT: [0,1] .D======eER . . vpinsrb $0, %eax, %xmm0, %xmm0 +# CHECK-NEXT: [0,2] . D======eER . . vpinsrb $1, %eax, %xmm0, %xmm0 +# CHECK-NEXT: [1,0] . DeE-----R . . addl %eax, %eax +# CHECK-NEXT: [1,1] . D======eER . . vpinsrb $0, %eax, %xmm0, %xmm0 +# CHECK-NEXT: [1,2] . D======eER. . vpinsrb $1, %eax, %xmm0, %xmm0 +# CHECK-NEXT: [2,0] . .DeE-----R. . addl %eax, %eax +# CHECK-NEXT: [2,1] . . D======eER. 
vpinsrb $0, %eax, %xmm0, %xmm0 +# CHECK-NEXT: [2,2] . . D======eER vpinsrb $1, %eax, %xmm0, %xmm0 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -77,6 +77,6 @@ vpinsrb $1, %eax, %xmm0, %xmm0 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 3 1.0 1.0 11.0 addl %eax, %eax -# CHECK-NEXT: 1. 3 12.0 0.0 0.0 vpinsrb $0, %eax, %xmm0, %xmm0 -# CHECK-NEXT: 2. 3 18.0 0.0 0.0 vpinsrb $1, %eax, %xmm0, %xmm0 +# CHECK-NEXT: 0. 3 1.0 1.0 3.3 addl %eax, %eax +# CHECK-NEXT: 1. 3 7.0 0.0 0.0 vpinsrb $0, %eax, %xmm0, %xmm0 +# CHECK-NEXT: 2. 3 7.0 0.0 0.0 vpinsrb $1, %eax, %xmm0, %xmm0 diff --git a/tools/llvm-mca/Views/InstructionInfoView.cpp b/tools/llvm-mca/Views/InstructionInfoView.cpp index 60b8b1f5141..1fbffa3e5b6 100644 --- a/tools/llvm-mca/Views/InstructionInfoView.cpp +++ b/tools/llvm-mca/Views/InstructionInfoView.cpp @@ -43,6 +43,9 @@ void InstructionInfoView::printView(raw_ostream &OS) const { const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID); unsigned NumMicroOpcodes = SCDesc.NumMicroOps; unsigned Latency = MCSchedModel::computeInstrLatency(STI, SCDesc); + // Add extra latency due to delays in the forwarding data paths. + Latency += MCSchedModel::getForwardingDelayCycles( + STI.getReadAdvanceEntries(SCDesc)); Optional RThroughput = MCSchedModel::getReciprocalThroughput(STI, SCDesc); -- 2.50.1