From 4ce34f143424e8f8ec44879dcb4bc9aa87e7504b Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Mon, 4 Feb 2019 09:12:17 +0000 Subject: [PATCH] [llvm-exegesis] Throughput support in analysis mode Summary: D57000 / [[ https://bugs.llvm.org/show_bug.cgi?id=37698 | PR37698 ]] added support for measuring the inverse throughput. But the support for the analysis was not added. This attempts to fix that. (analysis done on bdver2 / piledriver) First, small-scale experiment: ``` $ ./bin/llvm-exegesis -num-repetitions=10000 -mode=inverse_throughput -opcode-name=BSF64rr Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-d0acdd.o --- mode: inverse_throughput key: instructions: - 'BSF64rr RAX RDX' config: '' register_initial_values: - 'RDX=0x0' cpu_name: bdver2 llvm_triple: x86_64-unknown-linux-gnu num_repetitions: 10000 measurements: - { key: inverse_throughput, value: 3.0278, per_snippet_value: 3.0278 } error: '' info: instruction has no tied variables picking Uses different from defs assembled_snippet: 48BA0000000000000000480FBCC2480FBCC2480FBCC2480FBCC2480FBCC2480FBCC2480FBCC2480FBCC2480FBCC2480FBCC2480FBCC2480FBCC2480FBCC2480FBCC2480FBCC2480FBCC2C3 ... ``` If we plug `bsfq %r12, %r10` into llvm-mca: https://godbolt.org/z/ZtOyhJ ``` Dispatch Width: 4 uOps Per Cycle: 3.00 IPC: 0.50 Block RThroughput: 2.0 ``` So an RThroughput mismatch exists. Now, let's upscale and analyse: {F8207148} `$ ./bin/llvm-exegesis -mode=analysis -analysis-epsilon=1.0 -benchmarks-file=/tmp/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters.html`: {F8207172} {F8207197} And if we now look at https://www.agner.org/optimize/instruction_tables.pdf, `Reciprocal throughput` for `BSF r,r` is listed as `3`. Yay? 
Reviewers: courbet, gchatelet Reviewed By: courbet Subscribers: tschuett, RKSimon, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D57647 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@353023 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-exegesis/lib/Analysis.cpp | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/tools/llvm-exegesis/lib/Analysis.cpp b/tools/llvm-exegesis/lib/Analysis.cpp index 47afa8c4fe2..158fbfe3b0a 100644 --- a/tools/llvm-exegesis/lib/Analysis.cpp +++ b/tools/llvm-exegesis/lib/Analysis.cpp @@ -316,6 +316,7 @@ void Analysis::printSchedClassClustersHtml( writeLatencySnippetHtml(OS, Point.Key.Instructions, *InstrInfo_); break; case InstructionBenchmark::Uops: + case InstructionBenchmark::InverseThroughput: writeUopsSnippetHtml(OS, Point.Key.Instructions, *InstrInfo_); break; default: @@ -507,9 +508,14 @@ bool Analysis::SchedClassCluster::measurementsMatch( } ClusterCenterPoint[I].PerInstructionValue = Representative[I].avg(); } + } else if (Mode == InstructionBenchmark::InverseThroughput) { + for (int I = 0, E = Representative.size(); I < E; ++I) { + SchedClassPoint[I].PerInstructionValue = + MCSchedModel::getReciprocalThroughput(STI, *RSC.SCDesc); + ClusterCenterPoint[I].PerInstructionValue = Representative[I].min(); + } } else { - llvm::errs() << "unimplemented measurement matching for mode " << Mode - << "\n"; + llvm_unreachable("unimplemented measurement matching mode"); return false; } return Clustering.isNeighbour(ClusterCenterPoint, SchedClassPoint); @@ -519,9 +525,9 @@ void Analysis::printSchedClassDescHtml(const ResolvedSchedClass &RSC, llvm::raw_ostream &OS) const { OS << ""; OS << ""; + "th>"; if (RSC.SCDesc->isValid()) { const auto &SM = SubtargetInfo_->getSchedModel(); OS << ""; @@ -540,6 +546,12 @@ void Analysis::printSchedClassDescHtml(const ResolvedSchedClass &RSC, OS << ""; } OS << ""; + // inverse throughput. + OS << ""; // WriteProcRes. OS << "
ValidVariantNumMicroOpsLatencyWriteProcResIdealized " - "Resource Pressure
RThroughputWriteProcResIdealized Resource Pressure
"; + writeMeasurementValue( + OS, + MCSchedModel::getReciprocalThroughput(*SubtargetInfo_, *RSC.SCDesc)); + OS << "
    "; for (const auto &WPR : RSC.NonRedundantWriteProcRes) { -- 2.40.0