From 4ce34f143424e8f8ec44879dcb4bc9aa87e7504b Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Mon, 4 Feb 2019 09:12:17 +0000
Subject: [PATCH] [llvm-exegesis] Throughput support in analysis mode

Summary:
D57000 / [[ https://bugs.llvm.org/show_bug.cgi?id=37698 | PR37698 ]] added support for measuring of the inverse throughput.
But the support for the analysis was not added.
This attempts to fix that. (analysis done o bdver2 / piledriver)

First, small-scale experiment:
```
$ ./bin/llvm-exegesis -num-repetitions=10000 -mode=inverse_throughput -opcode-name=BSF64rr
Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-d0acdd.o
---
mode:            inverse_throughput
key:
  instructions:
    - 'BSF64rr RAX RDX'
  config:          ''
  register_initial_values:
    - 'RDX=0x0'
cpu_name:        bdver2
llvm_triple:     x86_64-unknown-linux-gnu
num_repetitions: 10000
measurements:
  - { key: inverse_throughput, value: 3.0278, per_snippet_value: 3.0278 }
error:           ''
info:            instruction has no tied variables picking Uses different from defs
assembled_snippet: 48BA0000000000000000480FBCC2480FBCC2480FBCC2480FBCC2480FBCC2480FBCC2480FBCC2480FBCC2480FBCC2480FBCC2480FBCC2480FBCC2480FBCC2480FBCC2480FBCC2480FBCC2C3
...
```
If we plug `bsfq	%r12, %r10` into llvm-mca:
https://godbolt.org/z/ZtOyhJ
```
Dispatch Width:    4
uOps Per Cycle:    3.00
IPC:               0.50
Block RThroughput: 2.0
```
So RThroughput mismatch exists.

Now, let's upscale and analyse:
{F8207148}
`$ ./bin/llvm-exegesis -mode=analysis -analysis-epsilon=1.0 -benchmarks-file=/tmp/benchmarks-inverse_throughput.yaml -analysis-inconsistencies-output-file=/tmp/clusters.html`:
{F8207172}
{F8207197}
And if we now look at https://www.agner.org/optimize/instruction_tables.pdf,
`Reciprocal throughput` for `BSF r,r` is listed as `3`.
Yay?

Reviewers: courbet, gchatelet

Reviewed By: courbet

Subscribers: tschuett, RKSimon, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D57647

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@353023 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/Analysis.cpp | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)
diff --git a/tools/llvm-exegesis/lib/Analysis.cpp b/tools/llvm-exegesis/lib/Analysis.cpp
index 47afa8c4fe2..158fbfe3b0a 100644
--- a/tools/llvm-exegesis/lib/Analysis.cpp
+++ b/tools/llvm-exegesis/lib/Analysis.cpp
@@ -316,6 +316,7 @@ void Analysis::printSchedClassClustersHtml(
         writeLatencySnippetHtml(OS, Point.Key.Instructions, *InstrInfo_);
         break;
       case InstructionBenchmark::Uops:
+      case InstructionBenchmark::InverseThroughput:
         writeUopsSnippetHtml(OS, Point.Key.Instructions, *InstrInfo_);
         break;
       default:
@@ -507,9 +508,14 @@ bool Analysis::SchedClassCluster::measurementsMatch(
       }
       ClusterCenterPoint[I].PerInstructionValue = Representative[I].avg();
     }
+  } else if (Mode == InstructionBenchmark::InverseThroughput) {
+    for (int I = 0, E = Representative.size(); I < E; ++I) {
+      SchedClassPoint[I].PerInstructionValue =
+          MCSchedModel::getReciprocalThroughput(STI, *RSC.SCDesc);
+      ClusterCenterPoint[I].PerInstructionValue = Representative[I].min();
+    }
   } else {
-    llvm::errs() << "unimplemented measurement matching for mode " << Mode
-                 << "\n";
+    llvm_unreachable("unimplemented measurement matching mode");
     return false;
   }
   return Clustering.isNeighbour(ClusterCenterPoint, SchedClassPoint);
@@ -519,9 +525,9 @@ void Analysis::printSchedClassDescHtml(const ResolvedSchedClass &RSC,
                                        llvm::raw_ostream &OS) const {
   OS << "<table class=\"sched-class-desc\">";
   OS << "<tr><th>Valid</th><th>Variant</th><th>NumMicroOps</th><th>Latency</"
-        "th><th>WriteProcRes</th><th title=\"This is the idealized unit "
-        "resource (port) pressure assuming ideal distribution\">Idealized "
-        "Resource Pressure</th></tr>";
+        "th><th>RThroughput</th><th>WriteProcRes</th><th title=\"This is the "
+        "idealized unit resource (port) pressure assuming ideal "
+        "distribution\">Idealized Resource Pressure</th></tr>";
   if (RSC.SCDesc->isValid()) {
     const auto &SM = SubtargetInfo_->getSchedModel();
     OS << "<tr><td>&#10004;</td>";
@@ -540,6 +546,12 @@ void Analysis::printSchedClassDescHtml(const ResolvedSchedClass &RSC,
       OS << "</li>";
     }
     OS << "</ul></td>";
+    // inverse throughput.
+    OS << "<td>";
+    writeMeasurementValue<kEscapeHtml>(
+        OS,
+        MCSchedModel::getReciprocalThroughput(*SubtargetInfo_, *RSC.SCDesc));
+    OS << "</td>";
     // WriteProcRes.
     OS << "<td><ul>";
     for (const auto &WPR : RSC.NonRedundantWriteProcRes) {
-- 
2.50.1


Valid	Variant	NumMicroOps	Latency	WriteProcRes	Idealized " - "Resource Pressure
RThroughput	WriteProcRes	Idealized Resource Pressure
✔	"; + writeMeasurementValue( + OS, + MCSchedModel::getReciprocalThroughput(SubtargetInfo_, RSC.SCDesc)); + OS << "	"; for (const auto &WPR : RSC.NonRedundantWriteProcRes) { -- 2.50.1