From 9fa92ef02a4d8e43d47adeca57f4e73a1bff184d Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Thu, 19 Sep 2019 16:05:11 +0000
Subject: [PATCH] [MCA] Improved cost computation for loop carried dependencies
 in the bottleneck analysis.

This patch introduces a cut-off threshold for dependency edge frequences with
the goal of simplifying the critical sequence computation.  This patch also
removes the cost normalization for loop carried dependencies.  We didn't really
need to artificially amplify the cost of loop-carried dependencies since it is
already computed as the integral over time of the delay (in cycle).

In the absence of backend stalls there is no need for computing a critical
sequence. With this patch we early exit from the critical sequence computation
if no bottleneck was reported during the simulation.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@372337 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../X86/SkylakeClient/bottleneck-analysis.s   | 12 ++----
 tools/llvm-mca/Views/BottleneckAnalysis.cpp   | 40 ++++++++++++++++---
 tools/llvm-mca/Views/BottleneckAnalysis.h     |  8 ++--
 3 files changed, 43 insertions(+), 17 deletions(-)
diff --git a/test/tools/llvm-mca/X86/SkylakeClient/bottleneck-analysis.s b/test/tools/llvm-mca/X86/SkylakeClient/bottleneck-analysis.s
index 1f5e2dbf469..98c5376ff82 100644
--- a/test/tools/llvm-mca/X86/SkylakeClient/bottleneck-analysis.s
+++ b/test/tools/llvm-mca/X86/SkylakeClient/bottleneck-analysis.s
@@ -67,18 +67,14 @@
 # CHECK-NEXT:  |      11.   vmovaps	%xmm7, %xmm4
 # CHECK-NEXT:  +----> 12.   vfmadd213ps	%xmm1, %xmm14, %xmm2      ## REGISTER dependency:  %xmm1
 # CHECK-NEXT:  |      13.   vmovaps	%xmm6, %xmm1
-# CHECK-NEXT:  |      14.   vfmadd213ps	%xmm2, %xmm15, %xmm3
-# CHECK-NEXT:  |      15.   vpermilps	$170, %xmm3, %xmm0
+# CHECK-NEXT:  +----> 14.   vfmadd213ps	%xmm2, %xmm15, %xmm3      ## REGISTER dependency:  %xmm2
+# CHECK-NEXT:  +----> 15.   vpermilps	$170, %xmm3, %xmm0                ## REGISTER dependency:  %xmm3
 # CHECK-NEXT:  |      16.   vmovups	%xmm3, (%rdx,%rax)
 # CHECK-NEXT:  |      17.   vpermilps	$255, %xmm3, %xmm2
 # CHECK-NEXT:  |      18.   addq	$16, %rax
 # CHECK-NEXT:  |      19.   decl	%ecx
-# CHECK-NEXT:  |      20.   vmovaps	%xmm0, %xmm3
-# CHECK-NEXT:  |      21.   jne	.LBB0_4
-# CHECK-NEXT:  |
-# CHECK-NEXT:  |    < loop carried >
-# CHECK-NEXT:  |
-# CHECK-NEXT:  +----> 2.    vmulps	-24(%rsp), %xmm7, %xmm8           ## RESOURCE interference:  SKLPort1 [ probability: 45% ]
+# CHECK-NEXT:  +----> 20.   vmovaps	%xmm0, %xmm3                      ## REGISTER dependency:  %xmm0
+# CHECK-NEXT:         21.   jne	.LBB0_4
 
 # CHECK:      Instruction Info:
 # CHECK-NEXT: [1]: #uOps
diff --git a/tools/llvm-mca/Views/BottleneckAnalysis.cpp b/tools/llvm-mca/Views/BottleneckAnalysis.cpp
index 560c6c6e8a3..feff0cd6d52 100644
--- a/tools/llvm-mca/Views/BottleneckAnalysis.cpp
+++ b/tools/llvm-mca/Views/BottleneckAnalysis.cpp
@@ -165,10 +165,33 @@ void DependencyGraph::dumpDependencyEdge(raw_ostream &OS,
            "Unsupported dependency type!");
     OS << " - RESOURCE MASK: " << DE.ResourceOrRegID;
   }
-  OS << " - CYCLES: " << DE.Cost << '\n';
+  OS << " - COST: " << DE.Cost << '\n';
 }
 #endif // NDEBUG
 
+void DependencyGraph::pruneEdges(unsigned Iterations) {
+  for (DGNode &N : Nodes) {
+    unsigned NumPruned = 0;
+    const unsigned Size = N.OutgoingEdges.size();
+    // Use a cut-off threshold to prune edges with a low frequency.
+    for (unsigned I = 0, E = Size; I < E; ++I) {
+      DependencyEdge &Edge = N.OutgoingEdges[I];
+      if (Edge.Frequency == Iterations)
+        continue;
+      double Factor = (double)Edge.Frequency / Iterations;
+      if (0.10 < Factor)
+        continue;
+      Nodes[Edge.ToIID].NumPredecessors--;
+      std::swap(Edge, N.OutgoingEdges[E - 1]);
+      --E;
+      ++NumPruned;
+    }
+
+    if (NumPruned)
+      N.OutgoingEdges.resize(Size - NumPruned);
+  }
+}
+
 void DependencyGraph::initializeRootSet(
     SmallVectorImpl<unsigned> &RootSet) const {
   for (unsigned I = 0, E = Nodes.size(); I < E; ++I) {
@@ -179,7 +202,7 @@ void DependencyGraph::initializeRootSet(
 }
 
 void DependencyGraph::propagateThroughEdges(
-    SmallVectorImpl<unsigned> &RootSet) {
+    SmallVectorImpl<unsigned> &RootSet, unsigned Iterations) {
   SmallVector<unsigned, 8> ToVisit;
 
   // A critical sequence is computed as the longest path from a node of the
@@ -189,6 +212,10 @@ void DependencyGraph::propagateThroughEdges(
   // Each node of the graph starts with an initial default cost of zero.  The
   // cost of a node is a measure of criticality: the higher the cost, the bigger
   // is the performance impact.
+  // For register and memory dependencies, the cost is a function of the write
+  // latency as well as the actual delay (in cycles) caused to users.
+  // For processor resource dependencies, the cost is a function of the resource
+  // pressure. Resource interferences with low frequency values are ignored.
   //
   // This algorithm is very similar to a (reverse) Dijkstra.  Every iteration of
   // the inner loop selects (i.e. visits) a node N from a set of `unvisited
@@ -277,6 +304,10 @@ static void printInstruction(formatted_raw_ostream &FOS,
 }
 
 void BottleneckAnalysis::printCriticalSequence(raw_ostream &OS) const {
+  // Early exit if no bottlenecks were found during the simulation.
+  if (!SeenStallCycles || !BPI.PressureIncreaseCycles)
+    return;
+
   SmallVector<const DependencyEdge *, 16> Seq;
   DG.getCriticalSequence(Seq);
   if (Seq.empty())
@@ -432,7 +463,6 @@ void BottleneckAnalysis::addRegisterDep(unsigned From, unsigned To,
   bool IsLoopCarried = From >= To;
   unsigned SourceSize = Source.size();
   if (IsLoopCarried) {
-    Cost *= Iterations / 2;
     DG.addRegisterDep(From, To + SourceSize, RegID, Cost);
     DG.addRegisterDep(From + SourceSize, To + (SourceSize * 2), RegID, Cost);
     return;
@@ -445,7 +475,6 @@ void BottleneckAnalysis::addMemoryDep(unsigned From, unsigned To,
   bool IsLoopCarried = From >= To;
   unsigned SourceSize = Source.size();
   if (IsLoopCarried) {
-    Cost *= Iterations / 2;
     DG.addMemoryDep(From, To + SourceSize, Cost);
     DG.addMemoryDep(From + SourceSize, To + (SourceSize * 2), Cost);
     return;
@@ -458,7 +487,6 @@ void BottleneckAnalysis::addResourceDep(unsigned From, unsigned To,
   bool IsLoopCarried = From >= To;
   unsigned SourceSize = Source.size();
   if (IsLoopCarried) {
-    Cost *= Iterations / 2;
     DG.addResourceDep(From, To + SourceSize, Mask, Cost);
     DG.addResourceDep(From + SourceSize, To + (SourceSize * 2), Mask, Cost);
     return;
@@ -514,7 +542,7 @@ void BottleneckAnalysis::onEvent(const HWInstructionEvent &Event) {
 
   // Check if this is the last simulated instruction.
   if (IID == ((Iterations * Source.size()) - 1))
-    DG.finalizeGraph();
+    DG.finalizeGraph(Iterations);
 }
 
 void BottleneckAnalysis::onEvent(const HWPressureEvent &Event) {
diff --git a/tools/llvm-mca/Views/BottleneckAnalysis.h b/tools/llvm-mca/Views/BottleneckAnalysis.h
index 7564b1a4820..9e3bd5978f0 100644
--- a/tools/llvm-mca/Views/BottleneckAnalysis.h
+++ b/tools/llvm-mca/Views/BottleneckAnalysis.h
@@ -236,8 +236,9 @@ class DependencyGraph {
   void addDependency(unsigned From, unsigned To,
                      DependencyEdge::Dependency &&DE);
 
+  void pruneEdges(unsigned Iterations);
   void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const;
-  void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet);
+  void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet, unsigned Iterations);
 
 #ifndef NDEBUG
   void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE,
@@ -263,10 +264,11 @@ public:
 
   // Called by the bottleneck analysis at the end of simulation to propagate
   // costs through the edges of the graph, and compute a critical path.
-  void finalizeGraph() {
+  void finalizeGraph(unsigned Iterations) {
     SmallVector<unsigned, 16> RootSet;
+    pruneEdges(Iterations);
     initializeRootSet(RootSet);
-    propagateThroughEdges(RootSet);
+    propagateThroughEdges(RootSet, Iterations);
   }
 
   // Returns a sequence of edges representing the critical sequence based on the
-- 
2.40.0