granicus.if.org Git - llvm/commitdiff
[MCA][LSUnit] Track loads and stores until retirement.
author Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Tue, 8 Oct 2019 10:46:01 +0000 (10:46 +0000)
committer Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Tue, 8 Oct 2019 10:46:01 +0000 (10:46 +0000)
Before this patch, loads and stores were only tracked by their corresponding
queues in the LSUnit from dispatch until execute stage. In practice we should be
more conservative and assume that memory opcodes leave their queues at
retirement stage.

Basically, loads should leave the load queue only when they have completed and
delivered their data. We conservatively assume that a load is completed when it
is retired. Stores should be tracked by the store queue from dispatch until
retirement. In practice, stores can only leave the store queue if their data can
be written to the data cache.

This is mostly a mechanical change. With this patch, the retire stage notifies
the LSUnit when a memory instruction is retired. That triggers the release
of LDQ/STQ entries.  The only visible change is in memory tests for the bdver2
model. That is because bdver2 is the only model that defines the load/store
queue size.

This patch partially addresses PR39830.

Differential Revision: https://reviews.llvm.org/D68266

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@374034 91177308-0d34-0410-b5e6-96231b3b80d8

include/llvm/MCA/HardwareUnits/LSUnit.h
include/llvm/MCA/Stages/RetireStage.h
lib/MCA/Context.cpp
lib/MCA/HardwareUnits/LSUnit.cpp
lib/MCA/Stages/RetireStage.cpp
test/tools/llvm-mca/X86/BdVer2/load-store-throughput.s
test/tools/llvm-mca/X86/BdVer2/load-throughput.s
test/tools/llvm-mca/X86/BdVer2/store-throughput.s

index 0dd5d3322aa18130921f7c314dfaf8829c68be18..34903794db4a6053e324ec10fa113ee88e8f011a 100644 (file)
@@ -291,9 +291,14 @@ public:
     return NextGroupID++;
   }
 
-  // Instruction executed event handlers.
   virtual void onInstructionExecuted(const InstRef &IR);
 
+  // Loads are tracked by the LDQ (load queue) from dispatch until completion.
+  // Stores are tracked by the STQ (store queue) from dispatch until commitment.
+  // By default we conservatively assume that the LDQ receives a load at
+  // dispatch. Loads leave the LDQ at retirement stage.
+  virtual void onInstructionRetired(const InstRef &IR);
+
   virtual void onInstructionIssued(const InstRef &IR) {
     unsigned GroupID = IR.getInstruction()->getLSUTokenID();
     Groups[GroupID]->onInstructionIssued(IR);
@@ -438,9 +443,6 @@ public:
   /// 6. A store has to wait until an older store barrier is fully executed.
   unsigned dispatch(const InstRef &IR) override;
 
-  // FIXME: For simplicity, we optimistically assume a similar behavior for
-  // store instructions. In practice, store operations don't tend to leave the
-  // store queue until they reach the 'Retired' stage (See PR39830).
   void onInstructionExecuted(const InstRef &IR) override;
 };
 
index 08c216ac7bf43408ded5f66837bbcd2de4a3cda7..f4713688d25f668134fe62655457a2238a3ce9ab 100644 (file)
@@ -16,6 +16,7 @@
 #ifndef LLVM_MCA_RETIRE_STAGE_H
 #define LLVM_MCA_RETIRE_STAGE_H
 
+#include "llvm/MCA/HardwareUnits/LSUnit.h"
 #include "llvm/MCA/HardwareUnits/RegisterFile.h"
 #include "llvm/MCA/HardwareUnits/RetireControlUnit.h"
 #include "llvm/MCA/Stages/Stage.h"
@@ -27,13 +28,14 @@ class RetireStage final : public Stage {
   // Owner will go away when we move listeners/eventing to the stages.
   RetireControlUnit &RCU;
   RegisterFile &PRF;
+  LSUnitBase &LSU;
 
   RetireStage(const RetireStage &Other) = delete;
   RetireStage &operator=(const RetireStage &Other) = delete;
 
 public:
-  RetireStage(RetireControlUnit &R, RegisterFile &F)
-      : Stage(), RCU(R), PRF(F) {}
+  RetireStage(RetireControlUnit &R, RegisterFile &F, LSUnitBase &LS)
+      : Stage(), RCU(R), PRF(F), LSU(LS) {}
 
   bool hasWorkToComplete() const override { return !RCU.isEmpty(); }
   Error cycleStart() override;
index 546c82c6dd9873136adf2eb7efe4b36e647cee9a..0160e1f9f7874534630e2c4e9b3600a5bccd0e50 100644 (file)
@@ -44,7 +44,7 @@ Context::createDefaultPipeline(const PipelineOptions &Opts, SourceMgr &SrcMgr) {
                                                    *RCU, *PRF);
   auto Execute =
       std::make_unique<ExecuteStage>(*HWS, Opts.EnableBottleneckAnalysis);
-  auto Retire = std::make_unique<RetireStage>(*RCU, *PRF);
+  auto Retire = std::make_unique<RetireStage>(*RCU, *PRF, *LSU);
 
   // Pass the ownership of all the hardware units to this Context.
   addHardwareUnit(std::move(RCU));
index 973bb908e41a687dd58d149ca3b629efea635cb5..0ee084c7ce1a927c27925880d10e7d9cfbe8aab5 100644 (file)
@@ -160,17 +160,19 @@ LSUnit::Status LSUnit::isAvailable(const InstRef &IR) const {
 }
 
 void LSUnitBase::onInstructionExecuted(const InstRef &IR) {
-  const InstrDesc &Desc = IR.getInstruction()->getDesc();
-  bool IsALoad = Desc.MayLoad;
-  bool IsAStore = Desc.MayStore;
-  assert((IsALoad || IsAStore) && "Expected a memory operation!");
-
   unsigned GroupID = IR.getInstruction()->getLSUTokenID();
   auto It = Groups.find(GroupID);
+  assert(It != Groups.end() && "Instruction not dispatched to the LS unit");
   It->second->onInstructionExecuted();
-  if (It->second->isExecuted()) {
+  if (It->second->isExecuted())
     Groups.erase(It);
-  }
+}
+
+void LSUnitBase::onInstructionRetired(const InstRef &IR) {
+  const InstrDesc &Desc = IR.getInstruction()->getDesc();
+  bool IsALoad = Desc.MayLoad;
+  bool IsAStore = Desc.MayStore;
+  assert((IsALoad || IsAStore) && "Expected a memory operation!");
 
   if (IsALoad) {
     releaseLQSlot();
index 735444525241aecd4321065ef71e42fb04e2425c..f792af748bce9cf4ea2cbc55e84c3b0b0a39a18a 100644 (file)
@@ -52,6 +52,10 @@ void RetireStage::notifyInstructionRetired(const InstRef &IR) const {
   llvm::SmallVector<unsigned, 4> FreedRegs(PRF.getNumRegisterFiles());
   const Instruction &Inst = *IR.getInstruction();
 
+  // Release the load/store queue entries.
+  if (Inst.isMemOp())
+    LSU.onInstructionRetired(IR);
+
   for (const WriteState &WS : Inst.getDefs())
     PRF.removeRegisterWrite(WS, FreedRegs);
   notifyEvent<HWInstructionEvent>(HWInstructionRetiredEvent(IR, FreedRegs));
index d9d6e9e18e530f8b0ccaa600ba8bc9779fcf17f0..4f53cce27e235b60f8f4edcfa4ecf212ae5e0062 100644 (file)
@@ -507,12 +507,12 @@ movaps %xmm3, (%rbx)
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      593
+# CHECK-NEXT: Total Cycles:      554
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    4
-# CHECK-NEXT: uOps Per Cycle:    0.67
-# CHECK-NEXT: IPC:               0.67
+# CHECK-NEXT: uOps Per Cycle:    0.72
+# CHECK-NEXT: IPC:               0.72
 # CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Instruction Info:
@@ -532,24 +532,24 @@ movaps %xmm3, (%rbx)
 # CHECK:      Dynamic Dispatch Stall Cycles:
 # CHECK-NEXT: RAT     - Register unavailable:                      0
 # CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
-# CHECK-NEXT: SCHEDQ  - Scheduler full:                            187  (31.5%)
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            55  (9.9%)
 # CHECK-NEXT: LQ      - Load queue full:                           0
-# CHECK-NEXT: SQ      - Store queue full:                          342  (57.7%)
+# CHECK-NEXT: SQ      - Store queue full:                          437  (78.9%)
 # CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
 
 # CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT:  0,              403  (68.0%)
-# CHECK-NEXT:  1,              90  (15.2%)
-# CHECK-NEXT:  2,              2  (0.3%)
-# CHECK-NEXT:  3,              86  (14.5%)
-# CHECK-NEXT:  4,              12  (2.0%)
+# CHECK-NEXT:  0,              365  (65.9%)
+# CHECK-NEXT:  1,              88  (15.9%)
+# CHECK-NEXT:  2,              3  (0.5%)
+# CHECK-NEXT:  3,              86  (15.5%)
+# CHECK-NEXT:  4,              12  (2.2%)
 
 # CHECK:      Schedulers - number of cycles where we saw N micro opcodes issued:
 # CHECK-NEXT: [# issued], [# cycles]
-# CHECK-NEXT:  0,          292  (49.2%)
-# CHECK-NEXT:  1,          202  (34.1%)
-# CHECK-NEXT:  2,          99  (16.7%)
+# CHECK-NEXT:  0,          253  (45.7%)
+# CHECK-NEXT:  1,          202  (36.5%)
+# CHECK-NEXT:  2,          99  (17.9%)
 
 # CHECK:      Scheduler's queue usage:
 # CHECK-NEXT: [1] Resource name.
@@ -595,8 +595,8 @@ movaps %xmm3, (%rbx)
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16.0] [16.1] [17]   [18]   Instructions:
 # CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     3.00    -      -      -      -     1.00   movd    %mm0, (%rax)
-# CHECK-NEXT: 0.36   2.64    -      -      -      -      -      -      -      -      -     3.00    -      -      -     1.00    -      -      -      -     3.00    -      -     movd    (%rcx), %mm1
-# CHECK-NEXT: 2.64   0.36    -      -      -      -      -      -      -      -     3.00    -      -      -     1.00    -      -      -      -     3.00    -      -      -     movd    (%rdx), %mm2
+# CHECK-NEXT: 1.53   1.47    -      -      -      -      -      -      -      -      -     3.00    -      -      -     1.00    -      -      -      -     3.00    -      -     movd    (%rcx), %mm1
+# CHECK-NEXT: 1.47   1.53    -      -      -      -      -      -      -      -     3.00    -      -      -     1.00    -      -      -      -     3.00    -      -      -     movd    (%rdx), %mm2
 # CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     3.00    -      -      -      -      -     1.00   movd    %mm3, (%rbx)
 
 # CHECK:      Timeline view:
index 6c9f15905c246bf96b7cfad4a575d7d85bd85fb8..dfb45af19f3bc74e508e8f78485a8804a5cf7b3d 100644 (file)
@@ -80,7 +80,7 @@ vmovaps (%rbx), %ymm3
 # CHECK-NEXT: RAT     - Register unavailable:                      0
 # CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
 # CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
-# CHECK-NEXT: LQ      - Load queue full:                           353  (86.9%)
+# CHECK-NEXT: LQ      - Load queue full:                           354  (87.2%)
 # CHECK-NEXT: SQ      - Store queue full:                          0
 # CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
 
@@ -102,9 +102,9 @@ vmovaps (%rbx), %ymm3
 # CHECK-NEXT: [4] Total number of buffer entries.
 
 # CHECK:       [1]            [2]        [3]        [4]
-# CHECK-NEXT: PdEX             32         36         40
+# CHECK-NEXT: PdEX             31         34         40
 # CHECK-NEXT: PdFPU            0          0          64
-# CHECK-NEXT: PdLoad           37         40         40
+# CHECK-NEXT: PdLoad           36         40         40
 # CHECK-NEXT: PdStore          0          0          24
 
 # CHECK:      Resources:
@@ -193,7 +193,7 @@ vmovaps (%rbx), %ymm3
 # CHECK-NEXT: RAT     - Register unavailable:                      0
 # CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
 # CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
-# CHECK-NEXT: LQ      - Load queue full:                           353  (86.9%)
+# CHECK-NEXT: LQ      - Load queue full:                           354  (87.2%)
 # CHECK-NEXT: SQ      - Store queue full:                          0
 # CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
 
@@ -215,9 +215,9 @@ vmovaps (%rbx), %ymm3
 # CHECK-NEXT: [4] Total number of buffer entries.
 
 # CHECK:       [1]            [2]        [3]        [4]
-# CHECK-NEXT: PdEX             32         36         40
+# CHECK-NEXT: PdEX             31         34         40
 # CHECK-NEXT: PdFPU            0          0          64
-# CHECK-NEXT: PdLoad           37         40         40
+# CHECK-NEXT: PdLoad           36         40         40
 # CHECK-NEXT: PdStore          0          0          24
 
 # CHECK:      Resources:
@@ -306,7 +306,7 @@ vmovaps (%rbx), %ymm3
 # CHECK-NEXT: RAT     - Register unavailable:                      0
 # CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
 # CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
-# CHECK-NEXT: LQ      - Load queue full:                           353  (86.9%)
+# CHECK-NEXT: LQ      - Load queue full:                           354  (87.2%)
 # CHECK-NEXT: SQ      - Store queue full:                          0
 # CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
 
@@ -328,9 +328,9 @@ vmovaps (%rbx), %ymm3
 # CHECK-NEXT: [4] Total number of buffer entries.
 
 # CHECK:       [1]            [2]        [3]        [4]
-# CHECK-NEXT: PdEX             32         36         40
+# CHECK-NEXT: PdEX             31         34         40
 # CHECK-NEXT: PdFPU            0          0          64
-# CHECK-NEXT: PdLoad           37         40         40
+# CHECK-NEXT: PdLoad           36         40         40
 # CHECK-NEXT: PdStore          0          0          24
 
 # CHECK:      Resources:
@@ -419,7 +419,7 @@ vmovaps (%rbx), %ymm3
 # CHECK-NEXT: RAT     - Register unavailable:                      0
 # CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
 # CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
-# CHECK-NEXT: LQ      - Load queue full:                           353  (86.9%)
+# CHECK-NEXT: LQ      - Load queue full:                           354  (87.2%)
 # CHECK-NEXT: SQ      - Store queue full:                          0
 # CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
 
@@ -441,9 +441,9 @@ vmovaps (%rbx), %ymm3
 # CHECK-NEXT: [4] Total number of buffer entries.
 
 # CHECK:       [1]            [2]        [3]        [4]
-# CHECK-NEXT: PdEX             32         36         40
+# CHECK-NEXT: PdEX             31         34         40
 # CHECK-NEXT: PdFPU            0          0          64
-# CHECK-NEXT: PdLoad           37         40         40
+# CHECK-NEXT: PdLoad           36         40         40
 # CHECK-NEXT: PdStore          0          0          24
 
 # CHECK:      Resources:
@@ -532,7 +532,7 @@ vmovaps (%rbx), %ymm3
 # CHECK-NEXT: RAT     - Register unavailable:                      0
 # CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
 # CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
-# CHECK-NEXT: LQ      - Load queue full:                           532  (87.9%)
+# CHECK-NEXT: LQ      - Load queue full:                           533  (88.1%)
 # CHECK-NEXT: SQ      - Store queue full:                          0
 # CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
 
@@ -554,8 +554,8 @@ vmovaps (%rbx), %ymm3
 # CHECK-NEXT: [4] Total number of buffer entries.
 
 # CHECK:       [1]            [2]        [3]        [4]
-# CHECK-NEXT: PdEX             34         38         40
-# CHECK-NEXT: PdFPU            34         38         64
+# CHECK-NEXT: PdEX             33         36         40
+# CHECK-NEXT: PdFPU            33         36         64
 # CHECK-NEXT: PdLoad           37         40         40
 # CHECK-NEXT: PdStore          0          0          24
 
@@ -646,7 +646,7 @@ vmovaps (%rbx), %ymm3
 # CHECK-NEXT: RAT     - Register unavailable:                      0
 # CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
 # CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
-# CHECK-NEXT: LQ      - Load queue full:                           532  (87.9%)
+# CHECK-NEXT: LQ      - Load queue full:                           533  (88.1%)
 # CHECK-NEXT: SQ      - Store queue full:                          0
 # CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
 
@@ -668,8 +668,8 @@ vmovaps (%rbx), %ymm3
 # CHECK-NEXT: [4] Total number of buffer entries.
 
 # CHECK:       [1]            [2]        [3]        [4]
-# CHECK-NEXT: PdEX             34         38         40
-# CHECK-NEXT: PdFPU            34         38         64
+# CHECK-NEXT: PdEX             33         36         40
+# CHECK-NEXT: PdFPU            33         36         64
 # CHECK-NEXT: PdLoad           37         40         40
 # CHECK-NEXT: PdStore          0          0          24
 
@@ -760,7 +760,7 @@ vmovaps (%rbx), %ymm3
 # CHECK-NEXT: RAT     - Register unavailable:                      0
 # CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
 # CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
-# CHECK-NEXT: LQ      - Load queue full:                           344  (56.9%)
+# CHECK-NEXT: LQ      - Load queue full:                           345  (57.0%)
 # CHECK-NEXT: SQ      - Store queue full:                          0
 # CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
 
@@ -781,9 +781,9 @@ vmovaps (%rbx), %ymm3
 # CHECK-NEXT: [4] Total number of buffer entries.
 
 # CHECK:       [1]            [2]        [3]        [4]
-# CHECK-NEXT: PdEX             33         38         40
-# CHECK-NEXT: PdFPU            33         38         64
-# CHECK-NEXT: PdLoad           37         40         40
+# CHECK-NEXT: PdEX             33         36         40
+# CHECK-NEXT: PdFPU            33         36         64
+# CHECK-NEXT: PdLoad           36         40         40
 # CHECK-NEXT: PdStore          0          0          24
 
 # CHECK:      Resources:
index 4fc58a3827e708f8fc4e960f732de678c8e87c39..b24272c4166ad344c7c51827a298ef5b5abd7ca4 100644 (file)
@@ -81,14 +81,13 @@ vmovaps %ymm3, (%rbx)
 # CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
 # CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
 # CHECK-NEXT: LQ      - Load queue full:                           0
-# CHECK-NEXT: SQ      - Store queue full:                          370  (91.8%)
+# CHECK-NEXT: SQ      - Store queue full:                          371  (92.1%)
 # CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
 
 # CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT:  0,              25  (6.2%)
-# CHECK-NEXT:  1,              370  (91.8%)
-# CHECK-NEXT:  2,              1  (0.2%)
+# CHECK-NEXT:  0,              24  (6.0%)
+# CHECK-NEXT:  1,              372  (92.3%)
 # CHECK-NEXT:  4,              7  (1.7%)
 
 # CHECK:      Schedulers - number of cycles where we saw N micro opcodes issued:
@@ -103,10 +102,10 @@ vmovaps %ymm3, (%rbx)
 # CHECK-NEXT: [4] Total number of buffer entries.
 
 # CHECK:       [1]            [2]        [3]        [4]
-# CHECK-NEXT: PdEX             22         23         40
+# CHECK-NEXT: PdEX             21         22         40
 # CHECK-NEXT: PdFPU            0          0          64
 # CHECK-NEXT: PdLoad           0          0          40
-# CHECK-NEXT: PdStore          23         24         24
+# CHECK-NEXT: PdStore          22         23         24
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0.0] - PdAGLU01
@@ -195,14 +194,13 @@ vmovaps %ymm3, (%rbx)
 # CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
 # CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
 # CHECK-NEXT: LQ      - Load queue full:                           0
-# CHECK-NEXT: SQ      - Store queue full:                          370  (91.8%)
+# CHECK-NEXT: SQ      - Store queue full:                          371  (92.1%)
 # CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
 
 # CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT:  0,              25  (6.2%)
-# CHECK-NEXT:  1,              370  (91.8%)
-# CHECK-NEXT:  2,              1  (0.2%)
+# CHECK-NEXT:  0,              24  (6.0%)
+# CHECK-NEXT:  1,              372  (92.3%)
 # CHECK-NEXT:  4,              7  (1.7%)
 
 # CHECK:      Schedulers - number of cycles where we saw N micro opcodes issued:
@@ -217,10 +215,10 @@ vmovaps %ymm3, (%rbx)
 # CHECK-NEXT: [4] Total number of buffer entries.
 
 # CHECK:       [1]            [2]        [3]        [4]
-# CHECK-NEXT: PdEX             22         23         40
+# CHECK-NEXT: PdEX             21         22         40
 # CHECK-NEXT: PdFPU            0          0          64
 # CHECK-NEXT: PdLoad           0          0          40
-# CHECK-NEXT: PdStore          23         24         24
+# CHECK-NEXT: PdStore          22         23         24
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0.0] - PdAGLU01
@@ -309,14 +307,13 @@ vmovaps %ymm3, (%rbx)
 # CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
 # CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
 # CHECK-NEXT: LQ      - Load queue full:                           0
-# CHECK-NEXT: SQ      - Store queue full:                          370  (91.8%)
+# CHECK-NEXT: SQ      - Store queue full:                          371  (92.1%)
 # CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
 
 # CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT:  0,              25  (6.2%)
-# CHECK-NEXT:  1,              370  (91.8%)
-# CHECK-NEXT:  2,              1  (0.2%)
+# CHECK-NEXT:  0,              24  (6.0%)
+# CHECK-NEXT:  1,              372  (92.3%)
 # CHECK-NEXT:  4,              7  (1.7%)
 
 # CHECK:      Schedulers - number of cycles where we saw N micro opcodes issued:
@@ -331,10 +328,10 @@ vmovaps %ymm3, (%rbx)
 # CHECK-NEXT: [4] Total number of buffer entries.
 
 # CHECK:       [1]            [2]        [3]        [4]
-# CHECK-NEXT: PdEX             22         23         40
+# CHECK-NEXT: PdEX             21         22         40
 # CHECK-NEXT: PdFPU            0          0          64
 # CHECK-NEXT: PdLoad           0          0          40
-# CHECK-NEXT: PdStore          23         24         24
+# CHECK-NEXT: PdStore          22         23         24
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0.0] - PdAGLU01
@@ -423,14 +420,13 @@ vmovaps %ymm3, (%rbx)
 # CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
 # CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
 # CHECK-NEXT: LQ      - Load queue full:                           0
-# CHECK-NEXT: SQ      - Store queue full:                          370  (91.8%)
+# CHECK-NEXT: SQ      - Store queue full:                          371  (92.1%)
 # CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
 
 # CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT:  0,              25  (6.2%)
-# CHECK-NEXT:  1,              370  (91.8%)
-# CHECK-NEXT:  2,              1  (0.2%)
+# CHECK-NEXT:  0,              24  (6.0%)
+# CHECK-NEXT:  1,              372  (92.3%)
 # CHECK-NEXT:  4,              7  (1.7%)
 
 # CHECK:      Schedulers - number of cycles where we saw N micro opcodes issued:
@@ -445,10 +441,10 @@ vmovaps %ymm3, (%rbx)
 # CHECK-NEXT: [4] Total number of buffer entries.
 
 # CHECK:       [1]            [2]        [3]        [4]
-# CHECK-NEXT: PdEX             22         23         40
+# CHECK-NEXT: PdEX             21         22         40
 # CHECK-NEXT: PdFPU            0          0          64
 # CHECK-NEXT: PdLoad           0          0          40
-# CHECK-NEXT: PdStore          23         24         24
+# CHECK-NEXT: PdStore          22         23         24
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0.0] - PdAGLU01
@@ -537,7 +533,7 @@ vmovaps %ymm3, (%rbx)
 # CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
 # CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
 # CHECK-NEXT: LQ      - Load queue full:                           0
-# CHECK-NEXT: SQ      - Store queue full:                          747  (93.0%)
+# CHECK-NEXT: SQ      - Store queue full:                          748  (93.2%)
 # CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
 
 # CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
@@ -559,10 +555,10 @@ vmovaps %ymm3, (%rbx)
 # CHECK-NEXT: [4] Total number of buffer entries.
 
 # CHECK:       [1]            [2]        [3]        [4]
-# CHECK-NEXT: PdEX             22         23         40
-# CHECK-NEXT: PdFPU            22         23         64
+# CHECK-NEXT: PdEX             21         23         40
+# CHECK-NEXT: PdFPU            21         23         64
 # CHECK-NEXT: PdLoad           0          0          40
-# CHECK-NEXT: PdStore          23         24         24
+# CHECK-NEXT: PdStore          22         24         24
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0.0] - PdAGLU01
@@ -650,16 +646,17 @@ vmovaps %ymm3, (%rbx)
 # CHECK:      Dynamic Dispatch Stall Cycles:
 # CHECK-NEXT: RAT     - Register unavailable:                      0
 # CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
-# CHECK-NEXT: SCHEDQ  - Scheduler full:                            185  (30.7%)
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
 # CHECK-NEXT: LQ      - Load queue full:                           0
-# CHECK-NEXT: SQ      - Store queue full:                          372  (61.8%)
+# CHECK-NEXT: SQ      - Store queue full:                          559  (92.9%)
 # CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
 
 # CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT:  0,              223  (37.0%)
-# CHECK-NEXT:  1,              372  (61.8%)
-# CHECK-NEXT:  4,              7  (1.2%)
+# CHECK-NEXT:  0,              222  (36.9%)
+# CHECK-NEXT:  1,              373  (62.0%)
+# CHECK-NEXT:  3,              1  (0.2%)
+# CHECK-NEXT:  4,              6  (1.0%)
 
 # CHECK:      Schedulers - number of cycles where we saw N micro opcodes issued:
 # CHECK-NEXT: [# issued], [# cycles]
@@ -673,10 +670,10 @@ vmovaps %ymm3, (%rbx)
 # CHECK-NEXT: [4] Total number of buffer entries.
 
 # CHECK:       [1]            [2]        [3]        [4]
-# CHECK-NEXT: PdEX             22         24         40
-# CHECK-NEXT: PdFPU            22         24         64
+# CHECK-NEXT: PdEX             21         23         40
+# CHECK-NEXT: PdFPU            21         23         64
 # CHECK-NEXT: PdLoad           0          0          40
-# CHECK-NEXT: PdStore          23         24         24
+# CHECK-NEXT: PdStore          22         24         24
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0.0] - PdAGLU01
@@ -763,9 +760,9 @@ vmovaps %ymm3, (%rbx)
 # CHECK:      Dynamic Dispatch Stall Cycles:
 # CHECK-NEXT: RAT     - Register unavailable:                      0
 # CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
-# CHECK-NEXT: SCHEDQ  - Scheduler full:                            5963  (83.2%)
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            5777  (80.6%)
 # CHECK-NEXT: LQ      - Load queue full:                           0
-# CHECK-NEXT: SQ      - Store queue full:                          374  (5.2%)
+# CHECK-NEXT: SQ      - Store queue full:                          561  (7.8%)
 # CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
 
 # CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: