}
}
+// Instructions with local forwarding disabled incur one extra cycle (+1cy) of latency.
+
// A folded store needs a cycle on the SAGU for the store data,
// most RMW instructions don't need an extra uop.
defm : X86WriteRes<WriteRMW, [JSAGU], 1, [1], 0>;
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-defm : JWriteResFpuPair<WriteFHAdd, [JFPU0, JFPA], 3>;
-defm : JWriteResYMMPair<WriteFHAddY, [JFPU0, JFPA], 3, [2,2], 2>;
-defm : JWriteResFpuPair<WritePHAdd, [JFPU01, JVALU], 1>;
-defm : JWriteResFpuPair<WritePHAddX, [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WriteFHAdd, [JFPU0, JFPA], 4>; // 3cy base +1cy latency (local forwarding disabled).
+defm : JWriteResYMMPair<WriteFHAddY, [JFPU0, JFPA], 4, [2,2], 2>; // 3cy base +1cy latency (local forwarding disabled).
+defm : JWriteResFpuPair<WritePHAdd, [JFPU01, JVALU], 2>; // 1cy base +1cy latency (local forwarding disabled).
+defm : JWriteResFpuPair<WritePHAddX, [JFPU01, JVALU], 2>; // 1cy base +1cy latency (local forwarding disabled).
defm : X86WriteResPairUnsupported<WritePHAddY>;
////////////////////////////////////////////////////////////////////////////////
;
; BTVER2-LABEL: test_haddpd:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
+; BTVER2-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [4:2.00]
+; BTVER2-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_haddpd:
;
; BTVER2-LABEL: test_haddps:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
+; BTVER2-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [4:2.00]
+; BTVER2-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_haddps:
;
; BTVER2-LABEL: test_hsubpd:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
+; BTVER2-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [4:2.00]
+; BTVER2-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_hsubpd:
;
; BTVER2-LABEL: test_hsubps:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
+; BTVER2-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [4:2.00]
+; BTVER2-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_hsubps:
;
; BTVER2-LABEL: test_phaddd:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: phaddd %mm1, %mm0 # sched: [1:0.50]
-; BTVER2-NEXT: phaddd (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: phaddd %mm1, %mm0 # sched: [2:0.50]
+; BTVER2-NEXT: phaddd (%rdi), %mm0 # sched: [7:1.00]
; BTVER2-NEXT: movq %mm0, %rax # sched: [4:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
;
; BTVER2-LABEL: test_phaddsw:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: phaddsw %mm1, %mm0 # sched: [1:0.50]
-; BTVER2-NEXT: phaddsw (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: phaddsw %mm1, %mm0 # sched: [2:0.50]
+; BTVER2-NEXT: phaddsw (%rdi), %mm0 # sched: [7:1.00]
; BTVER2-NEXT: movq %mm0, %rax # sched: [4:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
;
; BTVER2-LABEL: test_phaddw:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: phaddw %mm1, %mm0 # sched: [1:0.50]
-; BTVER2-NEXT: phaddw (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: phaddw %mm1, %mm0 # sched: [2:0.50]
+; BTVER2-NEXT: phaddw (%rdi), %mm0 # sched: [7:1.00]
; BTVER2-NEXT: movq %mm0, %rax # sched: [4:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
;
; BTVER2-LABEL: test_phsubd:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: phsubd %mm1, %mm0 # sched: [1:0.50]
-; BTVER2-NEXT: phsubd (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: phsubd %mm1, %mm0 # sched: [2:0.50]
+; BTVER2-NEXT: phsubd (%rdi), %mm0 # sched: [7:1.00]
; BTVER2-NEXT: movq %mm0, %rax # sched: [4:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
;
; BTVER2-LABEL: test_phsubsw:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: phsubsw %mm1, %mm0 # sched: [1:0.50]
-; BTVER2-NEXT: phsubsw (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: phsubsw %mm1, %mm0 # sched: [2:0.50]
+; BTVER2-NEXT: phsubsw (%rdi), %mm0 # sched: [7:1.00]
; BTVER2-NEXT: movq %mm0, %rax # sched: [4:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
;
; BTVER2-LABEL: test_phsubw:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: phsubw %mm1, %mm0 # sched: [1:0.50]
-; BTVER2-NEXT: phsubw (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: phsubw %mm1, %mm0 # sched: [2:0.50]
+; BTVER2-NEXT: phsubw (%rdi), %mm0 # sched: [7:1.00]
; BTVER2-NEXT: movq %mm0, %rax # sched: [4:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
;
; BTVER2-SSE-LABEL: test_haddpd:
; BTVER2-SSE: # %bb.0:
-; BTVER2-SSE-NEXT: haddpd %xmm1, %xmm0 # sched: [3:1.00]
-; BTVER2-SSE-NEXT: haddpd (%rdi), %xmm0 # sched: [8:1.00]
+; BTVER2-SSE-NEXT: haddpd %xmm1, %xmm0 # sched: [4:1.00]
+; BTVER2-SSE-NEXT: haddpd (%rdi), %xmm0 # sched: [9:1.00]
; BTVER2-SSE-NEXT: retq # sched: [4:1.00]
;
; BTVER2-LABEL: test_haddpd:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BTVER2-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BTVER2-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-SSE-LABEL: test_haddpd:
;
; BTVER2-SSE-LABEL: test_haddps:
; BTVER2-SSE: # %bb.0:
-; BTVER2-SSE-NEXT: haddps %xmm1, %xmm0 # sched: [3:1.00]
-; BTVER2-SSE-NEXT: haddps (%rdi), %xmm0 # sched: [8:1.00]
+; BTVER2-SSE-NEXT: haddps %xmm1, %xmm0 # sched: [4:1.00]
+; BTVER2-SSE-NEXT: haddps (%rdi), %xmm0 # sched: [9:1.00]
; BTVER2-SSE-NEXT: retq # sched: [4:1.00]
;
; BTVER2-LABEL: test_haddps:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BTVER2-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BTVER2-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-SSE-LABEL: test_haddps:
;
; BTVER2-SSE-LABEL: test_hsubpd:
; BTVER2-SSE: # %bb.0:
-; BTVER2-SSE-NEXT: hsubpd %xmm1, %xmm0 # sched: [3:1.00]
-; BTVER2-SSE-NEXT: hsubpd (%rdi), %xmm0 # sched: [8:1.00]
+; BTVER2-SSE-NEXT: hsubpd %xmm1, %xmm0 # sched: [4:1.00]
+; BTVER2-SSE-NEXT: hsubpd (%rdi), %xmm0 # sched: [9:1.00]
; BTVER2-SSE-NEXT: retq # sched: [4:1.00]
;
; BTVER2-LABEL: test_hsubpd:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BTVER2-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BTVER2-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-SSE-LABEL: test_hsubpd:
;
; BTVER2-SSE-LABEL: test_hsubps:
; BTVER2-SSE: # %bb.0:
-; BTVER2-SSE-NEXT: hsubps %xmm1, %xmm0 # sched: [3:1.00]
-; BTVER2-SSE-NEXT: hsubps (%rdi), %xmm0 # sched: [8:1.00]
+; BTVER2-SSE-NEXT: hsubps %xmm1, %xmm0 # sched: [4:1.00]
+; BTVER2-SSE-NEXT: hsubps (%rdi), %xmm0 # sched: [9:1.00]
; BTVER2-SSE-NEXT: retq # sched: [4:1.00]
;
; BTVER2-LABEL: test_hsubps:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BTVER2-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BTVER2-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-SSE-LABEL: test_hsubps:
;
; BTVER2-SSE-LABEL: test_phaddd:
; BTVER2-SSE: # %bb.0:
-; BTVER2-SSE-NEXT: phaddd %xmm1, %xmm0 # sched: [1:0.50]
-; BTVER2-SSE-NEXT: phaddd (%rdi), %xmm0 # sched: [6:1.00]
+; BTVER2-SSE-NEXT: phaddd %xmm1, %xmm0 # sched: [2:0.50]
+; BTVER2-SSE-NEXT: phaddd (%rdi), %xmm0 # sched: [7:1.00]
; BTVER2-SSE-NEXT: retq # sched: [4:1.00]
;
; BTVER2-LABEL: test_phaddd:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BTVER2-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BTVER2-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-SSE-LABEL: test_phaddd:
;
; BTVER2-SSE-LABEL: test_phaddsw:
; BTVER2-SSE: # %bb.0:
-; BTVER2-SSE-NEXT: phaddsw %xmm1, %xmm0 # sched: [1:0.50]
-; BTVER2-SSE-NEXT: phaddsw (%rdi), %xmm0 # sched: [6:1.00]
+; BTVER2-SSE-NEXT: phaddsw %xmm1, %xmm0 # sched: [2:0.50]
+; BTVER2-SSE-NEXT: phaddsw (%rdi), %xmm0 # sched: [7:1.00]
; BTVER2-SSE-NEXT: retq # sched: [4:1.00]
;
; BTVER2-LABEL: test_phaddsw:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BTVER2-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BTVER2-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-SSE-LABEL: test_phaddsw:
;
; BTVER2-SSE-LABEL: test_phaddw:
; BTVER2-SSE: # %bb.0:
-; BTVER2-SSE-NEXT: phaddw %xmm1, %xmm0 # sched: [1:0.50]
-; BTVER2-SSE-NEXT: phaddw (%rdi), %xmm0 # sched: [6:1.00]
+; BTVER2-SSE-NEXT: phaddw %xmm1, %xmm0 # sched: [2:0.50]
+; BTVER2-SSE-NEXT: phaddw (%rdi), %xmm0 # sched: [7:1.00]
; BTVER2-SSE-NEXT: retq # sched: [4:1.00]
;
; BTVER2-LABEL: test_phaddw:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BTVER2-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BTVER2-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-SSE-LABEL: test_phaddw:
;
; BTVER2-SSE-LABEL: test_phsubd:
; BTVER2-SSE: # %bb.0:
-; BTVER2-SSE-NEXT: phsubd %xmm1, %xmm0 # sched: [1:0.50]
-; BTVER2-SSE-NEXT: phsubd (%rdi), %xmm0 # sched: [6:1.00]
+; BTVER2-SSE-NEXT: phsubd %xmm1, %xmm0 # sched: [2:0.50]
+; BTVER2-SSE-NEXT: phsubd (%rdi), %xmm0 # sched: [7:1.00]
; BTVER2-SSE-NEXT: retq # sched: [4:1.00]
;
; BTVER2-LABEL: test_phsubd:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BTVER2-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BTVER2-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-SSE-LABEL: test_phsubd:
;
; BTVER2-SSE-LABEL: test_phsubsw:
; BTVER2-SSE: # %bb.0:
-; BTVER2-SSE-NEXT: phsubsw %xmm1, %xmm0 # sched: [1:0.50]
-; BTVER2-SSE-NEXT: phsubsw (%rdi), %xmm0 # sched: [6:1.00]
+; BTVER2-SSE-NEXT: phsubsw %xmm1, %xmm0 # sched: [2:0.50]
+; BTVER2-SSE-NEXT: phsubsw (%rdi), %xmm0 # sched: [7:1.00]
; BTVER2-SSE-NEXT: retq # sched: [4:1.00]
;
; BTVER2-LABEL: test_phsubsw:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BTVER2-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BTVER2-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-SSE-LABEL: test_phsubsw:
;
; BTVER2-SSE-LABEL: test_phsubw:
; BTVER2-SSE: # %bb.0:
-; BTVER2-SSE-NEXT: phsubw %xmm1, %xmm0 # sched: [1:0.50]
-; BTVER2-SSE-NEXT: phsubw (%rdi), %xmm0 # sched: [6:1.00]
+; BTVER2-SSE-NEXT: phsubw %xmm1, %xmm0 # sched: [2:0.50]
+; BTVER2-SSE-NEXT: phsubw (%rdi), %xmm0 # sched: [7:1.00]
; BTVER2-SSE-NEXT: retq # sched: [4:1.00]
;
; BTVER2-LABEL: test_phsubw:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BTVER2-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BTVER2-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-SSE-LABEL: test_phsubw:
# CHECK: Iterations: 300
# CHECK-NEXT: Instructions: 900
-# CHECK-NEXT: Total Cycles: 610
+# CHECK-NEXT: Total Cycles: 611
# CHECK-NEXT: Total uOps: 900
# CHECK: Dispatch Width: 2
-# CHECK-NEXT: uOps Per Cycle: 1.48
-# CHECK-NEXT: IPC: 1.48
+# CHECK-NEXT: uOps Per Cycle: 1.47
+# CHECK-NEXT: IPC: 1.47
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Instruction Info:
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 2 1.00 vmulps %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 1 3 1.00 vhaddps %xmm2, %xmm2, %xmm3
-# CHECK-NEXT: 1 3 1.00 vhaddps %xmm3, %xmm3, %xmm4
+# CHECK-NEXT: 1 4 1.00 vhaddps %xmm2, %xmm2, %xmm3
+# CHECK-NEXT: 1 4 1.00 vhaddps %xmm3, %xmm3, %xmm4
# CHECK: Resources:
# CHECK-NEXT: [0] - JALU0
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeER. . . vmulps %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: [0,1] D==eeeER . . vhaddps %xmm2, %xmm2, %xmm3
-# CHECK-NEXT: [0,2] .D====eeeER . vhaddps %xmm3, %xmm3, %xmm4
-# CHECK-NEXT: [1,0] .DeeE-----R . vmulps %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: [1,1] . D=eeeE---R . vhaddps %xmm2, %xmm2, %xmm3
-# CHECK-NEXT: [1,2] . D====eeeER . vhaddps %xmm3, %xmm3, %xmm4
-# CHECK-NEXT: [2,0] . DeeE-----R . vmulps %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: [2,1] . D====eeeER . vhaddps %xmm2, %xmm2, %xmm3
-# CHECK-NEXT: [2,2] . D======eeeER vhaddps %xmm3, %xmm3, %xmm4
+# CHECK-NEXT: [0,1] D==eeeeER . . vhaddps %xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [0,2] .D=====eeeeER . vhaddps %xmm3, %xmm3, %xmm4
+# CHECK-NEXT: [1,0] .DeeE-------R . vmulps %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [1,1] . D=eeeeE----R . vhaddps %xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [1,2] . D=====eeeeER . vhaddps %xmm3, %xmm3, %xmm4
+# CHECK-NEXT: [2,0] . DeeE-------R. vmulps %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [2,1] . D==eeeeE---R. vhaddps %xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [2,2] . D=====eeeeER vhaddps %xmm3, %xmm3, %xmm4
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 3 1.0 1.0 3.3 vmulps %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 1. 3 3.3 0.7 1.0 vhaddps %xmm2, %xmm2, %xmm3
-# CHECK-NEXT: 2. 3 5.7 0.0 0.0 vhaddps %xmm3, %xmm3, %xmm4
+# CHECK-NEXT: 0. 3 1.0 1.0 4.7 vmulps %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 1. 3 2.7 0.0 2.3 vhaddps %xmm2, %xmm2, %xmm3
+# CHECK-NEXT: 2. 3 6.0 0.0 0.0 vhaddps %xmm3, %xmm3, %xmm4
# CHECK: Iterations: 1
# CHECK-NEXT: Instructions: 2
-# CHECK-NEXT: Total Cycles: 11
+# CHECK-NEXT: Total Cycles: 12
# CHECK-NEXT: Total uOps: 2
# CHECK: Dispatch Width: 2
-# CHECK-NEXT: uOps Per Cycle: 0.18
-# CHECK-NEXT: IPC: 0.18
+# CHECK-NEXT: uOps Per Cycle: 0.17
+# CHECK-NEXT: IPC: 0.17
# CHECK-NEXT: Block RThroughput: 1.0
# CHECK: Instruction Info:
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 1 0.50 vshufps $0, %xmm0, %xmm1, %xmm1
-# CHECK-NEXT: 1 8 1.00 * vhaddps (%rdi), %xmm1, %xmm2
+# CHECK-NEXT: 1 9 1.00 * vhaddps (%rdi), %xmm1, %xmm2
# CHECK: Timeline view:
-# CHECK-NEXT: 0
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeER . . vshufps $0, %xmm0, %xmm1, %xmm1
-# CHECK-NEXT: [0,1] DeeeeeeeeER vhaddps (%rdi), %xmm1, %xmm2
+# CHECK: [0,0] DeER . .. vshufps $0, %xmm0, %xmm1, %xmm1
+# CHECK-NEXT: [0,1] DeeeeeeeeeER vhaddps (%rdi), %xmm1, %xmm2
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK: Iterations: 1
# CHECK-NEXT: Instructions: 2
-# CHECK-NEXT: Total Cycles: 12
+# CHECK-NEXT: Total Cycles: 13
# CHECK-NEXT: Total uOps: 3
# CHECK: Dispatch Width: 2
-# CHECK-NEXT: uOps Per Cycle: 0.25
-# CHECK-NEXT: IPC: 0.17
+# CHECK-NEXT: uOps Per Cycle: 0.23
+# CHECK-NEXT: IPC: 0.15
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Instruction Info:
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 1 0.50 vshufps $0, %xmm0, %xmm1, %xmm1
-# CHECK-NEXT: 2 8 2.00 * vhaddps (%rdi), %ymm1, %ymm2
+# CHECK-NEXT: 2 9 2.00 * vhaddps (%rdi), %ymm1, %ymm2
# CHECK: Timeline view:
-# CHECK-NEXT: 01
+# CHECK-NEXT: 012
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeER . .. vshufps $0, %xmm0, %xmm1, %xmm1
-# CHECK-NEXT: [0,1] .DeeeeeeeeER vhaddps (%rdi), %ymm1, %ymm2
+# CHECK: [0,0] DeER . . . vshufps $0, %xmm0, %xmm1, %xmm1
+# CHECK-NEXT: [0,1] .DeeeeeeeeeER vhaddps (%rdi), %ymm1, %ymm2
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# ENABLED: Iterations: 100
# ENABLED-NEXT: Instructions: 300
-# ENABLED-NEXT: Total Cycles: 209
+# ENABLED-NEXT: Total Cycles: 211
# ENABLED-NEXT: Total uOps: 300
# ENABLED: Dispatch Width: 2
-# ENABLED-NEXT: uOps Per Cycle: 1.44
-# ENABLED-NEXT: IPC: 1.44
+# ENABLED-NEXT: uOps Per Cycle: 1.42
+# ENABLED-NEXT: IPC: 1.42
# ENABLED-NEXT: Block RThroughput: 2.0
# ENABLED: Instruction Info:
# ENABLED: [1] [2] [3] [4] [5] [6] Instructions:
# ENABLED-NEXT: 1 2 1.00 vmulps %xmm0, %xmm1, %xmm2
-# ENABLED-NEXT: 1 3 1.00 vhaddps %xmm2, %xmm2, %xmm3
-# ENABLED-NEXT: 1 3 1.00 vhaddps %xmm3, %xmm3, %xmm4
+# ENABLED-NEXT: 1 4 1.00 vhaddps %xmm2, %xmm2, %xmm3
+# ENABLED-NEXT: 1 4 1.00 vhaddps %xmm3, %xmm3, %xmm4
# CHECK-NEXT: 1 1 1.00 * vextractf128 $1, %ymm0, (%rax)
# CHECK-NEXT: 1 3 1.00 vextractps $1, %xmm0, %ecx
# CHECK-NEXT: 1 3 1.00 * vextractps $1, %xmm0, (%rax)
-# CHECK-NEXT: 1 3 1.00 vhaddpd %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 1 8 1.00 * vhaddpd (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 2 3 2.00 vhaddpd %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 2 8 2.00 * vhaddpd (%rax), %ymm1, %ymm2
-# CHECK-NEXT: 1 3 1.00 vhaddps %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 1 8 1.00 * vhaddps (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 2 3 2.00 vhaddps %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 2 8 2.00 * vhaddps (%rax), %ymm1, %ymm2
-# CHECK-NEXT: 1 3 1.00 vhsubpd %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 1 8 1.00 * vhsubpd (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 2 3 2.00 vhsubpd %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 2 8 2.00 * vhsubpd (%rax), %ymm1, %ymm2
-# CHECK-NEXT: 1 3 1.00 vhsubps %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 1 8 1.00 * vhsubps (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 2 3 2.00 vhsubps %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 2 8 2.00 * vhsubps (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 1 4 1.00 vhaddpd %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 1 9 1.00 * vhaddpd (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 2 4 2.00 vhaddpd %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 2 9 2.00 * vhaddpd (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 1 4 1.00 vhaddps %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 1 9 1.00 * vhaddps (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 2 4 2.00 vhaddps %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 2 9 2.00 * vhaddps (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 1 4 1.00 vhsubpd %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 1 9 1.00 * vhsubpd (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 2 4 2.00 vhsubpd %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 2 9 2.00 * vhsubpd (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 1 4 1.00 vhsubps %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 1 9 1.00 * vhsubps (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 2 4 2.00 vhsubps %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 2 9 2.00 * vhsubps (%rax), %ymm1, %ymm2
# CHECK-NEXT: 2 1 1.00 vinsertf128 $1, %xmm0, %ymm1, %ymm2
# CHECK-NEXT: 2 6 1.00 * vinsertf128 $1, (%rax), %ymm1, %ymm2
# CHECK-NEXT: 1 1 0.50 vinsertps $1, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 1 3 1.00 * vpextrq $1, %xmm0, (%rax)
# CHECK-NEXT: 1 3 1.00 vpextrw $1, %xmm0, %ecx
# CHECK-NEXT: 1 3 1.00 * vpextrw $1, %xmm0, (%rax)
-# CHECK-NEXT: 1 1 0.50 vphaddd %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 1 6 1.00 * vphaddd (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 1 1 0.50 vphaddsw %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 1 6 1.00 * vphaddsw (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 1 1 0.50 vphaddw %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 1 6 1.00 * vphaddw (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 1 2 0.50 vphaddd %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 1 7 1.00 * vphaddd (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 1 2 0.50 vphaddsw %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 1 7 1.00 * vphaddsw (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 1 2 0.50 vphaddw %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 1 7 1.00 * vphaddw (%rax), %xmm1, %xmm2
# CHECK-NEXT: 1 2 0.50 vphminposuw %xmm0, %xmm2
# CHECK-NEXT: 1 7 1.00 * vphminposuw (%rax), %xmm2
-# CHECK-NEXT: 1 1 0.50 vphsubd %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 1 6 1.00 * vphsubd (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 1 1 0.50 vphsubsw %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 1 6 1.00 * vphsubsw (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 1 1 0.50 vphsubw %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 1 6 1.00 * vphsubw (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 1 2 0.50 vphsubd %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 1 7 1.00 * vphsubd (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 1 2 0.50 vphsubsw %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 1 7 1.00 * vphsubsw (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 1 2 0.50 vphsubw %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 1 7 1.00 * vphsubw (%rax), %xmm1, %xmm2
# CHECK-NEXT: 2 7 0.50 vpinsrb $1, %eax, %xmm1, %xmm2
# CHECK-NEXT: 1 4 1.00 * vpinsrb $1, (%rax), %xmm1, %xmm2
# CHECK-NEXT: 2 7 0.50 vpinsrd $1, %eax, %xmm1, %xmm2
# CHECK-NEXT: 1 8 1.00 * addsubpd (%rax), %xmm2
# CHECK-NEXT: 1 3 1.00 addsubps %xmm0, %xmm2
# CHECK-NEXT: 1 8 1.00 * addsubps (%rax), %xmm2
-# CHECK-NEXT: 1 3 1.00 haddpd %xmm0, %xmm2
-# CHECK-NEXT: 1 8 1.00 * haddpd (%rax), %xmm2
-# CHECK-NEXT: 1 3 1.00 haddps %xmm0, %xmm2
-# CHECK-NEXT: 1 8 1.00 * haddps (%rax), %xmm2
-# CHECK-NEXT: 1 3 1.00 hsubpd %xmm0, %xmm2
-# CHECK-NEXT: 1 8 1.00 * hsubpd (%rax), %xmm2
-# CHECK-NEXT: 1 3 1.00 hsubps %xmm0, %xmm2
-# CHECK-NEXT: 1 8 1.00 * hsubps (%rax), %xmm2
+# CHECK-NEXT: 1 4 1.00 haddpd %xmm0, %xmm2
+# CHECK-NEXT: 1 9 1.00 * haddpd (%rax), %xmm2
+# CHECK-NEXT: 1 4 1.00 haddps %xmm0, %xmm2
+# CHECK-NEXT: 1 9 1.00 * haddps (%rax), %xmm2
+# CHECK-NEXT: 1 4 1.00 hsubpd %xmm0, %xmm2
+# CHECK-NEXT: 1 9 1.00 * hsubpd (%rax), %xmm2
+# CHECK-NEXT: 1 4 1.00 hsubps %xmm0, %xmm2
+# CHECK-NEXT: 1 9 1.00 * hsubps (%rax), %xmm2
# CHECK-NEXT: 1 5 1.00 * lddqu (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 movddup %xmm0, %xmm2
# CHECK-NEXT: 1 6 1.00 * movddup (%rax), %xmm2
# CHECK-NEXT: 1 6 1.00 * palignr $1, (%rax), %mm2
# CHECK-NEXT: 1 1 0.50 palignr $1, %xmm0, %xmm2
# CHECK-NEXT: 1 6 1.00 * palignr $1, (%rax), %xmm2
-# CHECK-NEXT: 1 1 0.50 phaddd %mm0, %mm2
-# CHECK-NEXT: 1 6 1.00 * phaddd (%rax), %mm2
-# CHECK-NEXT: 1 1 0.50 phaddd %xmm0, %xmm2
-# CHECK-NEXT: 1 6 1.00 * phaddd (%rax), %xmm2
-# CHECK-NEXT: 1 1 0.50 phaddsw %mm0, %mm2
-# CHECK-NEXT: 1 6 1.00 * phaddsw (%rax), %mm2
-# CHECK-NEXT: 1 1 0.50 phaddsw %xmm0, %xmm2
-# CHECK-NEXT: 1 6 1.00 * phaddsw (%rax), %xmm2
-# CHECK-NEXT: 1 1 0.50 phaddw %mm0, %mm2
-# CHECK-NEXT: 1 6 1.00 * phaddw (%rax), %mm2
-# CHECK-NEXT: 1 1 0.50 phaddw %xmm0, %xmm2
-# CHECK-NEXT: 1 6 1.00 * phaddw (%rax), %xmm2
-# CHECK-NEXT: 1 1 0.50 phsubd %mm0, %mm2
-# CHECK-NEXT: 1 6 1.00 * phsubd (%rax), %mm2
-# CHECK-NEXT: 1 1 0.50 phsubd %xmm0, %xmm2
-# CHECK-NEXT: 1 6 1.00 * phsubd (%rax), %xmm2
-# CHECK-NEXT: 1 1 0.50 phsubsw %mm0, %mm2
-# CHECK-NEXT: 1 6 1.00 * phsubsw (%rax), %mm2
-# CHECK-NEXT: 1 1 0.50 phsubsw %xmm0, %xmm2
-# CHECK-NEXT: 1 6 1.00 * phsubsw (%rax), %xmm2
-# CHECK-NEXT: 1 1 0.50 phsubw %mm0, %mm2
-# CHECK-NEXT: 1 6 1.00 * phsubw (%rax), %mm2
-# CHECK-NEXT: 1 1 0.50 phsubw %xmm0, %xmm2
-# CHECK-NEXT: 1 6 1.00 * phsubw (%rax), %xmm2
+# CHECK-NEXT: 1 2 0.50 phaddd %mm0, %mm2
+# CHECK-NEXT: 1 7 1.00 * phaddd (%rax), %mm2
+# CHECK-NEXT: 1 2 0.50 phaddd %xmm0, %xmm2
+# CHECK-NEXT: 1 7 1.00 * phaddd (%rax), %xmm2
+# CHECK-NEXT: 1 2 0.50 phaddsw %mm0, %mm2
+# CHECK-NEXT: 1 7 1.00 * phaddsw (%rax), %mm2
+# CHECK-NEXT: 1 2 0.50 phaddsw %xmm0, %xmm2
+# CHECK-NEXT: 1 7 1.00 * phaddsw (%rax), %xmm2
+# CHECK-NEXT: 1 2 0.50 phaddw %mm0, %mm2
+# CHECK-NEXT: 1 7 1.00 * phaddw (%rax), %mm2
+# CHECK-NEXT: 1 2 0.50 phaddw %xmm0, %xmm2
+# CHECK-NEXT: 1 7 1.00 * phaddw (%rax), %xmm2
+# CHECK-NEXT: 1 2 0.50 phsubd %mm0, %mm2
+# CHECK-NEXT: 1 7 1.00 * phsubd (%rax), %mm2
+# CHECK-NEXT: 1 2 0.50 phsubd %xmm0, %xmm2
+# CHECK-NEXT: 1 7 1.00 * phsubd (%rax), %xmm2
+# CHECK-NEXT: 1 2 0.50 phsubsw %mm0, %mm2
+# CHECK-NEXT: 1 7 1.00 * phsubsw (%rax), %mm2
+# CHECK-NEXT: 1 2 0.50 phsubsw %xmm0, %xmm2
+# CHECK-NEXT: 1 7 1.00 * phsubsw (%rax), %xmm2
+# CHECK-NEXT: 1 2 0.50 phsubw %mm0, %mm2
+# CHECK-NEXT: 1 7 1.00 * phsubw (%rax), %mm2
+# CHECK-NEXT: 1 2 0.50 phsubw %xmm0, %xmm2
+# CHECK-NEXT: 1 7 1.00 * phsubw (%rax), %xmm2
# CHECK-NEXT: 1 2 1.00 pmaddubsw %mm0, %mm2
# CHECK-NEXT: 1 7 1.00 * pmaddubsw (%rax), %mm2
# CHECK-NEXT: 1 2 1.00 pmaddubsw %xmm0, %xmm2