0. 3 1.0 1.0 3.3 vmulps %xmm0, %xmm1, %xmm2
1. 3 3.3 0.7 1.0 vhaddps %xmm2, %xmm2, %xmm3
2. 3 5.7 0.0 0.0 vhaddps %xmm3, %xmm3, %xmm4
+ 3 3.3 0.5 1.4 <total>
The timeline view is interesting because it shows instruction state changes
during execution. It also gives an idea of how the tool processes instructions
Table *Average Wait times* helps diagnose performance issues that are caused by
the presence of long latency instructions and potentially long data dependencies
-which may limit the ILP. Note that :program:`llvm-mca`, by default, assumes at
+which may limit the ILP. The last row, ``<total>``, shows a global average over
+all instructions measured. Note that :program:`llvm-mca`, by default, assumes at
least 1 cycle between the dispatch event and the issue event.
When the performance is limited by data dependencies and/or long latency
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 1.0 1.0 0.0 ldm r2!, {r3, r4, r5, r6, r12, lr}
# CHECK-NEXT: 1. 3 18.3 0.3 0.0 stm r0!, {r3, r4, r5, r6, r12, lr}
+# CHECK-NEXT: 3 9.7 0.7 0.0 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 5 3.0 0.2 1.6 add r1, r1, r12
# CHECK-NEXT: 1. 5 4.0 0.0 0.0 vld1.32 {d16, d17}, [r1]!
+# CHECK-NEXT: 5 3.5 0.1 0.8 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 9.7 0.3 0.0 stmg %r6, %r15, 48(%r15)
# CHECK-NEXT: 1. 3 9.0 0.3 0.0 lmg %r6, %r15, 48(%r15)
+# CHECK-NEXT: 3 9.3 0.3 0.0 <total>
# CHECK-NEXT: 1. 2 5.5 1.5 0.0 lzcntl %ecx, %eax
# CHECK-NEXT: 2. 2 8.5 0.0 0.0 andq %rcx, %rax
# CHECK-NEXT: 3. 2 9.5 0.0 0.0 bsfq %rax, %rcx
+# CHECK-NEXT: 2 7.0 0.5 0.0 <total>
# CHECK-NEXT: 0. 3 14.7 8.0 0.0 sqrtss %xmm0, %xmm0
# CHECK-NEXT: 1. 3 1.0 1.0 21.3 movss (%eax), %xmm0
# CHECK-NEXT: 2. 3 7.0 0.3 18.0 addps %xmm0, %xmm0
+# CHECK-NEXT: 3 7.6 3.1 13.1 <total>
# CHECK: [1] Code Region
# CHECK-NEXT: 0. 3 21.7 15.0 0.0 sqrtsd %xmm0, %xmm0
# CHECK-NEXT: 1. 3 1.0 1.0 35.3 movsd (%eax), %xmm0
# CHECK-NEXT: 2. 3 7.0 0.3 32.0 addps %xmm0, %xmm0
+# CHECK-NEXT: 3 9.9 5.4 22.4 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 3.7 0.3 0.0 cmpl %eax, %eax
# CHECK-NEXT: 1. 3 4.0 0.0 0.0 cmovael %ebx, %eax
+# CHECK-NEXT: 3 3.8 0.2 0.0 <total>
# CHECK-NEXT: 4. 3 3.7 0.0 12.0 pcmpeqd %xmm0, %xmm0
# CHECK-NEXT: 5. 3 4.3 0.0 11.0 pcmpeqq %xmm0, %xmm0
# CHECK-NEXT: 6. 3 5.0 0.0 10.0 pcmpeqw %xmm0, %xmm0
+# CHECK-NEXT: 3 7.1 0.1 6.6 <total>
# CHECK-NEXT: 4. 3 0.0 0.0 16.7 pcmpgtd %xmm0, %xmm0
# CHECK-NEXT: 5. 3 0.0 0.0 16.3 pcmpgtq %xmm0, %xmm0
# CHECK-NEXT: 6. 3 0.0 0.0 16.0 pcmpgtw %xmm0, %xmm0
+# CHECK-NEXT: 3 4.9 0.0 9.4 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 4.0 0.3 0.0 sbbl %edx, %edx
# CHECK-NEXT: 1. 3 6.0 0.0 0.0 sbbl %eax, %eax
+# CHECK-NEXT: 3 5.0 0.2 0.0 <total>
# CHECK-NEXT: 0. 3 5.0 0.3 0.0 imull %edx, %eax
# CHECK-NEXT: 1. 3 1.0 0.3 6.0 addl %edx, %edx
# CHECK-NEXT: 2. 3 8.0 0.0 0.0 sbbl %eax, %eax
+# CHECK-NEXT: 3 4.7 0.2 2.0 <total>
# CHECK-NEXT: 0. 3 1.0 0.7 2.7 addl %eax, %eax
# CHECK-NEXT: 1. 3 4.3 0.0 0.0 pinsrw $0, %eax, %xmm0
# CHECK-NEXT: 2. 3 5.7 0.0 0.0 pinsrw $1, %eax, %xmm0
+# CHECK-NEXT: 3 3.7 0.2 0.9 <total>
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movb (%rcx), %bpl
# CHECK-NEXT: 2. 1 2.0 2.0 0.0 movb (%rdx), %sil
# CHECK-NEXT: 3. 1 7.0 0.0 0.0 movb %dil, (%rbx)
+# CHECK-NEXT: 1 2.8 1.0 0.0 <total>
# CHECK: [1] Code Region
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movw (%rcx), %bp
# CHECK-NEXT: 2. 1 2.0 2.0 0.0 movw (%rdx), %si
# CHECK-NEXT: 3. 1 7.0 0.0 0.0 movw %di, (%rbx)
+# CHECK-NEXT: 1 2.8 1.0 0.0 <total>
# CHECK: [2] Code Region
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movl (%rcx), %ebp
# CHECK-NEXT: 2. 1 2.0 2.0 0.0 movl (%rdx), %esi
# CHECK-NEXT: 3. 1 7.0 0.0 0.0 movl %edi, (%rbx)
+# CHECK-NEXT: 1 2.8 1.0 0.0 <total>
# CHECK: [3] Code Region
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movq (%rcx), %rbp
# CHECK-NEXT: 2. 1 2.0 2.0 0.0 movq (%rdx), %rsi
# CHECK-NEXT: 3. 1 7.0 0.0 0.0 movq %rdi, (%rbx)
+# CHECK-NEXT: 1 2.8 1.0 0.0 <total>
# CHECK: [4] Code Region
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movd (%rcx), %mm1
# CHECK-NEXT: 2. 1 2.0 2.0 0.0 movd (%rdx), %mm2
# CHECK-NEXT: 3. 1 7.0 0.0 0.0 movd %mm3, (%rbx)
+# CHECK-NEXT: 1 2.8 1.0 0.0 <total>
# CHECK: [5] Code Region
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movaps (%rcx), %xmm1
# CHECK-NEXT: 2. 1 2.0 2.0 0.0 movaps (%rdx), %xmm2
# CHECK-NEXT: 3. 1 8.0 0.0 0.0 movaps %xmm3, (%rbx)
+# CHECK-NEXT: 1 3.0 1.0 0.0 <total>
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movb (%rcx), %bpl
# CHECK-NEXT: 2. 1 2.0 2.0 0.0 movb (%rdx), %sil
# CHECK-NEXT: 3. 1 2.0 2.0 0.0 movb (%rbx), %dil
+# CHECK-NEXT: 1 1.5 1.5 0.0 <total>
# CHECK: [1] Code Region
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movw (%rcx), %bp
# CHECK-NEXT: 2. 1 2.0 2.0 0.0 movw (%rdx), %si
# CHECK-NEXT: 3. 1 2.0 2.0 0.0 movw (%rbx), %di
+# CHECK-NEXT: 1 1.5 1.5 0.0 <total>
# CHECK: [2] Code Region
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movl (%rcx), %ebp
# CHECK-NEXT: 2. 1 2.0 2.0 0.0 movl (%rdx), %esi
# CHECK-NEXT: 3. 1 2.0 2.0 0.0 movl (%rbx), %edi
+# CHECK-NEXT: 1 1.5 1.5 0.0 <total>
# CHECK: [3] Code Region
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movq (%rcx), %rbp
# CHECK-NEXT: 2. 1 2.0 2.0 0.0 movq (%rdx), %rsi
# CHECK-NEXT: 3. 1 2.0 2.0 0.0 movq (%rbx), %rdi
+# CHECK-NEXT: 1 1.5 1.5 0.0 <total>
# CHECK: [4] Code Region
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movd (%rcx), %mm1
# CHECK-NEXT: 2. 1 2.0 2.0 0.0 movd (%rdx), %mm2
# CHECK-NEXT: 3. 1 2.0 2.0 0.0 movd (%rbx), %mm3
+# CHECK-NEXT: 1 1.5 1.5 0.0 <total>
# CHECK: [5] Code Region
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movaps (%rcx), %xmm1
# CHECK-NEXT: 2. 1 2.0 2.0 0.0 movaps (%rdx), %xmm2
# CHECK-NEXT: 3. 1 2.0 2.0 0.0 movaps (%rbx), %xmm3
+# CHECK-NEXT: 1 1.5 1.5 0.0 <total>
# CHECK-NEXT: 4. 1 1.0 0.0 7.0 pcmpeqd %xmm2, %xmm2
# CHECK-NEXT: 5. 1 2.0 0.0 6.0 pcmpeqq %xmm2, %xmm2
# CHECK-NEXT: 6. 1 3.0 0.0 5.0 pcmpeqw %xmm2, %xmm2
+# CHECK-NEXT: 1 2.7 0.3 3.7 <total>
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 imulq %rax, %rbx
# CHECK-NEXT: 1. 1 2.0 2.0 0.0 lzcntw %ax, %bx
# CHECK-NEXT: 2. 1 5.0 0.0 0.0 addl %ecx, %ebx
+# CHECK-NEXT: 1 2.7 1.0 0.0 <total>
# CHECK-NEXT: 0. 3 2.3 0.3 0.0 addw %cx, %dx
# CHECK-NEXT: 1. 3 1.0 1.0 1.0 movw %ax, %dx
# CHECK-NEXT: 2. 3 1.7 0.0 0.3 xorw %bx, %dx
+# CHECK-NEXT: 3 1.7 0.4 0.4 <total>
# CHECK-NEXT: 0. 3 4.3 0.3 0.0 imulw %ax, %bx
# CHECK-NEXT: 1. 3 2.3 2.3 2.0 lzcntw %ax, %bx
# CHECK-NEXT: 2. 3 5.0 0.0 1.3 addw %cx, %bx
+# CHECK-NEXT: 3 3.9 0.9 1.1 <total>
# CHECK-NEXT: 0. 3 7.3 0.3 0.0 imull %edx, %ecx
# CHECK-NEXT: 1. 3 2.3 2.3 1.7 lzcntw (%rsp), %cx
# CHECK-NEXT: 2. 3 2.7 2.7 1.0 lzcntw 2(%rsp), %cx
+# CHECK-NEXT: 3 4.1 1.8 0.9 <total>
# CHECK-NEXT: 2. 5 9.4 0.0 0.0 shll $2, %eax
# CHECK-NEXT: 3. 5 10.2 0.0 0.0 imull %ecx, %eax
# CHECK-NEXT: 4. 5 12.8 0.0 0.0 cmpl $1025, %eax
+# CHECK-NEXT: 5 10.1 0.1 0.2 <total>
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 imulw %ax, %cx
# CHECK-NEXT: 1. 1 4.0 0.0 0.0 addb %al, %cl
# CHECK-NEXT: 2. 1 5.0 0.0 0.0 addl %ecx, %ebx
+# CHECK-NEXT: 1 3.3 0.3 0.0 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 addps %xmm0, %xmm1
# CHECK-NEXT: 1. 1 1.0 0.0 0.0 mulps (%rdi), %xmm1
+# CHECK-NEXT: 1 1.0 0.5 0.0 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 imull %esi
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 imull (%rdi)
+# CHECK-NEXT: 1 1.0 1.0 0.0 <total>
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 addq %rdi, %rsi
# CHECK-NEXT: 1. 1 1.0 0.0 0.0 addq (%rsp), %rsi
# CHECK-NEXT: 2. 1 1.0 1.0 4.0 addq %rdx, %r8
+# CHECK-NEXT: 1 1.0 0.7 1.3 <total>
# CHECK-NEXT: 0. 3 0.0 0.0 3.3 xorps %xmm0, %xmm0
# CHECK-NEXT: 1. 3 1.3 1.3 1.3 movaps %xmm0, %xmm1
# CHECK-NEXT: 2. 3 2.0 0.0 0.0 addps %xmm1, %xmm1
+# CHECK-NEXT: 3 1.1 0.4 1.6 <total>
# CHECK-NEXT: 6. 3 7.7 0.0 0.0 movupd %xmm3, %xmm4
# CHECK-NEXT: 7. 3 8.3 0.0 0.0 movdqa %xmm4, %xmm5
# CHECK-NEXT: 8. 3 9.0 0.0 0.0 movdqu %xmm5, %xmm0
+# CHECK-NEXT: 3 5.7 0.2 0.9 <total>
# CHECK-NEXT: 4. 3 7.7 0.0 0.0 movupd %xmm3, %xmm4
# CHECK-NEXT: 5. 3 8.3 0.0 0.0 movdqa %xmm4, %xmm5
# CHECK-NEXT: 6. 3 9.0 0.0 0.0 movdqu %xmm5, %xmm0
+# CHECK-NEXT: 3 7.0 0.0 0.0 <total>
# CHECK-NEXT: 2. 3 4.7 0.0 0.0 movl %ebx, %ecx
# CHECK-NEXT: 3. 3 5.3 0.0 0.0 movl %ecx, %edx
# CHECK-NEXT: 4. 3 6.0 0.0 0.0 movl %edx, %eax
+# CHECK-NEXT: 3 4.7 0.1 0.0 <total>
# CHECK-NEXT: 2. 3 4.7 0.0 0.0 movq %rbx, %rcx
# CHECK-NEXT: 3. 3 5.3 0.0 0.0 movq %rcx, %rdx
# CHECK-NEXT: 4. 3 6.0 0.0 0.0 movq %rdx, %rax
+# CHECK-NEXT: 3 4.7 0.1 0.0 <total>
# CHECK-NEXT: 3. 3 3.0 0.0 0.0 addq %rcx, %rcx
# CHECK-NEXT: 4. 3 3.3 0.0 0.0 addq %rcx, %rcx
# CHECK-NEXT: 5. 3 1.0 1.0 2.3 movl %esi, %ecx
+# CHECK-NEXT: 3 1.8 0.2 1.1 <total>
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 movb %bpl, (%rcx)
# CHECK-NEXT: 2. 1 3.0 0.0 0.0 movb %sil, (%rdx)
# CHECK-NEXT: 3. 1 4.0 0.0 0.0 movb %dil, (%rbx)
+# CHECK-NEXT: 1 2.5 0.3 0.0 <total>
# CHECK: [1] Code Region
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 movw %bp, (%rcx)
# CHECK-NEXT: 2. 1 3.0 0.0 0.0 movw %si, (%rdx)
# CHECK-NEXT: 3. 1 4.0 0.0 0.0 movw %di, (%rbx)
+# CHECK-NEXT: 1 2.5 0.3 0.0 <total>
# CHECK: [2] Code Region
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 movl %ebp, (%rcx)
# CHECK-NEXT: 2. 1 3.0 0.0 0.0 movl %esi, (%rdx)
# CHECK-NEXT: 3. 1 4.0 0.0 0.0 movl %edi, (%rbx)
+# CHECK-NEXT: 1 2.5 0.3 0.0 <total>
# CHECK: [3] Code Region
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 movq %rbp, (%rcx)
# CHECK-NEXT: 2. 1 3.0 0.0 0.0 movq %rsi, (%rdx)
# CHECK-NEXT: 3. 1 4.0 0.0 0.0 movq %rdi, (%rbx)
+# CHECK-NEXT: 1 2.5 0.3 0.0 <total>
# CHECK: [4] Code Region
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 movd %mm1, (%rcx)
# CHECK-NEXT: 2. 1 3.0 0.0 0.0 movd %mm2, (%rdx)
# CHECK-NEXT: 3. 1 4.0 0.0 0.0 movd %mm3, (%rbx)
+# CHECK-NEXT: 1 2.5 0.3 0.0 <total>
# CHECK: [5] Code Region
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 movaps %xmm1, (%rcx)
# CHECK-NEXT: 2. 1 3.0 0.0 0.0 movaps %xmm2, (%rdx)
# CHECK-NEXT: 3. 1 4.0 0.0 0.0 movaps %xmm3, (%rbx)
+# CHECK-NEXT: 1 2.5 0.3 0.0 <total>
# CHECK-NEXT: 32. 1 3.0 0.0 25.0 xorpd %xmm1, %xmm1
# CHECK-NEXT: 33. 1 28.0 0.0 0.0 pxor %mm2, %mm2
# CHECK-NEXT: 34. 1 3.0 0.0 26.0 pxor %xmm2, %xmm2
+# CHECK-NEXT: 1 6.7 0.2 10.3 <total>
# CHECK-NEXT: 0. 10 12.0 2.0 0.0 addl %eax, %ecx
# CHECK-NEXT: 1. 10 10.7 1.8 1.0 addl %esi, %eax
# CHECK-NEXT: 2. 10 12.5 1.0 0.0 addl %eax, %edx
+# CHECK-NEXT: 10 11.7 1.6 0.3 <total>
# CHECK-NEXT: 1. 2 4.0 2.0 2.5 lzcntl %ecx, %eax
# CHECK-NEXT: 2. 2 6.0 0.0 1.5 andq %rcx, %rax
# CHECK-NEXT: 3. 2 6.0 0.0 0.0 bsfq %rax, %rcx
+# CHECK-NEXT: 2 4.8 0.6 1.0 <total>
# CHECK-NEXT: 15. 2 29.5 18.5 0.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 16. 2 29.5 19.0 0.0 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 17. 2 34.5 0.0 0.0 vandps %xmm4, %xmm1, %xmm0
+# CHECK-NEXT: 2 24.0 9.6 0.2 <total>
# CHECK-NEXT: 0. 2 7.0 1.0 0.0 sqrtss %xmm0, %xmm0
# CHECK-NEXT: 1. 2 2.0 2.0 8.5 movss (%eax), %xmm0
# CHECK-NEXT: 2. 2 8.5 1.5 2.5 addps %xmm0, %xmm0
+# CHECK-NEXT: 2 5.8 1.5 3.7 <total>
# CHECK: [1] Code Region
# CHECK-NEXT: 0. 2 7.0 1.0 0.0 sqrtsd %xmm0, %xmm0
# CHECK-NEXT: 1. 2 2.0 2.0 8.5 movsd (%eax), %xmm0
# CHECK-NEXT: 2. 2 8.5 1.5 2.5 addps %xmm0, %xmm0
+# CHECK-NEXT: 2 5.8 1.5 3.7 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 1.3 1.3 1.0 cmpl %eax, %eax
# CHECK-NEXT: 1. 3 3.7 0.3 0.0 cmovael %ebx, %eax
+# CHECK-NEXT: 3 2.5 0.8 0.5 <total>
# CHECK-NEXT: 1. 3 6.0 6.0 0.0 vpcmpeqw %xmm1, %xmm1, %xmm2
# CHECK-NEXT: 2. 3 4.0 4.0 2.0 vpcmpeqd %xmm2, %xmm2, %xmm3
# CHECK-NEXT: 3. 3 6.0 0.0 0.0 vpcmpeqq %xmm3, %xmm3, %xmm0
+# CHECK-NEXT: 3 5.0 3.5 0.5 <total>
# CHECK-NEXT: 1. 3 0.0 0.0 1.3 vpcmpgtw %xmm1, %xmm1, %xmm2
# CHECK-NEXT: 2. 3 0.0 0.0 1.3 vpcmpgtd %xmm2, %xmm2, %xmm3
# CHECK-NEXT: 3. 3 1.0 1.0 0.0 vpcmpgtq %xmm3, %xmm3, %xmm0
+# CHECK-NEXT: 3 0.3 0.3 1.0 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 2.7 0.3 0.0 sbbl %edx, %edx
# CHECK-NEXT: 1. 3 3.7 0.0 0.0 sbbl %eax, %eax
+# CHECK-NEXT: 3 3.2 0.2 0.0 <total>
# CHECK-NEXT: 0. 3 5.7 2.0 0.0 imull %edx, %eax
# CHECK-NEXT: 1. 3 1.7 0.7 6.7 addl %edx, %edx
# CHECK-NEXT: 2. 3 5.0 2.7 3.0 sbbl %eax, %eax
+# CHECK-NEXT: 3 4.1 1.8 3.2 <total>
# CHECK-NEXT: 0. 10 25.0 0.1 0.0 vpmuldq %xmm0, %xmm0, %xmm1
# CHECK-NEXT: 1. 10 28.7 0.0 0.0 vpaddd %xmm1, %xmm1, %xmm0
# CHECK-NEXT: 2. 10 30.5 0.0 0.0 vpaddd %xmm0, %xmm0, %xmm3
+# CHECK-NEXT: 10 28.1 0.0 0.0 <total>
# CHECK-NEXT: 0. 3 1.0 1.0 13.7 vmulps %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 1. 3 6.0 0.7 5.7 vhaddps %xmm2, %xmm2, %xmm3
# CHECK-NEXT: 2. 3 16.0 0.0 0.0 vhaddps %xmm3, %xmm3, %xmm4
+# CHECK-NEXT: 3 7.7 0.6 6.4 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vshufps $0, %xmm0, %xmm1, %xmm1
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 vhaddps (%rdi), %xmm1, %xmm2
+# CHECK-NEXT: 1 1.0 1.0 0.0 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vshufps $0, %xmm0, %xmm1, %xmm1
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 vhaddps (%rdi), %ymm1, %ymm2
+# CHECK-NEXT: 1 1.0 1.0 0.0 <total>
# CHECK-NEXT: 0. 3 1.0 0.7 9.3 addl %eax, %eax
# CHECK-NEXT: 1. 3 14.3 0.0 0.0 vpinsrb $0, %eax, %xmm0, %xmm0
# CHECK-NEXT: 2. 3 15.7 0.0 0.0 vpinsrb $1, %eax, %xmm0, %xmm0
+# CHECK-NEXT: 3 10.3 0.2 3.1 <total>
# CHECK-NEXT: 5. 1 17.0 0.0 0.0 vmovaps %xmm0, 32(%rdi)
# CHECK-NEXT: 6. 1 18.0 0.0 0.0 vmovaps 48(%rsi), %xmm0
# CHECK-NEXT: 7. 1 23.0 0.0 0.0 vmovaps %xmm0, 48(%rdi)
+# CHECK-NEXT: 1 12.0 0.1 0.0 <total>
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movb (%rcx), %bpl
# CHECK-NEXT: 2. 1 2.0 2.0 0.0 movb (%rdx), %sil
# CHECK-NEXT: 3. 1 7.0 0.0 0.0 movb %dil, (%rbx)
+# CHECK-NEXT: 1 2.8 1.0 0.0 <total>
# CHECK: [1] Code Region
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movw (%rcx), %bp
# CHECK-NEXT: 2. 1 2.0 2.0 0.0 movw (%rdx), %si
# CHECK-NEXT: 3. 1 7.0 0.0 0.0 movw %di, (%rbx)
+# CHECK-NEXT: 1 2.8 1.0 0.0 <total>
# CHECK: [2] Code Region
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movl (%rcx), %ebp
# CHECK-NEXT: 2. 1 2.0 2.0 0.0 movl (%rdx), %esi
# CHECK-NEXT: 3. 1 7.0 0.0 0.0 movl %edi, (%rbx)
+# CHECK-NEXT: 1 2.8 1.0 0.0 <total>
# CHECK: [3] Code Region
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movq (%rcx), %rbp
# CHECK-NEXT: 2. 1 2.0 2.0 0.0 movq (%rdx), %rsi
# CHECK-NEXT: 3. 1 7.0 0.0 0.0 movq %rdi, (%rbx)
+# CHECK-NEXT: 1 2.8 1.0 0.0 <total>
# CHECK: [4] Code Region
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movd (%rcx), %mm1
# CHECK-NEXT: 2. 1 2.0 2.0 0.0 movd (%rdx), %mm2
# CHECK-NEXT: 3. 1 7.0 0.0 0.0 movd %mm3, (%rbx)
+# CHECK-NEXT: 1 2.8 1.0 0.0 <total>
# CHECK: [5] Code Region
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movaps (%rcx), %xmm1
# CHECK-NEXT: 2. 1 2.0 2.0 0.0 movaps (%rdx), %xmm2
# CHECK-NEXT: 3. 1 7.0 0.0 0.0 movaps %xmm3, (%rbx)
+# CHECK-NEXT: 1 2.8 1.0 0.0 <total>
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movb (%rcx), %bpl
# CHECK-NEXT: 2. 1 3.0 3.0 0.0 movb (%rdx), %sil
# CHECK-NEXT: 3. 1 3.0 3.0 0.0 movb (%rbx), %dil
+# CHECK-NEXT: 1 2.0 2.0 0.0 <total>
# CHECK: [1] Code Region
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movw (%rcx), %bp
# CHECK-NEXT: 2. 1 3.0 3.0 0.0 movw (%rdx), %si
# CHECK-NEXT: 3. 1 3.0 3.0 0.0 movw (%rbx), %di
+# CHECK-NEXT: 1 2.0 2.0 0.0 <total>
# CHECK: [2] Code Region
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movl (%rcx), %ebp
# CHECK-NEXT: 2. 1 3.0 3.0 0.0 movl (%rdx), %esi
# CHECK-NEXT: 3. 1 3.0 3.0 0.0 movl (%rbx), %edi
+# CHECK-NEXT: 1 2.0 2.0 0.0 <total>
# CHECK: [3] Code Region
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movq (%rcx), %rbp
# CHECK-NEXT: 2. 1 3.0 3.0 0.0 movq (%rdx), %rsi
# CHECK-NEXT: 3. 1 3.0 3.0 0.0 movq (%rbx), %rdi
+# CHECK-NEXT: 1 2.0 2.0 0.0 <total>
# CHECK: [4] Code Region
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movd (%rcx), %mm1
# CHECK-NEXT: 2. 1 4.0 4.0 0.0 movd (%rdx), %mm2
# CHECK-NEXT: 3. 1 4.0 4.0 0.0 movd (%rbx), %mm3
+# CHECK-NEXT: 1 2.5 2.5 0.0 <total>
# CHECK: [5] Code Region
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 movaps (%rcx), %xmm1
# CHECK-NEXT: 2. 1 4.0 4.0 0.0 movaps (%rdx), %xmm2
# CHECK-NEXT: 3. 1 4.0 4.0 0.0 movaps (%rbx), %xmm3
+# CHECK-NEXT: 1 2.5 2.5 0.0 <total>
# CHECK: [6] Code Region
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 vmovaps (%rcx), %ymm1
# CHECK-NEXT: 2. 1 3.0 3.0 0.0 vmovaps (%rdx), %ymm2
# CHECK-NEXT: 3. 1 3.0 3.0 0.0 vmovaps (%rbx), %ymm3
+# CHECK-NEXT: 1 2.0 2.0 0.0 <total>
# CHECK-NEXT: 5. 1 9.0 1.0 0.0 vmovaps %xmm0, 32(%rdi)
# CHECK-NEXT: 6. 1 3.0 3.0 2.0 vmovaps 48(%rsi), %xmm0
# CHECK-NEXT: 7. 1 10.0 0.0 0.0 vmovaps %xmm0, 48(%rdi)
+# CHECK-NEXT: 1 5.3 1.3 0.5 <total>
# CHECK-NEXT: 12. 1 8.0 8.0 0.0 vpcmpeqd %xmm3, %xmm3, %xmm5
# CHECK-NEXT: 13. 1 9.0 2.0 0.0 vpcmpeqq %xmm3, %xmm3, %xmm5
# CHECK-NEXT: 14. 1 10.0 10.0 0.0 vpcmpeqw %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 1 4.9 3.8 0.2 <total>
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 imulq %rax, %rbx
# CHECK-NEXT: 1. 1 6.0 0.0 0.0 lzcntw %ax, %bx
# CHECK-NEXT: 2. 1 8.0 0.0 0.0 addl %ecx, %ebx
+# CHECK-NEXT: 1 5.0 0.3 0.0 <total>
# CHECK-NEXT: 0. 3 3.7 0.3 0.0 addw %cx, %dx
# CHECK-NEXT: 1. 3 4.3 0.0 0.0 movw %ax, %dx
# CHECK-NEXT: 2. 3 5.0 0.0 0.0 xorw %bx, %dx
+# CHECK-NEXT: 3 4.3 0.1 0.0 <total>
# CHECK-NEXT: 0. 3 6.7 0.7 0.0 imulw %ax, %bx
# CHECK-NEXT: 1. 3 9.7 0.0 0.0 lzcntw %ax, %bx
# CHECK-NEXT: 2. 3 11.7 0.0 0.0 addw %cx, %bx
+# CHECK-NEXT: 3 9.3 0.2 0.0 <total>
# CHECK-NEXT: 0. 3 7.7 0.3 0.0 imull %edx, %ecx
# CHECK-NEXT: 1. 3 7.3 0.0 0.0 lzcntw (%rsp), %cx
# CHECK-NEXT: 2. 3 8.7 1.0 0.0 lzcntw 2(%rsp), %cx
+# CHECK-NEXT: 3 7.9 0.4 0.0 <total>
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 imulw %ax, %cx
# CHECK-NEXT: 1. 1 5.0 0.0 0.0 addb %al, %cl
# CHECK-NEXT: 2. 1 6.0 0.0 0.0 addl %ecx, %ebx
+# CHECK-NEXT: 1 4.0 0.3 0.0 <total>
# CHECK-NEXT: 5. 2 3.5 3.5 12.0 vsqrtps %xmm0, %xmm2
# CHECK-NEXT: 6. 2 19.5 19.5 0.0 vaddps %ymm0, %ymm1, %ymm2
# CHECK-NEXT: 7. 2 7.5 7.5 8.0 vsqrtps %ymm0, %ymm2
+# CHECK-NEXT: 2 7.9 7.9 6.1 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 51.5 0.5 0.0 int3
# CHECK-NEXT: 1. 2 151.0 0.0 0.0 stmxcsr (%rsp)
+# CHECK-NEXT: 2 101.3 0.3 0.0 <total>
# CHECK-NEXT: 5. 3 10.7 1.0 0.0 addl %edx, %esi
# CHECK-NEXT: 6. 3 12.0 1.0 0.0 addl %ebx, %eax
# CHECK-NEXT: 7. 3 13.0 0.0 0.0 addl %ebx, %eax
+# CHECK-NEXT: 3 9.9 1.1 0.3 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vaddps %xmm0, %xmm0, %xmm1
# CHECK-NEXT: 1. 1 1.0 0.0 0.0 vmulps (%rdi), %xmm1, %xmm2
+# CHECK-NEXT: 1 1.0 0.5 0.0 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 imull %esi
# CHECK-NEXT: 1. 1 5.0 4.0 0.0 imull (%rdi)
+# CHECK-NEXT: 1 3.0 2.5 0.0 <total>
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 addq %rdi, %rsi
# CHECK-NEXT: 1. 1 1.0 0.0 0.0 addq (%rsp), %rsi
# CHECK-NEXT: 2. 1 3.0 3.0 2.0 addq %rdx, %r8
+# CHECK-NEXT: 1 1.7 1.3 0.7 <total>
# CHECK-NEXT: 0. 3 0.0 0.0 5.3 vxorps %xmm0, %xmm0, %xmm0
# CHECK-NEXT: 1. 3 1.7 1.7 3.0 vmovaps %xmm0, %xmm1
# CHECK-NEXT: 2. 3 3.3 1.0 0.0 vaddps %xmm1, %xmm1, %xmm2
+# CHECK-NEXT: 3 1.7 0.9 2.8 <total>
# CHECK-NEXT: 6. 3 4.7 0.0 0.0 movupd %xmm3, %xmm4
# CHECK-NEXT: 7. 3 5.3 0.0 0.0 movdqa %xmm4, %xmm5
# CHECK-NEXT: 8. 3 6.0 0.0 0.0 movdqu %xmm5, %xmm0
+# CHECK-NEXT: 3 3.0 0.4 1.4 <total>
# CHECK-NEXT: 4. 3 5.7 0.0 0.0 vmovupd %xmm3, %xmm4
# CHECK-NEXT: 5. 3 6.3 0.0 0.0 vmovdqa %xmm4, %xmm5
# CHECK-NEXT: 6. 3 7.0 0.0 0.0 vmovdqu %xmm5, %xmm0
+# CHECK-NEXT: 3 4.5 0.5 0.8 <total>
# CHECK-NEXT: 2. 3 4.7 0.0 0.0 movl %ebx, %ecx
# CHECK-NEXT: 3. 3 5.3 0.0 0.0 movl %ecx, %edx
# CHECK-NEXT: 4. 3 6.0 0.0 0.0 movl %edx, %eax
+# CHECK-NEXT: 3 4.0 0.8 0.7 <total>
# CHECK-NEXT: 2. 3 4.7 0.0 0.0 movq %rbx, %rcx
# CHECK-NEXT: 3. 3 5.3 0.0 0.0 movq %rcx, %rdx
# CHECK-NEXT: 4. 3 6.0 0.0 0.0 movq %rdx, %rax
+# CHECK-NEXT: 3 4.0 0.8 0.7 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 5 20.2 0.2 0.0 vaddps %xmm0, %xmm0, %xmm0
# CHECK-NEXT: 1. 5 25.2 0.0 0.0 vmulps %xmm0, %xmm0, %xmm0
+# CHECK-NEXT: 5 22.7 0.1 0.0 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 5 14.0 0.2 0.0 vaddps %xmm0, %xmm0, %xmm0
# CHECK-NEXT: 1. 5 15.8 0.0 0.0 vmulps %xmm0, %xmm0, %xmm0
+# CHECK-NEXT: 5 14.9 0.1 0.0 <total>
# CHECK-NEXT: 30. 1 24.0 11.0 0.0 vaddps %ymm3, %ymm0, %ymm4
# CHECK-NEXT: 31. 1 25.0 12.0 0.0 vaddps %ymm3, %ymm0, %ymm5
# CHECK-NEXT: 32. 1 25.0 13.0 0.0 vaddps %ymm3, %ymm0, %ymm6
+# CHECK-NEXT: 1 15.6 11.2 0.6 <total>
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 movb %bpl, (%rcx)
# CHECK-NEXT: 2. 1 3.0 0.0 0.0 movb %sil, (%rdx)
# CHECK-NEXT: 3. 1 4.0 0.0 0.0 movb %dil, (%rbx)
+# CHECK-NEXT: 1 2.5 0.3 0.0 <total>
# CHECK: [1] Code Region
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 movw %bp, (%rcx)
# CHECK-NEXT: 2. 1 3.0 0.0 0.0 movw %si, (%rdx)
# CHECK-NEXT: 3. 1 4.0 0.0 0.0 movw %di, (%rbx)
+# CHECK-NEXT: 1 2.5 0.3 0.0 <total>
# CHECK: [2] Code Region
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 movl %ebp, (%rcx)
# CHECK-NEXT: 2. 1 3.0 0.0 0.0 movl %esi, (%rdx)
# CHECK-NEXT: 3. 1 4.0 0.0 0.0 movl %edi, (%rbx)
+# CHECK-NEXT: 1 2.5 0.3 0.0 <total>
# CHECK: [3] Code Region
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 movq %rbp, (%rcx)
# CHECK-NEXT: 2. 1 3.0 0.0 0.0 movq %rsi, (%rdx)
# CHECK-NEXT: 3. 1 4.0 0.0 0.0 movq %rdi, (%rbx)
+# CHECK-NEXT: 1 2.5 0.3 0.0 <total>
# CHECK: [4] Code Region
# CHECK-NEXT: 1. 1 3.0 0.0 0.0 movd %mm1, (%rcx)
# CHECK-NEXT: 2. 1 5.0 0.0 0.0 movd %mm2, (%rdx)
# CHECK-NEXT: 3. 1 7.0 0.0 0.0 movd %mm3, (%rbx)
+# CHECK-NEXT: 1 4.0 0.3 0.0 <total>
# CHECK: [5] Code Region
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 movaps %xmm1, (%rcx)
# CHECK-NEXT: 2. 1 4.0 1.0 0.0 movaps %xmm2, (%rdx)
# CHECK-NEXT: 3. 1 5.0 0.0 0.0 movaps %xmm3, (%rbx)
+# CHECK-NEXT: 1 3.0 0.5 0.0 <total>
# CHECK: [6] Code Region
# CHECK-NEXT: 1. 1 2.0 1.0 0.0 vmovaps %ymm1, (%rcx)
# CHECK-NEXT: 2. 1 35.0 33.0 0.0 vmovaps %ymm2, (%rdx)
# CHECK-NEXT: 3. 1 36.0 1.0 0.0 vmovaps %ymm3, (%rbx)
+# CHECK-NEXT: 1 18.5 9.0 0.0 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 1.0 1.0 2.7 leaq 8(%rsp,%rdi,2), %rax
# CHECK-NEXT: 1. 3 1.7 0.7 0.0 vbroadcastss (%rax), %ymm0
+# CHECK-NEXT: 3 1.3 0.8 1.3 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vaddps %xmm0, %xmm0, %xmm1
# CHECK-NEXT: 1. 1 1.0 0.0 0.0 vandps (%rdi), %xmm1, %xmm2
+# CHECK-NEXT: 1 1.0 0.5 0.0 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vaddps %ymm0, %ymm0, %ymm1
# CHECK-NEXT: 1. 1 1.0 0.0 0.0 vandps (%rdi), %ymm1, %ymm2
+# CHECK-NEXT: 1 1.0 0.5 0.0 <total>
# CHECK-NEXT: 3. 2 16.0 0.0 6.0 vaddps %ymm4, %ymm5, %ymm6
# CHECK-NEXT: 4. 2 20.0 0.0 4.0 vmulps %ymm6, %ymm3, %ymm4
# CHECK-NEXT: 5. 2 25.0 0.0 1.5 vaddps %ymm4, %ymm5, %ymm0
+# CHECK-NEXT: 2 14.8 0.4 5.3 <total>
# CHECK-NEXT: 3. 2 8.0 0.0 6.0 vaddps %ymm4, %ymm5, %ymm6
# CHECK-NEXT: 4. 2 12.0 0.0 4.0 vmulps %ymm6, %ymm3, %ymm4
# CHECK-NEXT: 5. 2 17.0 0.0 1.5 vaddps %ymm4, %ymm5, %ymm0
+# CHECK-NEXT: 2 8.7 0.3 5.3 <total>
# CHECK-NEXT: 0. 3 2.0 2.0 0.0 vaddps %ymm0, %ymm0, %ymm1
# CHECK-NEXT: 1. 3 3.0 3.0 1.7 vxorps %ymm1, %ymm1, %ymm1
# CHECK-NEXT: 2. 3 4.3 0.0 0.3 vblendps $2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 3 3.1 1.7 0.7 <total>
# CHECK: [1] Code Region - ZERO-IDIOM-2
# CHECK-NEXT: 0. 3 2.0 2.0 0.0 vaddpd %ymm0, %ymm0, %ymm1
# CHECK-NEXT: 1. 3 3.0 3.0 1.7 vxorpd %ymm1, %ymm1, %ymm1
# CHECK-NEXT: 2. 3 4.3 0.0 0.3 vblendpd $2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 3 3.1 1.7 0.7 <total>
# CHECK: [2] Code Region - ZERO-IDIOM-3
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 2.0 2.0 0.0 vaddps %ymm0, %ymm1, %ymm2
# CHECK-NEXT: 1. 3 2.0 2.0 3.0 vandnps %ymm2, %ymm2, %ymm3
+# CHECK-NEXT: 3 2.0 2.0 1.5 <total>
# CHECK: [3] Code Region - ZERO-IDIOM-4
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 2.0 2.0 0.0 vaddps %ymm0, %ymm1, %ymm2
# CHECK-NEXT: 1. 3 2.0 2.0 3.0 vandnps %ymm2, %ymm2, %ymm3
+# CHECK-NEXT: 3 2.0 2.0 1.5 <total>
# CHECK: [4] Code Region - ZERO-IDIOM-5
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 7.0 0.3 0.0 vperm2f128 $136, %ymm0, %ymm0, %ymm1
# CHECK-NEXT: 1. 3 9.0 0.0 0.0 vaddps %ymm1, %ymm1, %ymm0
+# CHECK-NEXT: 3 8.0 0.2 0.0 <total>
# CHECK-NEXT: 68. 1 0.0 0.0 11.0 vxorps %xmm4, %xmm4, %xmm5
# CHECK-NEXT: 69. 1 0.0 0.0 11.0 vxorpd %xmm1, %xmm1, %xmm3
# CHECK-NEXT: 70. 1 0.0 0.0 12.0 vpxor %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 1 1.2 1.2 4.1 <total>
# CHECK-NEXT: 72. 1 0.0 0.0 3.0 vxorpd %ymm1, %ymm1, %ymm3
# CHECK-NEXT: 73. 1 0.0 0.0 3.0 vpxor %xmm3, %xmm3, %xmm5
# CHECK-NEXT: 74. 1 0.0 0.0 3.0 vpxor %ymm3, %ymm3, %ymm5
+# CHECK-NEXT: 1 0.9 0.2 1.8 <total>
# CHECK-NEXT: 0. 10 2.5 0.4 0.0 addl %eax, %ecx
# CHECK-NEXT: 1. 10 2.1 0.7 0.5 addl %esi, %eax
# CHECK-NEXT: 2. 10 2.6 0.0 0.3 addl %eax, %edx
+# CHECK-NEXT: 10 2.4 0.4 0.3 <total>
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 addl %ebx, %ecx
# CHECK-NEXT: 2. 1 2.0 0.0 0.0 addl %ecx, %edx
# CHECK-NEXT: 3. 1 3.0 0.0 0.0 addl %edx, %eax
+# CHECK-NEXT: 1 2.0 0.3 0.0 <total>
# CHECK-NEXT: 5. 1 16.0 0.0 0.0 vmovaps %xmm0, 32(%rdi)
# CHECK-NEXT: 6. 1 16.0 0.0 0.0 vmovaps 48(%rsi), %xmm0
# CHECK-NEXT: 7. 1 21.0 0.0 0.0 vmovaps %xmm0, 48(%rdi)
+# CHECK-NEXT: 1 11.0 0.1 0.0 <total>
# CHECK-NEXT: 1. 2 1.5 1.0 4.5 lzcntl %ecx, %eax
# CHECK-NEXT: 2. 2 2.0 0.0 4.5 andq %rcx, %rax
# CHECK-NEXT: 3. 2 2.0 0.0 0.5 bsfq %rax, %rcx
+# CHECK-NEXT: 2 1.8 0.4 2.4 <total>
# CHECK-NEXT: 15. 2 21.0 21.0 13.5 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 16. 2 22.0 22.0 12.5 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 17. 2 24.0 0.0 11.5 vandps %xmm4, %xmm1, %xmm0
+# CHECK-NEXT: 2 17.5 9.9 21.6 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 imulq %rax, %rax
# CHECK-NEXT: 1. 1 3.0 0.0 0.0 cmpxchgq %rcx, (%rdx)
+# CHECK-NEXT: 1 2.0 0.5 0.0 <total>
# CHECK: [1] Code Region
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 imulq %rcx, %rcx
# CHECK-NEXT: 1. 1 3.0 0.0 0.0 cmpxchgq %rcx, (%rdx)
+# CHECK-NEXT: 1 2.0 0.5 0.0 <total>
# CHECK: [2] Code Region
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 imulq %rax, %rax
# CHECK-NEXT: 1. 1 3.0 0.0 0.0 lock cmpxchgq %rcx, (%rdx)
+# CHECK-NEXT: 1 2.0 0.5 0.0 <total>
# CHECK: [3] Code Region
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 imulq %rcx, %rcx
# CHECK-NEXT: 1. 1 3.0 0.0 0.0 lock cmpxchgq %rcx, (%rdx)
+# CHECK-NEXT: 1 2.0 0.5 0.0 <total>
# CHECK: [4] Code Region
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 imull %eax, %eax
# CHECK-NEXT: 1. 1 2.0 2.0 0.0 imull %edx, %edx
# CHECK-NEXT: 2. 1 1.0 0.0 0.0 cmpxchg8b (%rsp)
+# CHECK-NEXT: 1 1.3 1.0 0.0 <total>
# CHECK: [5] Code Region
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 imull %eax, %eax
# CHECK-NEXT: 1. 1 2.0 2.0 0.0 imull %edx, %edx
# CHECK-NEXT: 2. 1 1.0 0.0 0.0 cmpxchg16b (%rsp)
+# CHECK-NEXT: 1 1.3 1.0 0.0 <total>
# CHECK: [6] Code Region
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 imull %ebx, %ebx
# CHECK-NEXT: 1. 1 2.0 2.0 0.0 imull %ecx, %ecx
# CHECK-NEXT: 2. 1 1.0 0.0 0.0 lock cmpxchg8b (%rsp)
+# CHECK-NEXT: 1 1.3 1.0 0.0 <total>
# CHECK: [7] Code Region
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 imull %ebx, %ebx
# CHECK-NEXT: 1. 1 2.0 2.0 0.0 imull %ecx, %ecx
# CHECK-NEXT: 2. 1 1.0 0.0 0.0 lock cmpxchg16b (%rsp)
+# CHECK-NEXT: 1 1.3 1.0 0.0 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 1.0 1.0 0.0 cmpl %eax, %eax
# CHECK-NEXT: 1. 3 2.0 0.0 0.0 cmovael %ebx, %eax
+# CHECK-NEXT: 3 1.5 0.5 0.0 <total>
# CHECK-NEXT: 1. 3 1.0 1.0 0.0 vpcmpeqw %xmm1, %xmm1, %xmm2
# CHECK-NEXT: 2. 3 1.0 1.0 0.0 vpcmpeqd %xmm2, %xmm2, %xmm3
# CHECK-NEXT: 3. 3 1.0 1.0 0.0 vpcmpeqq %xmm3, %xmm3, %xmm0
+# CHECK-NEXT: 3 1.0 1.0 0.0 <total>
# CHECK-NEXT: 1. 3 0.0 0.0 0.0 vpcmpgtw %xmm1, %xmm1, %xmm2
# CHECK-NEXT: 2. 3 0.0 0.0 0.0 vpcmpgtd %xmm2, %xmm2, %xmm3
# CHECK-NEXT: 3. 3 0.0 0.0 0.0 vpcmpgtq %xmm3, %xmm3, %xmm0
+# CHECK-NEXT: 3 0.0 0.0 0.0 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 2.0 0.3 0.0 sbbl %edx, %edx
# CHECK-NEXT: 1. 3 3.0 0.0 0.0 sbbl %eax, %eax
+# CHECK-NEXT: 3 2.5 0.2 0.0 <total>
# CHECK-NEXT: 0. 3 2.3 1.0 0.0 imull %edx, %eax
# CHECK-NEXT: 1. 3 1.3 1.0 2.7 addl %edx, %edx
# CHECK-NEXT: 2. 3 1.7 0.0 2.7 sbbl %eax, %eax
+# CHECK-NEXT: 3 1.8 0.7 1.8 <total>
# CHECK-NEXT: 0. 10 8.0 0.1 0.0 vpmuldq %xmm0, %xmm0, %xmm1
# CHECK-NEXT: 1. 10 9.5 0.0 0.0 vpaddd %xmm1, %xmm1, %xmm0
# CHECK-NEXT: 2. 10 10.0 0.0 0.0 vpaddd %xmm0, %xmm0, %xmm3
+# CHECK-NEXT: 10 9.2 0.0 0.0 <total>
# CHECK-NEXT: 0. 3 1.0 1.0 4.7 vmulps %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 1. 3 2.7 0.0 2.3 vhaddps %xmm2, %xmm2, %xmm3
# CHECK-NEXT: 2. 3 6.0 0.0 0.0 vhaddps %xmm3, %xmm3, %xmm4
+# CHECK-NEXT: 3 3.2 0.3 2.3 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vshufps $0, %xmm0, %xmm1, %xmm1
# CHECK-NEXT: 1. 1 1.0 0.0 0.0 vhaddps (%rdi), %xmm1, %xmm2
+# CHECK-NEXT: 1 1.0 0.5 0.0 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vshufps $0, %xmm0, %xmm1, %xmm1
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 vhaddps (%rdi), %ymm1, %ymm2
+# CHECK-NEXT: 1 1.0 1.0 0.0 <total>
# CHECK-NEXT: 0. 3 1.0 1.0 3.3 addl %eax, %eax
# CHECK-NEXT: 1. 3 7.0 0.0 0.0 vpinsrb $0, %eax, %xmm0, %xmm0
# CHECK-NEXT: 2. 3 7.0 0.0 0.0 vpinsrb $1, %eax, %xmm0, %xmm0
+# CHECK-NEXT: 3 5.0 0.3 1.1 <total>
# CHECK-NEXT: 5. 1 16.0 0.0 0.0 vmovaps %xmm0, 32(%rdi)
# CHECK-NEXT: 6. 1 16.0 0.0 0.0 vmovaps 48(%rsi), %xmm0
# CHECK-NEXT: 7. 1 21.0 0.0 0.0 vmovaps %xmm0, 48(%rdi)
+# CHECK-NEXT: 1 11.0 0.1 0.0 <total>
# CHECK-NEXT: 5. 1 6.0 0.0 0.0 vmovaps %xmm0, 32(%rdi)
# CHECK-NEXT: 6. 1 1.0 1.0 0.0 vmovaps 48(%rsi), %xmm0
# CHECK-NEXT: 7. 1 6.0 0.0 0.0 vmovaps %xmm0, 48(%rdi)
+# CHECK-NEXT: 1 3.5 0.5 0.0 <total>
# CHECK-NEXT: 12. 1 1.0 1.0 0.0 vpcmpeqd %xmm3, %xmm3, %xmm5
# CHECK-NEXT: 13. 1 1.0 1.0 0.0 vpcmpeqq %xmm3, %xmm3, %xmm5
# CHECK-NEXT: 14. 1 1.0 1.0 0.0 vpcmpeqw %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 1 1.0 1.0 0.0 <total>
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 imulq %rax, %rbx
# CHECK-NEXT: 1. 1 7.0 0.0 0.0 lzcntw %ax, %bx
# CHECK-NEXT: 2. 1 7.0 0.0 0.0 addl %ecx, %ebx
+# CHECK-NEXT: 1 5.0 0.3 0.0 <total>
# CHECK-NEXT: 0. 3 2.7 0.3 0.0 addw %cx, %dx
# CHECK-NEXT: 1. 3 3.3 0.0 0.0 movw %ax, %dx
# CHECK-NEXT: 2. 3 3.7 0.0 0.0 xorw %bx, %dx
+# CHECK-NEXT: 3 3.2 0.1 0.0 <total>
# CHECK-NEXT: 0. 3 4.7 0.3 0.0 imulw %ax, %bx
# CHECK-NEXT: 1. 3 7.3 0.0 0.0 lzcntw %ax, %bx
# CHECK-NEXT: 2. 3 7.7 0.0 0.0 addw %cx, %bx
+# CHECK-NEXT: 3 6.6 0.1 0.0 <total>
# CHECK-NEXT: 0. 3 4.7 0.3 0.0 imull %edx, %ecx
# CHECK-NEXT: 1. 3 4.3 0.0 0.0 lzcntw (%rsp), %cx
# CHECK-NEXT: 2. 3 4.7 0.0 0.0 lzcntw 2(%rsp), %cx
+# CHECK-NEXT: 3 4.6 0.1 0.0 <total>
# CHECK-NEXT: 2. 5 6.2 0.0 0.8 shll $2, %eax
# CHECK-NEXT: 3. 5 6.8 0.0 0.0 imull %ecx, %eax
# CHECK-NEXT: 4. 5 9.2 0.0 0.0 cmpl $1025, %eax
+# CHECK-NEXT: 5 7.0 0.1 0.3 <total>
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 imulw %ax, %cx
# CHECK-NEXT: 1. 1 4.0 0.0 0.0 addb %al, %cl
# CHECK-NEXT: 2. 1 4.0 0.0 0.0 addl %ecx, %ebx
+# CHECK-NEXT: 1 3.0 0.3 0.0 <total>
# CHECK-NEXT: 5. 2 29.5 29.5 0.0 vsqrtps %xmm0, %xmm2
# CHECK-NEXT: 6. 2 1.0 1.0 45.5 vaddps %ymm0, %ymm1, %ymm2
# CHECK-NEXT: 7. 2 48.5 48.5 0.0 vsqrtps %ymm0, %ymm2
+# CHECK-NEXT: 2 10.5 10.5 23.7 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 51.0 0.5 0.0 int3
# CHECK-NEXT: 1. 2 151.0 0.0 0.0 stmxcsr (%rsp)
+# CHECK-NEXT: 2 101.0 0.3 0.0 <total>
# CHECK-NEXT: 5. 3 2.0 0.0 0.0 addl %edx, %esi
# CHECK-NEXT: 6. 3 2.0 0.0 0.0 addl %ebx, %eax
# CHECK-NEXT: 7. 3 3.0 0.0 0.0 addl %ebx, %eax
+# CHECK-NEXT: 3 2.1 0.2 0.0 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vaddps %xmm0, %xmm0, %xmm1
# CHECK-NEXT: 1. 1 1.0 0.0 0.0 vmulps (%rdi), %xmm1, %xmm2
+# CHECK-NEXT: 1 1.0 0.5 0.0 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 imull %esi
# CHECK-NEXT: 1. 1 2.0 2.0 0.0 imull (%rdi)
+# CHECK-NEXT: 1 1.5 1.5 0.0 <total>
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 addq %rdi, %rsi
# CHECK-NEXT: 1. 1 1.0 0.0 0.0 addq (%rsp), %rsi
# CHECK-NEXT: 2. 1 2.0 2.0 2.0 addq %rdx, %r8
+# CHECK-NEXT: 1 1.3 1.0 0.7 <total>
# CHECK-NEXT: 0. 3 0.0 0.0 2.7 vxorps %xmm0, %xmm0, %xmm0
# CHECK-NEXT: 1. 3 0.0 0.0 2.7 vmovaps %xmm0, %xmm1
# CHECK-NEXT: 2. 3 1.0 1.0 0.0 vaddps %xmm1, %xmm1, %xmm2
+# CHECK-NEXT: 3 0.3 0.3 1.8 <total>
# CHECK-NEXT: 6. 3 0.0 0.0 0.0 movupd %xmm3, %xmm4
# CHECK-NEXT: 7. 3 0.0 0.0 0.0 movdqa %xmm4, %xmm5
# CHECK-NEXT: 8. 3 0.0 0.0 0.0 movdqu %xmm5, %xmm0
+# CHECK-NEXT: 3 0.0 0.0 0.0 <total>
# CHECK-NEXT: 4. 3 0.0 0.0 0.0 vmovupd %xmm3, %xmm4
# CHECK-NEXT: 5. 3 0.0 0.0 0.0 vmovdqa %xmm4, %xmm5
# CHECK-NEXT: 6. 3 0.0 0.0 0.0 vmovdqu %xmm5, %xmm0
+# CHECK-NEXT: 3 0.0 0.0 0.0 <total>
# CHECK-NEXT: 2. 3 0.0 0.0 0.0 movl %ebx, %ecx
# CHECK-NEXT: 3. 3 0.0 0.0 0.0 movl %ecx, %edx
# CHECK-NEXT: 4. 3 0.0 0.0 0.0 movl %edx, %eax
+# CHECK-NEXT: 3 0.0 0.0 0.0 <total>
# CHECK-NEXT: 2. 3 0.0 0.0 0.0 movq %rbx, %rcx
# CHECK-NEXT: 3. 3 0.0 0.0 0.0 movq %rcx, %rdx
# CHECK-NEXT: 4. 3 0.0 0.0 0.0 movq %rdx, %rax
+# CHECK-NEXT: 3 0.0 0.0 0.0 <total>
# CHECK-NEXT: 3. 3 2.0 0.0 0.0 addq %rcx, %rcx
# CHECK-NEXT: 4. 3 2.0 0.0 0.0 addq %rcx, %rcx
# CHECK-NEXT: 5. 3 0.0 0.0 3.0 movl %esi, %ecx
+# CHECK-NEXT: 3 1.0 0.2 1.1 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 5 9.0 0.2 0.0 vaddps %xmm0, %xmm0, %xmm0
# CHECK-NEXT: 1. 5 12.0 0.0 0.0 vmulps %xmm0, %xmm0, %xmm0
+# CHECK-NEXT: 5 10.5 0.1 0.0 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 5 6.6 0.2 0.0 vaddps %xmm0, %xmm0, %xmm0
# CHECK-NEXT: 1. 5 7.8 0.0 0.0 vmulps %xmm0, %xmm0, %xmm0
+# CHECK-NEXT: 5 7.2 0.1 0.0 <total>
# CHECK-NEXT: 30. 1 30.0 25.0 0.0 vaddps %ymm3, %ymm0, %ymm4
# CHECK-NEXT: 31. 1 31.0 27.0 0.0 vaddps %ymm3, %ymm0, %ymm5
# CHECK-NEXT: 32. 1 24.0 24.0 0.0 vaddps %ymm3, %ymm0, %ymm6
+# CHECK-NEXT: 1 15.8 14.0 12.4 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 1.0 1.0 2.7 leaq 8(%rsp,%rdi,2), %rax
# CHECK-NEXT: 1. 3 2.0 0.0 0.0 vbroadcastss (%rax), %ymm0
+# CHECK-NEXT: 3 1.5 0.5 1.3 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vaddps %xmm0, %xmm0, %xmm1
# CHECK-NEXT: 1. 1 1.0 0.0 0.0 vandps (%rdi), %xmm1, %xmm2
+# CHECK-NEXT: 1 1.0 0.5 0.0 <total>
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vaddps %ymm0, %ymm0, %ymm1
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 vandps (%rdi), %ymm1, %ymm2
+# CHECK-NEXT: 1 1.0 1.0 0.0 <total>
# CHECK-NEXT: 2. 2 6.5 0.0 7.0 addl %ecx, %ecx
# CHECK-NEXT: 3. 2 6.5 0.0 4.0 imull %ecx, %ecx
# CHECK-NEXT: 4. 2 9.5 0.0 2.0 imull %ecx, %ecx
+# CHECK-NEXT: 2 6.5 0.1 4.0 <total>
# CHECK: [1] Code Region
# CHECK-NEXT: 2. 2 17.0 0.0 4.0 addl %ecx, %ecx
# CHECK-NEXT: 3. 2 17.0 0.0 1.0 imull %ecx, %ecx
# CHECK-NEXT: 4. 2 20.0 0.0 0.0 imull %ecx, %ecx
+# CHECK-NEXT: 2 15.4 0.1 1.8 <total>
# CHECK-NEXT: 2. 2 17.0 0.0 4.0 addl %ecx, %ecx
# CHECK-NEXT: 3. 2 18.0 0.0 1.0 imull %ecx, %ecx
# CHECK-NEXT: 4. 2 20.0 0.0 0.0 imull %ecx, %ecx
+# CHECK-NEXT: 2 15.8 0.1 1.8 <total>
# CHECK-NEXT: 0. 3 1.3 1.3 0.0 vaddps %ymm0, %ymm0, %ymm1
# CHECK-NEXT: 1. 3 1.0 1.0 1.3 vxorps %ymm1, %ymm1, %ymm1
# CHECK-NEXT: 2. 3 1.0 0.0 1.3 vblendps $2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 3 1.1 0.8 0.9 <total>
# CHECK: [1] Code Region - ZERO-IDIOM-2
# CHECK-NEXT: 0. 3 1.3 1.3 0.0 vaddpd %ymm0, %ymm0, %ymm1
# CHECK-NEXT: 1. 3 1.0 1.0 1.3 vxorpd %ymm1, %ymm1, %ymm1
# CHECK-NEXT: 2. 3 1.0 0.0 1.3 vblendpd $2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 3 1.1 0.8 0.9 <total>
# CHECK: [2] Code Region - ZERO-IDIOM-3
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 1.0 1.0 0.0 vaddps %ymm0, %ymm1, %ymm2
# CHECK-NEXT: 1. 3 1.0 1.0 1.0 vandnps %ymm2, %ymm2, %ymm3
+# CHECK-NEXT: 3 1.0 1.0 0.5 <total>
# CHECK: [3] Code Region - ZERO-IDIOM-4
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 1.0 1.0 0.0 vaddps %ymm0, %ymm1, %ymm2
# CHECK-NEXT: 1. 3 1.0 1.0 1.0 vandnps %ymm2, %ymm2, %ymm3
+# CHECK-NEXT: 3 1.0 1.0 0.5 <total>
# CHECK: [4] Code Region - ZERO-IDIOM-5
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 1.0 1.0 0.7 vperm2f128 $136, %ymm0, %ymm0, %ymm1
# CHECK-NEXT: 1. 3 1.0 0.0 0.0 vaddps %ymm1, %ymm1, %ymm0
+# CHECK-NEXT: 3 1.0 0.5 0.3 <total>
# CHECK-NEXT: 68. 1 0.0 0.0 0.0 vxorps %xmm4, %xmm4, %xmm5
# CHECK-NEXT: 69. 1 0.0 0.0 0.0 vxorpd %xmm1, %xmm1, %xmm3
# CHECK-NEXT: 70. 1 0.0 0.0 0.0 vpxor %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 1 0.0 0.0 0.0 <total>
# CHECK-NEXT: 3. 2 8.5 0.0 6.0 vaddps %xmm4, %xmm5, %xmm6
# CHECK-NEXT: 4. 2 11.0 0.0 3.5 vmulps %xmm6, %xmm3, %xmm4
# CHECK-NEXT: 5. 2 16.0 0.0 2.0 vaddps %xmm4, %xmm5, %xmm0
+# CHECK-NEXT: 2 8.3 0.3 5.0 <total>
# CHECK-NEXT: 3. 2 8.5 0.0 6.0 vaddps %xmm4, %xmm5, %xmm6
# CHECK-NEXT: 4. 2 11.0 0.0 3.5 vmulps %xmm6, %xmm3, %xmm4
# CHECK-NEXT: 5. 2 16.0 0.0 2.0 vaddps %xmm4, %xmm5, %xmm0
+# CHECK-NEXT: 2 8.3 0.3 5.0 <total>
# CHECK-NEXT: 3. 2 8.5 0.0 6.0 vaddps %xmm4, %xmm18, %xmm6
# CHECK-NEXT: 4. 2 11.0 0.0 3.5 vmulps %xmm6, %xmm19, %xmm4
# CHECK-NEXT: 5. 2 16.0 0.0 2.0 vaddps %xmm4, %xmm20, %xmm0
+# CHECK-NEXT: 2 8.3 0.3 5.0 <total>
# CHECK-NEXT: 3. 2 8.5 0.0 6.0 vaddps %ymm4, %ymm5, %ymm6
# CHECK-NEXT: 4. 2 11.0 0.0 3.5 vmulps %ymm6, %ymm3, %ymm4
# CHECK-NEXT: 5. 2 16.0 0.0 2.0 vaddps %ymm4, %ymm5, %ymm0
+# CHECK-NEXT: 2 8.3 0.3 5.0 <total>
# CHECK-NEXT: 3. 2 6.5 0.0 6.0 vaddps %ymm4, %ymm5, %ymm6
# CHECK-NEXT: 4. 2 9.0 0.0 3.5 vmulps %ymm6, %ymm3, %ymm4
# CHECK-NEXT: 5. 2 14.0 0.0 2.0 vaddps %ymm4, %ymm5, %ymm0
+# CHECK-NEXT: 2 6.8 0.3 5.2 <total>
# CHECK-NEXT: 80. 1 0.0 0.0 3.0 vxorpd %ymm1, %ymm1, %ymm3
# CHECK-NEXT: 81. 1 0.0 0.0 3.0 vpxor %xmm3, %xmm3, %xmm5
# CHECK-NEXT: 82. 1 0.0 0.0 3.0 vpxor %ymm3, %ymm3, %ymm5
+# CHECK-NEXT: 1 0.8 0.2 1.6 <total>
# CHECK-NEXT: 60. 1 0.0 0.0 10.0 vxorps %ymm4, %ymm4, %ymm5
# CHECK-NEXT: 61. 1 0.0 0.0 10.0 vxorpd %ymm1, %ymm1, %ymm3
# CHECK-NEXT: 62. 1 0.0 0.0 10.0 vpxor %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 1 2.1 0.3 8.7 <total>
# CHECK-NEXT: 80. 1 0.0 0.0 2.0 vxorpd %ymm1, %ymm1, %ymm3
# CHECK-NEXT: 81. 1 0.0 0.0 2.0 vpxor %xmm3, %xmm3, %xmm5
# CHECK-NEXT: 82. 1 0.0 0.0 2.0 vpxor %ymm3, %ymm3, %ymm5
+# CHECK-NEXT: 1 0.6 0.2 1.5 <total>
# CHECK-NEXT: 136. 1 1.0 0.0 2.0 vpxorq %ymm19, %ymm19, %ymm21
# CHECK-NEXT: 137. 1 1.0 0.0 2.0 vpxord %zmm19, %zmm19, %zmm21
# CHECK-NEXT: 138. 1 0.0 0.0 2.0 vpxorq %zmm19, %zmm19, %zmm21
+# CHECK-NEXT: 1 1.1 0.2 1.8 <total>
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 imulq %rax, %rbx
# CHECK-NEXT: 1. 1 4.0 0.0 0.0 lzcntw %ax, %bx
# CHECK-NEXT: 2. 1 6.0 0.0 0.0 addl %ecx, %ebx
+# CHECK-NEXT: 1 3.7 0.3 0.0 <total>
# CHECK-NEXT: 0. 6 7.0 0.2 0.0 addw %cx, %dx
# CHECK-NEXT: 1. 6 7.7 0.0 0.0 movw %ax, %dx
# CHECK-NEXT: 2. 6 8.5 0.0 0.0 xorw %bx, %dx
+# CHECK-NEXT: 6 7.7 0.1 0.0 <total>
# CHECK-NEXT: 0. 7 14.1 0.1 0.0 imulw %ax, %bx
# CHECK-NEXT: 1. 7 15.9 0.0 0.0 lzcntw %ax, %bx
# CHECK-NEXT: 2. 7 17.6 0.0 0.0 addw %cx, %bx
+# CHECK-NEXT: 7 15.9 0.0 0.0 <total>
# CHECK-NEXT: 0. 4 9.5 0.3 0.0 imull %edx, %ecx
# CHECK-NEXT: 1. 4 9.0 0.0 0.0 lzcntw (%rsp), %cx
# CHECK-NEXT: 2. 4 9.5 0.0 0.0 lzcntw 2(%rsp), %cx
+# CHECK-NEXT: 4 9.3 0.1 0.0 <total>
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 imulq %rax, %rcx
# CHECK-NEXT: 1. 1 5.0 0.0 0.0 addl %edx, %ecx
# CHECK-NEXT: 2. 1 6.0 0.0 0.0 addq %rcx, %rdx
+# CHECK-NEXT: 1 4.0 0.3 0.0 <total>
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 imulw %ax, %cx
# CHECK-NEXT: 1. 1 4.0 0.0 0.0 addb %al, %cl
# CHECK-NEXT: 2. 1 5.0 0.0 0.0 addl %ecx, %ebx
+# CHECK-NEXT: 1 3.3 0.3 0.0 <total>
# ALL: [0] [1] [2] [3]
# ALL-NEXT: 0. 1 1.0 1.0 0.0 addl %edi, %esi
# ALL-NEXT: 1. 1 1.0 0.0 0.0 bextrl %esi, (%rdi), %eax
+# ALL-NEXT: 1 1.0 0.5 0.0 <total>
# ALL: [0] [1] [2] [3]
# ALL-NEXT: 0. 1 1.0 1.0 0.0 addl %edi, %esi
# ALL-NEXT: 1. 1 1.0 0.0 0.0 bzhil %esi, (%rdi), %eax
+# ALL-NEXT: 1 1.0 0.5 0.0 <total>
# ALL: [0] [1] [2] [3]
# ALL-NEXT: 0. 1 1.0 1.0 0.0 vaddps %xmm0, %xmm0, %xmm1
# ALL-NEXT: 1. 1 1.0 0.0 0.0 vfmadd213ps (%rdi), %xmm1, %xmm2
+# ALL-NEXT: 1 1.0 0.5 0.0 <total>
# ALL: [0] [1] [2] [3]
# ALL-NEXT: 0. 1 1.0 1.0 0.0 vaddps %xmm0, %xmm0, %xmm2
# ALL-NEXT: 1. 1 1.0 0.0 0.0 vfmadd213ps (%rdi), %xmm1, %xmm2
+# ALL-NEXT: 1 1.0 0.5 0.0 <total>
# ALL-NEXT: 0. 1 1.0 1.0 0.0 vdivps %xmm0, %xmm1, %xmm1
# BARCELONA-NEXT: 1. 1 9.0 0.0 0.0 vaddps (%rax), %xmm1, %xmm1
+# BARCELONA-NEXT: 1 5.0 0.5 0.0 <total>
+
# BDVER2-NEXT: 1. 1 5.0 0.0 0.0 vaddps (%rax), %xmm1, %xmm1
+# BDVER2-NEXT: 1 3.0 0.5 0.0 <total>
+
# BDWELL-NEXT: 1. 1 7.0 0.0 0.0 vaddps (%rax), %xmm1, %xmm1
+# BDWELL-NEXT: 1 4.0 0.5 0.0 <total>
+
# BTVER2-NEXT: 1. 1 15.0 0.0 0.0 vaddps (%rax), %xmm1, %xmm1
+# BTVER2-NEXT: 1 8.0 0.5 0.0 <total>
+
# HASWELL-NEXT: 1. 1 8.0 0.0 0.0 vaddps (%rax), %xmm1, %xmm1
+# HASWELL-NEXT: 1 4.5 0.5 0.0 <total>
+
# SANDY-NEXT: 1. 1 9.0 0.0 0.0 vaddps (%rax), %xmm1, %xmm1
+# SANDY-NEXT: 1 5.0 0.5 0.0 <total>
+
# SKYLAKE-NEXT: 1. 1 6.0 0.0 0.0 vaddps (%rax), %xmm1, %xmm1
+# SKYLAKE-NEXT: 1 3.5 0.5 0.0 <total>
+
# ZNVER1-NEXT: 1. 1 8.0 0.0 0.0 vaddps (%rax), %xmm1, %xmm1
+# ZNVER1-NEXT: 1 4.5 0.5 0.0 <total>
# BDWELL-NEXT: 2. 10 1.0 0.4 5.7 addq $32, %r8
# BDWELL-NEXT: 3. 10 1.0 0.0 5.3 cmpl %edi, %edx
+# BDWELL-NEXT: 10 1.0 0.2 3.9 <total>
# HASWELL-NEXT: 2. 10 1.0 0.4 6.7 addq $32, %r8
# HASWELL-NEXT: 3. 10 1.0 0.0 6.3 cmpl %edi, %edx
+# HASWELL-NEXT: 10 1.0 0.2 4.6 <total>
# SKYLAKE-NEXT: 2. 10 1.0 0.1 7.0 addq $32, %r8
# SKYLAKE-NEXT: 3. 10 2.0 0.0 6.0 cmpl %edi, %edx
+# SKYLAKE-NEXT: 10 1.5 0.1 4.6 <total>
# ZNVER1-NEXT: 2. 10 1.0 0.1 7.0 addq $32, %r8
# ZNVER1-NEXT: 3. 10 2.0 0.0 6.0 cmpl %edi, %edx
+# ZNVER1-NEXT: 10 1.3 0.1 4.6 <total>
# ALL: [0] [1] [2] [3]
# ALL-NEXT: 0. 1 1.0 1.0 0.0 addl %edi, %esi
# ALL-NEXT: 1. 1 1.0 0.0 0.0 addl (%rdi), %esi
+# ALL-NEXT: 1 1.0 0.5 0.0 <total>
# ALL-NEXT: 0. 1 1.0 1.0 0.0 leaq 8(%rsp,%rdi,2), %rax
# BARCELONA-NEXT: 1. 1 2.0 0.0 0.0 sqrtss (%rax), %xmm1
+# BARCELONA-NEXT: 1 1.5 0.5 0.0 <total>
+
# BDVER2-NEXT: 1. 1 2.0 0.0 0.0 sqrtss (%rax), %xmm1
+# BDVER2-NEXT: 1 1.5 0.5 0.0 <total>
+
# BROADWELL-NEXT: 1. 1 2.0 0.0 0.0 sqrtss (%rax), %xmm1
+# BROADWELL-NEXT: 1 1.5 0.5 0.0 <total>
+
# BTVER2-NEXT: 1. 1 3.0 0.0 0.0 sqrtss (%rax), %xmm1
+# BTVER2-NEXT: 1 2.0 0.5 0.0 <total>
+
# HASWELL-NEXT: 1. 1 2.0 0.0 0.0 sqrtss (%rax), %xmm1
+# HASWELL-NEXT: 1 1.5 0.5 0.0 <total>
+
# SKYLAKE-NEXT: 1. 1 2.0 0.0 0.0 sqrtss (%rax), %xmm1
+# SKYLAKE-NEXT: 1 1.5 0.5 0.0 <total>
+
# ZNVER1-NEXT: 1. 1 2.0 0.0 0.0 sqrtss (%rax), %xmm1
+# ZNVER1-NEXT: 1 1.5 0.5 0.0 <total>
# ALL: [1] Code Region - test_sqrtsd
# ALL-NEXT: 0. 1 1.0 1.0 0.0 leaq 8(%rsp,%rdi,2), %rax
# BARCELONA-NEXT: 1. 1 2.0 0.0 0.0 sqrtsd (%rax), %xmm1
+# BARCELONA-NEXT: 1 1.5 0.5 0.0 <total>
+
# BDVER2-NEXT: 1. 1 2.0 0.0 0.0 sqrtsd (%rax), %xmm1
+# BDVER2-NEXT: 1 1.5 0.5 0.0 <total>
+
# BROADWELL-NEXT: 1. 1 2.0 0.0 0.0 sqrtsd (%rax), %xmm1
+# BROADWELL-NEXT: 1 1.5 0.5 0.0 <total>
+
# BTVER2-NEXT: 1. 1 3.0 0.0 0.0 sqrtsd (%rax), %xmm1
+# BTVER2-NEXT: 1 2.0 0.5 0.0 <total>
+
# HASWELL-NEXT: 1. 1 2.0 0.0 0.0 sqrtsd (%rax), %xmm1
+# HASWELL-NEXT: 1 1.5 0.5 0.0 <total>
+
# SKYLAKE-NEXT: 1. 1 2.0 0.0 0.0 sqrtsd (%rax), %xmm1
+# SKYLAKE-NEXT: 1 1.5 0.5 0.0 <total>
+
# ZNVER1-NEXT: 1. 1 2.0 0.0 0.0 sqrtsd (%rax), %xmm1
+# ZNVER1-NEXT: 1 1.5 0.5 0.0 <total>
# ALL: [2] Code Region - test_rsqrtss
# ALL-NEXT: 0. 1 1.0 1.0 0.0 leaq 8(%rsp,%rdi,2), %rax
# BARCELONA-NEXT: 1. 1 2.0 0.0 0.0 rsqrtss (%rax), %xmm1
+# BARCELONA-NEXT: 1 1.5 0.5 0.0 <total>
+
# BDVER2-NEXT: 1. 1 2.0 0.0 0.0 rsqrtss (%rax), %xmm1
+# BDVER2-NEXT: 1 1.5 0.5 0.0 <total>
+
# BROADWELL-NEXT: 1. 1 2.0 0.0 0.0 rsqrtss (%rax), %xmm1
+# BROADWELL-NEXT: 1 1.5 0.5 0.0 <total>
+
# BTVER2-NEXT: 1. 1 3.0 0.0 0.0 rsqrtss (%rax), %xmm1
+# BTVER2-NEXT: 1 2.0 0.5 0.0 <total>
+
# HASWELL-NEXT: 1. 1 2.0 0.0 0.0 rsqrtss (%rax), %xmm1
+# HASWELL-NEXT: 1 1.5 0.5 0.0 <total>
+
# SKYLAKE-NEXT: 1. 1 2.0 0.0 0.0 rsqrtss (%rax), %xmm1
+# SKYLAKE-NEXT: 1 1.5 0.5 0.0 <total>
+
# ZNVER1-NEXT: 1. 1 2.0 0.0 0.0 rsqrtss (%rax), %xmm1
+# ZNVER1-NEXT: 1 1.5 0.5 0.0 <total>
# ALL: [3] Code Region - test_rcp
# ALL-NEXT: 0. 1 1.0 1.0 0.0 leaq 8(%rsp,%rdi,2), %rax
# BARCELONA-NEXT: 1. 1 2.0 0.0 0.0 rcpss (%rax), %xmm1
+# BARCELONA-NEXT: 1 1.5 0.5 0.0 <total>
+
# BDVER2-NEXT: 1. 1 2.0 0.0 0.0 rcpss (%rax), %xmm1
+# BDVER2-NEXT: 1 1.5 0.5 0.0 <total>
+
# BROADWELL-NEXT: 1. 1 2.0 0.0 0.0 rcpss (%rax), %xmm1
+# BROADWELL-NEXT: 1 1.5 0.5 0.0 <total>
+
# BTVER2-NEXT: 1. 1 3.0 0.0 0.0 rcpss (%rax), %xmm1
+# BTVER2-NEXT: 1 2.0 0.5 0.0 <total>
+
# HASWELL-NEXT: 1. 1 2.0 0.0 0.0 rcpss (%rax), %xmm1
+# HASWELL-NEXT: 1 1.5 0.5 0.0 <total>
+
# SKYLAKE-NEXT: 1. 1 2.0 0.0 0.0 rcpss (%rax), %xmm1
+# SKYLAKE-NEXT: 1 1.5 0.5 0.0 <total>
+
# ZNVER1-NEXT: 1. 1 2.0 0.0 0.0 rcpss (%rax), %xmm1
+# ZNVER1-NEXT: 1 1.5 0.5 0.0 <total>
# ALL-NEXT: 0. 1 1.0 1.0 0.0 vaddps %xmm0, %xmm0, %xmm1
# BDVER2-NEXT: 1. 1 1.0 0.0 0.0 vblendvps %xmm1, (%rdi), %xmm2, %xmm3
+# BDVER2-NEXT: 1 1.0 0.5 0.0 <total>
+
# BDWELL-NEXT: 1. 1 1.0 0.0 0.0 vblendvps %xmm1, (%rdi), %xmm2, %xmm3
+# BDWELL-NEXT: 1 1.0 0.5 0.0 <total>
+
# BTVER2-NEXT: 1. 1 1.0 1.0 0.0 vblendvps %xmm1, (%rdi), %xmm2, %xmm3
+# BTVER2-NEXT: 1 1.0 1.0 0.0 <total>
+
# HASWELL-NEXT: 1. 1 1.0 0.0 0.0 vblendvps %xmm1, (%rdi), %xmm2, %xmm3
+# HASWELL-NEXT: 1 1.0 0.5 0.0 <total>
+
# IVY-NEXT: 1. 1 1.0 0.0 0.0 vblendvps %xmm1, (%rdi), %xmm2, %xmm3
+# IVY-NEXT: 1 1.0 0.5 0.0 <total>
+
# SANDY-NEXT: 1. 1 1.0 0.0 0.0 vblendvps %xmm1, (%rdi), %xmm2, %xmm3
+# SANDY-NEXT: 1 1.0 0.5 0.0 <total>
+
# SKYLAKE-NEXT: 1. 1 1.0 0.0 0.0 vblendvps %xmm1, (%rdi), %xmm2, %xmm3
+# SKYLAKE-NEXT: 1 1.0 0.5 0.0 <total>
+
# ZNVER1-NEXT: 1. 1 1.0 0.0 0.0 vblendvps %xmm1, (%rdi), %xmm2, %xmm3
+# ZNVER1-NEXT: 1 1.0 0.5 0.0 <total>
# ALL-NEXT: 0. 1 1.0 1.0 0.0 vaddps %xmm0, %xmm0, %xmm2
# BDVER2-NEXT: 1. 1 1.0 0.0 0.0 vblendvps %xmm1, (%rdi), %xmm2, %xmm3
+# BDVER2-NEXT: 1 1.0 0.5 0.0 <total>
+
# BDWELL-NEXT: 1. 1 1.0 0.0 0.0 vblendvps %xmm1, (%rdi), %xmm2, %xmm3
+# BDWELL-NEXT: 1 1.0 0.5 0.0 <total>
+
# BTVER2-NEXT: 1. 1 1.0 1.0 0.0 vblendvps %xmm1, (%rdi), %xmm2, %xmm3
+# BTVER2-NEXT: 1 1.0 1.0 0.0 <total>
+
# HASWELL-NEXT: 1. 1 1.0 0.0 0.0 vblendvps %xmm1, (%rdi), %xmm2, %xmm3
+# HASWELL-NEXT: 1 1.0 0.5 0.0 <total>
+
# IVY-NEXT: 1. 1 1.0 0.0 0.0 vblendvps %xmm1, (%rdi), %xmm2, %xmm3
+# IVY-NEXT: 1 1.0 0.5 0.0 <total>
+
# SANDY-NEXT: 1. 1 1.0 0.0 0.0 vblendvps %xmm1, (%rdi), %xmm2, %xmm3
+# SANDY-NEXT: 1 1.0 0.5 0.0 <total>
+
# SKYLAKE-NEXT: 1. 1 1.0 0.0 0.0 vblendvps %xmm1, (%rdi), %xmm2, %xmm3
+# SKYLAKE-NEXT: 1 1.0 0.5 0.0 <total>
+
# ZNVER1-NEXT: 1. 1 1.0 0.0 0.0 vblendvps %xmm1, (%rdi), %xmm2, %xmm3
+# ZNVER1-NEXT: 1 1.0 0.5 0.0 <total>
//===----------------------------------------------------------------------===//
#include "Views/TimelineView.h"
+#include <numeric>
namespace llvm {
namespace mca {
const WaitTimeEntry &Entry,
unsigned SourceIndex,
unsigned Executions) const {
- OS << SourceIndex << '.';
+ bool PrintingTotals = SourceIndex == Source.size();
+ unsigned CumulativeExecutions = PrintingTotals ? Timeline.size() : Executions;
+
+ if (!PrintingTotals)
+ OS << SourceIndex << '.';
+
OS.PadToColumn(7);
double AverageTime1, AverageTime2, AverageTime3;
- AverageTime1 = (double)Entry.CyclesSpentInSchedulerQueue / Executions;
- AverageTime2 = (double)Entry.CyclesSpentInSQWhileReady / Executions;
- AverageTime3 = (double)Entry.CyclesSpentAfterWBAndBeforeRetire / Executions;
+ AverageTime1 =
+ (double)Entry.CyclesSpentInSchedulerQueue / CumulativeExecutions;
+ AverageTime2 = (double)Entry.CyclesSpentInSQWhileReady / CumulativeExecutions;
+ AverageTime3 =
+ (double)Entry.CyclesSpentAfterWBAndBeforeRetire / CumulativeExecutions;
OS << Executions;
OS.PadToColumn(13);
- int BufferSize = UsedBuffer[SourceIndex].second;
- tryChangeColor(OS, Entry.CyclesSpentInSchedulerQueue, Executions, BufferSize);
+
+ int BufferSize = PrintingTotals ? 0 : UsedBuffer[SourceIndex].second;
+ if (!PrintingTotals)
+ tryChangeColor(OS, Entry.CyclesSpentInSchedulerQueue, CumulativeExecutions,
+ BufferSize);
OS << format("%.1f", floor((AverageTime1 * 10) + 0.5) / 10);
OS.PadToColumn(20);
- tryChangeColor(OS, Entry.CyclesSpentInSQWhileReady, Executions, BufferSize);
+ if (!PrintingTotals)
+ tryChangeColor(OS, Entry.CyclesSpentInSQWhileReady, CumulativeExecutions,
+ BufferSize);
OS << format("%.1f", floor((AverageTime2 * 10) + 0.5) / 10);
OS.PadToColumn(27);
- tryChangeColor(OS, Entry.CyclesSpentAfterWBAndBeforeRetire, Executions,
- STI.getSchedModel().MicroOpBufferSize);
+ if (!PrintingTotals)
+ tryChangeColor(OS, Entry.CyclesSpentAfterWBAndBeforeRetire,
+ CumulativeExecutions, STI.getSchedModel().MicroOpBufferSize);
OS << format("%.1f", floor((AverageTime3 * 10) + 0.5) / 10);
if (OS.has_colors())
++IID;
}
+
+ // If the timeline contains more than one instruction,
+ // let's also print global averages.
+ if (Source.size() != 1) {
+ WaitTimeEntry TotalWaitTime = std::accumulate(
+ WaitTime.begin(), WaitTime.end(), WaitTimeEntry{0, 0, 0},
+ [](const WaitTimeEntry &A, const WaitTimeEntry &B) {
+ return WaitTimeEntry{
+ A.CyclesSpentInSchedulerQueue + B.CyclesSpentInSchedulerQueue,
+ A.CyclesSpentInSQWhileReady + B.CyclesSpentInSQWhileReady,
+ A.CyclesSpentAfterWBAndBeforeRetire +
+ B.CyclesSpentAfterWBAndBeforeRetire};
+ });
+ printWaitTimeEntry(FOS, TotalWaitTime, IID, Executions);
+ FOS << " "
+ << "<total>" << '\n';
+ InstrStream.flush();
+ }
}
void TimelineView::printTimelineViewEntry(formatted_raw_ostream &OS,
/// 3. 2 1.5 0.5 1.0 vaddss %xmm1, %xmm0, %xmm3
/// 4. 2 3.5 0.0 0.0 vaddss %xmm3, %xmm2, %xmm4
/// 5. 2 6.5 0.0 0.0 vaddss %xmm4, %xmm5, %xmm6
+/// 2 2.4 0.6 1.6 <total>
///
/// By comparing column [2] with column [1], we get an idea about how many
/// cycles were spent in the scheduler's queue due to data dependencies.