]> granicus.if.org Git - llvm/commitdiff
[X86][Btver2] Fix latency and throughput of CMPXCHG instructions.
authorAndrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Tue, 20 Aug 2019 10:23:55 +0000 (10:23 +0000)
committerAndrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Tue, 20 Aug 2019 10:23:55 +0000 (10:23 +0000)
On Jaguar, CMPXCHG has a latency of 11cy, and a maximum throughput of 0.33 IPC.
Throughput is capped at 0.33 IPC because of the implicit in/out
dependency on register EAX. In the case of repeated non-atomic CMPXCHG with the
same memory location, store-to-load forwarding occurs and values for subsequent
loads are quickly forwarded from the store buffer.

Interestingly, the functionality in LLVM that computes the reciprocal throughput
doesn't seem to know about RMW instructions. That functionality only looks at
the "consumed resource cycles" for the throughput computation. It should be
fixed/improved by a future patch. In particular, for RMW instructions, that
logic should also take into account the write latency of in/out register
operands.

An atomic CMPXCHG has a latency of ~17cy. Throughput is also limited to
~17cy/inst due to cache locking, which prevents other memory uOPs from starting
execution before the "lock releasing" store uOP.

CMPXCHG8rr and CMPXCHG8rm are treated specially because they decode to one less
macro opcode. Their latency tends to be the same as the other RR/RM variants. RR
variants are relatively fast at 3cy (but still microcoded - 5 macro opcodes).

CMPXCHG8B is 11cy and unfortunately doesn't seem to benefit from store-to-load
forwarding. That means, throughput is clearly limited by the in/out dependency
on GPR registers. The uOP composition is sadly unknown (due to the lack of PMCs
for the Integer pipes). I have reused the same mix of consumed resource from the
other CMPXCHG instructions for CMPXCHG8B too.
LOCK CMPXCHG8B is instead 18cycles.

CMPXCHG16B is 32cycles. Up to 38cycles when the LOCK prefix is specified. Due to
the in/out dependencies, throughput is limited to 1 instruction every 32 (or 38)
cycles depending on whether the LOCK prefix is specified or not.
I wouldn't be surprised if the microcode for CMPXCHG16B is similar to 2x
microcode from CMPXCHG8B. So, I have speculatively set the JALU01 consumption to
2x the resource cycles used for CMPXCHG8B.

The two new hasLockPrefix() functions are used by the btver2 scheduling model
to check if a MCInst/MachineInstr has a LOCK prefix. Calls to hasLockPrefix()
have been encoded in predicates of variant scheduling classes that describe
lat/thr of CMPXCHG.

Differential Revision: https://reviews.llvm.org/D66424

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@369365 91177308-0d34-0410-b5e6-96231b3b80d8

lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
lib/Target/X86/X86InstrInfo.h
lib/Target/X86/X86SchedPredicates.td
lib/Target/X86/X86ScheduleBtVer2.td
test/tools/llvm-mca/X86/BtVer2/resources-cmpxchg.s
test/tools/llvm-mca/X86/BtVer2/resources-x86_64.s

index 2ccbfd79d314510c274be25f122aa823f9da83e1..ced9eacc8b9769a375ee92f9c464c3b3ea0d424f 100644 (file)
@@ -70,6 +70,10 @@ unsigned X86_MC::getDwarfRegFlavour(const Triple &TT, bool isEH) {
   return DWARFFlavour::X86_32_Generic;
 }
 
+bool X86_MC::hasLockPrefix(const MCInst &MI) {
+  return MI.getFlags() & X86::IP_HAS_LOCK;
+}
+
 void X86_MC::initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI) {
   // FIXME: TableGen these.
   for (unsigned Reg = X86::NoRegister + 1; Reg < X86::NUM_TARGET_REGS; ++Reg) {
index f3df47a181800a0debd7df6297ffd4f2eda43887..0c789061f0e137f22f1219367d4ebc38155694d1 100644 (file)
@@ -58,6 +58,10 @@ unsigned getDwarfRegFlavour(const Triple &TT, bool isEH);
 
 void initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI);
 
+
+/// Returns true if this instruction has a LOCK prefix.
+bool hasLockPrefix(const MCInst &MI);
+
 /// Create a X86 MCSubtargetInfo instance. This is exposed so Asm parser, etc.
 /// do not need to go through TargetRegistry.
 MCSubtargetInfo *createX86MCSubtargetInfo(const Triple &TT, StringRef CPU,
index 1b42f86f90f21245386b9db51467a592098f03c4..cfcaafbc2f41f8926a8d5f26d66d30252e4cb114 100644 (file)
@@ -527,6 +527,10 @@ public:
 #define GET_INSTRINFO_HELPER_DECLS
 #include "X86GenInstrInfo.inc"
 
+  static bool hasLockPrefix(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & X86II::LOCK;
+  }
+
   Optional<ParamLoadedValue>
   describeLoadedValue(const MachineInstr &MI) const override;
 
index 41bd776648f787261bca73f03f5dda0c52d7ea74..76001d382a278f5a52aa1c135c9463fb3138fb35 100644 (file)
@@ -84,3 +84,60 @@ def IsSETAm_Or_SETBEm : CheckAny<[
   CheckImmOperand_s<5, "X86::COND_A">,
   CheckImmOperand_s<5, "X86::COND_BE">
 ]>;
+
+// A predicate used to check if an instruction has a LOCK prefix.
+def CheckLockPrefix : CheckFunctionPredicate<
+  "X86_MC::hasLockPrefix",
+  "X86InstrInfo::hasLockPrefix"
+>;
+
+def IsRegRegCompareAndSwap_8 : CheckOpcode<[ CMPXCHG8rr ]>;
+
+def IsRegMemCompareAndSwap_8 : CheckOpcode<[
+  LCMPXCHG8, CMPXCHG8rm
+]>;
+
+def IsRegRegCompareAndSwap_16_32_64  : CheckOpcode<[
+  CMPXCHG16rr, CMPXCHG32rr, CMPXCHG64rr
+]>;
+
+def IsRegMemCompareAndSwap_16_32_64  : CheckOpcode<[
+  CMPXCHG16rm, CMPXCHG32rm, CMPXCHG64rm,
+  LCMPXCHG16, LCMPXCHG32, LCMPXCHG64,
+  LCMPXCHG8B, LCMPXCHG16B
+]>;
+
+def IsCompareAndSwap8B  : CheckOpcode<[ CMPXCHG8B, LCMPXCHG8B ]>;
+def IsCompareAndSwap16B : CheckOpcode<[ CMPXCHG16B, LCMPXCHG16B ]>;
+
+def IsRegMemCompareAndSwap  : CheckOpcode<
+  !listconcat(
+    IsRegMemCompareAndSwap_8.ValidOpcodes,
+    IsRegMemCompareAndSwap_16_32_64.ValidOpcodes
+  )>;
+
+def IsRegRegCompareAndSwap  : CheckOpcode<
+  !listconcat(
+    IsRegRegCompareAndSwap_8.ValidOpcodes,
+    IsRegRegCompareAndSwap_16_32_64.ValidOpcodes
+  )>;
+
+def IsAtomicCompareAndSwap_8 : CheckAll<[
+  CheckLockPrefix,
+  IsRegMemCompareAndSwap_8
+]>;
+
+def IsAtomicCompareAndSwap : CheckAll<[
+  CheckLockPrefix,
+  IsRegMemCompareAndSwap
+]>;
+
+def IsAtomicCompareAndSwap8B : CheckAll<[
+  CheckLockPrefix,
+  IsCompareAndSwap8B
+]>;
+
+def IsAtomicCompareAndSwap16B : CheckAll<[
+  CheckLockPrefix,
+  IsCompareAndSwap16B
+]>;
index 2d26232b41326ccf045a1d77ff91038bd427c34c..aaa8121f5a3ac19a43dbe0cdeeac163fa1099459 100644 (file)
@@ -191,10 +191,10 @@ defm : X86WriteRes<WriteRMW, [JSAGU], 1, [1], 0>;
 defm : JWriteResIntPair<WriteALU,    [JALU01], 1>;
 defm : JWriteResIntPair<WriteADC,    [JALU01], 1, [2]>;
 
-defm : X86WriteRes<WriteBSWAP32, [JALU01], 1, [1], 1>;
-defm : X86WriteRes<WriteBSWAP64, [JALU01], 1, [1], 1>;
-defm : X86WriteRes<WriteCMPXCHG,[JALU01], 1, [1], 1>;
-defm : X86WriteRes<WriteCMPXCHGRMW,[JALU01, JSAGU, JLAGU], 4, [1, 1, 1], 2>;
+defm : X86WriteRes<WriteBSWAP32,     [JALU01], 1, [1], 1>;
+defm : X86WriteRes<WriteBSWAP64,     [JALU01], 1, [1], 1>;
+defm : X86WriteRes<WriteCMPXCHG,     [JALU01], 3, [3], 5>;
+defm : X86WriteRes<WriteCMPXCHGRMW,  [JALU01, JSAGU, JLAGU], 11, [3, 1, 1], 6>;
 defm : X86WriteRes<WriteXCHG,        [JALU01], 1, [1], 1>;
 
 defm : JWriteResIntPair<WriteIMul8,     [JALU1, JMul], 3, [1, 1], 2>;
@@ -305,6 +305,73 @@ def : WriteRes<WriteFence,  [JSAGU]>;
 // to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
 def : WriteRes<WriteNop, [JALU01]> { let Latency = 1; }
 
+def JWriteCMPXCHG8rr : SchedWriteRes<[JALU01]> {
+  let Latency = 3;
+  let ResourceCycles = [3];
+  let NumMicroOps = 3;
+}
+
+def JWriteLOCK_CMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+  let Latency = 16;
+  let ResourceCycles = [3,16,16];
+  let NumMicroOps = 5;
+}
+
+def JWriteLOCK_CMPXCHGrm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+  let Latency = 17;
+  let ResourceCycles = [3,17,17];
+  let NumMicroOps = 6;
+}
+
+def JWriteCMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+  let Latency = 11;
+  let ResourceCycles = [3,1,1];
+  let NumMicroOps = 5;
+}
+
+def JWriteCMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+  let Latency = 11;
+  let ResourceCycles = [3,1,1];
+  let NumMicroOps = 18;
+}
+
+def JWriteCMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+  let Latency = 32;
+  let ResourceCycles = [6,1,1];
+  let NumMicroOps = 28;
+}
+
+def JWriteLOCK_CMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+  let Latency = 19;
+  let ResourceCycles = [3,19,19];
+  let NumMicroOps = 18;
+}
+
+def JWriteLOCK_CMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+  let Latency = 38;
+  let ResourceCycles = [6,38,38];
+  let NumMicroOps = 28;
+}
+
+def JWriteCMPXCHGVariant :  SchedWriteVariant<[
+  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap8B>,  [JWriteLOCK_CMPXCHG8B]>,
+  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap16B>, [JWriteLOCK_CMPXCHG16B]>,
+  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap_8>,  [JWriteLOCK_CMPXCHG8rm]>,
+  SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap>,    [JWriteLOCK_CMPXCHGrm]>,
+  SchedVar<MCSchedPredicate<IsCompareAndSwap8B>,        [JWriteCMPXCHG8B]>,
+  SchedVar<MCSchedPredicate<IsCompareAndSwap16B>,       [JWriteCMPXCHG16B]>,
+  SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap_8>,  [JWriteCMPXCHG8rm]>,
+  SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap>,    [WriteCMPXCHGRMW]>,
+  SchedVar<MCSchedPredicate<IsRegRegCompareAndSwap_8>,  [JWriteCMPXCHG8rr]>,
+  SchedVar<NoSchedPred,                                 [WriteCMPXCHG]>
+]>;
+def : InstRW<[JWriteCMPXCHGVariant], (instrs CMPXCHG8rr, LCMPXCHG8, CMPXCHG8rm,
+                                             CMPXCHG16rm, CMPXCHG32rm, CMPXCHG64rm,
+                                             LCMPXCHG16, LCMPXCHG32, LCMPXCHG64,
+                                             CMPXCHG8B, CMPXCHG16B,
+                                             LCMPXCHG8B, LCMPXCHG16B)>;
+
+
 ////////////////////////////////////////////////////////////////////////////////
 // Floating point. This covers both scalar and vector operations.
 ////////////////////////////////////////////////////////////////////////////////
index 9882a53607d86d77a315f17ca303771d3b7f95aa..4eb95fd923f5aa37694b89054d6f9047f1fe981b 100644 (file)
@@ -15,10 +15,10 @@ lock cmpxchg16b (%rax)
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  2      4     1.00    *      *            cmpxchg8b      (%rax)
-# CHECK-NEXT:  2      4     1.00    *      *            cmpxchg16b     (%rax)
-# CHECK-NEXT:  2      4     1.00    *      *            lock           cmpxchg8b       (%rax)
-# CHECK-NEXT:  2      4     1.00    *      *            lock           cmpxchg16b      (%rax)
+# CHECK-NEXT:  18     11    1.50    *      *            cmpxchg8b      (%rax)
+# CHECK-NEXT:  28     32    3.00    *      *            cmpxchg16b     (%rax)
+# CHECK-NEXT:  18     19    19.00   *      *            lock           cmpxchg8b       (%rax)
+# CHECK-NEXT:  28     38    38.00   *      *            lock           cmpxchg16b      (%rax)
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0]   - JALU0
@@ -38,11 +38,11 @@ lock cmpxchg16b (%rax)
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
-# CHECK-NEXT: 2.00   2.00    -      -      -      -      -     4.00    -     4.00    -      -      -      -
+# CHECK-NEXT: 9.00   9.00    -      -      -      -      -     59.00   -     59.00   -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -     1.00    -     1.00    -      -      -      -     cmpxchg8b      (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -     1.00    -     1.00    -      -      -      -     cmpxchg16b     (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -     1.00    -     1.00    -      -      -      -     lock           cmpxchg8b       (%rax)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -     1.00    -     1.00    -      -      -      -     lock           cmpxchg16b      (%rax)
+# CHECK-NEXT: 1.50   1.50    -      -      -      -      -     1.00    -     1.00    -      -      -      -     cmpxchg8b      (%rax)
+# CHECK-NEXT: 3.00   3.00    -      -      -      -      -     1.00    -     1.00    -      -      -      -     cmpxchg16b     (%rax)
+# CHECK-NEXT: 1.50   1.50    -      -      -      -      -     19.00   -     19.00   -      -      -      -     lock           cmpxchg8b       (%rax)
+# CHECK-NEXT: 3.00   3.00    -      -      -      -      -     38.00   -     38.00   -      -      -      -     lock           cmpxchg16b      (%rax)
index 5fae0f6bc3ef1bb4c31d370a7400301359ace638..c4b136a96256ab39b9778d090142eca9854c0afb 100644 (file)
@@ -1110,18 +1110,18 @@ xorq (%rax), %rdi
 # CHECK-NEXT:  1      100   0.50                  U     cmpsw  %es:(%rdi), (%rsi)
 # CHECK-NEXT:  1      100   0.50                  U     cmpsl  %es:(%rdi), (%rsi)
 # CHECK-NEXT:  1      100   0.50                  U     cmpsq  %es:(%rdi), (%rsi)
-# CHECK-NEXT:  1      1     0.50                        cmpxchgb       %cl, %bl
-# CHECK-NEXT:  2      4     1.00    *      *            cmpxchgb       %cl, (%rbx)
-# CHECK-NEXT:  2      4     1.00    *      *            lock           cmpxchgb        %cl, (%rbx)
-# CHECK-NEXT:  1      1     0.50                        cmpxchgw       %cx, %bx
-# CHECK-NEXT:  2      4     1.00    *      *            cmpxchgw       %cx, (%rbx)
-# CHECK-NEXT:  2      4     1.00    *      *            lock           cmpxchgw        %cx, (%rbx)
-# CHECK-NEXT:  1      1     0.50                        cmpxchgl       %ecx, %ebx
-# CHECK-NEXT:  2      4     1.00    *      *            cmpxchgl       %ecx, (%rbx)
-# CHECK-NEXT:  2      4     1.00    *      *            lock           cmpxchgl        %ecx, (%rbx)
-# CHECK-NEXT:  1      1     0.50                        cmpxchgq       %rcx, %rbx
-# CHECK-NEXT:  2      4     1.00    *      *            cmpxchgq       %rcx, (%rbx)
-# CHECK-NEXT:  2      4     1.00    *      *            lock           cmpxchgq        %rcx, (%rbx)
+# CHECK-NEXT:  3      3     1.50                        cmpxchgb       %cl, %bl
+# CHECK-NEXT:  5      11    1.50    *      *            cmpxchgb       %cl, (%rbx)
+# CHECK-NEXT:  5      16    16.00   *      *            lock           cmpxchgb        %cl, (%rbx)
+# CHECK-NEXT:  5      3     1.50                        cmpxchgw       %cx, %bx
+# CHECK-NEXT:  6      11    1.50    *      *            cmpxchgw       %cx, (%rbx)
+# CHECK-NEXT:  6      17    17.00   *      *            lock           cmpxchgw        %cx, (%rbx)
+# CHECK-NEXT:  5      3     1.50                        cmpxchgl       %ecx, %ebx
+# CHECK-NEXT:  6      11    1.50    *      *            cmpxchgl       %ecx, (%rbx)
+# CHECK-NEXT:  6      17    17.00   *      *            lock           cmpxchgl        %ecx, (%rbx)
+# CHECK-NEXT:  5      3     1.50                        cmpxchgq       %rcx, %rbx
+# CHECK-NEXT:  6      11    1.50    *      *            cmpxchgq       %rcx, (%rbx)
+# CHECK-NEXT:  6      17    17.00   *      *            lock           cmpxchgq        %rcx, (%rbx)
 # CHECK-NEXT:  1      100   0.50                  U     cpuid
 # CHECK-NEXT:  1      1     0.50                        decb   %dil
 # CHECK-NEXT:  1      5     1.00    *      *            decb   (%rax)
@@ -1705,7 +1705,7 @@ xorq (%rax), %rdi
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
-# CHECK-NEXT: 612.00 662.00 380.00  -      -      -      -     334.00 64.00  235.00  -      -      -      -
+# CHECK-NEXT: 624.00 674.00 380.00  -      -      -      -     397.00 64.00  298.00  -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
@@ -1916,18 +1916,18 @@ xorq (%rax), %rdi
 # CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     cmpsw  %es:(%rdi), (%rsi)
 # CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     cmpsl  %es:(%rdi), (%rsi)
 # CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     cmpsq  %es:(%rdi), (%rsi)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgb       %cl, %bl
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -     1.00    -     1.00    -      -      -      -     cmpxchgb       %cl, (%rbx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -     1.00    -     1.00    -      -      -      -     lock           cmpxchgb        %cl, (%rbx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgw       %cx, %bx
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -     1.00    -     1.00    -      -      -      -     cmpxchgw       %cx, (%rbx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -     1.00    -     1.00    -      -      -      -     lock           cmpxchgw        %cx, (%rbx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgl       %ecx, %ebx
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -     1.00    -     1.00    -      -      -      -     cmpxchgl       %ecx, (%rbx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -     1.00    -     1.00    -      -      -      -     lock           cmpxchgl        %ecx, (%rbx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgq       %rcx, %rbx
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -     1.00    -     1.00    -      -      -      -     cmpxchgq       %rcx, (%rbx)
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -     1.00    -     1.00    -      -      -      -     lock           cmpxchgq        %rcx, (%rbx)
+# CHECK-NEXT: 1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgb       %cl, %bl
+# CHECK-NEXT: 1.50   1.50    -      -      -      -      -     1.00    -     1.00    -      -      -      -     cmpxchgb       %cl, (%rbx)
+# CHECK-NEXT: 1.50   1.50    -      -      -      -      -     16.00   -     16.00   -      -      -      -     lock           cmpxchgb        %cl, (%rbx)
+# CHECK-NEXT: 1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgw       %cx, %bx
+# CHECK-NEXT: 1.50   1.50    -      -      -      -      -     1.00    -     1.00    -      -      -      -     cmpxchgw       %cx, (%rbx)
+# CHECK-NEXT: 1.50   1.50    -      -      -      -      -     17.00   -     17.00   -      -      -      -     lock           cmpxchgw        %cx, (%rbx)
+# CHECK-NEXT: 1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgl       %ecx, %ebx
+# CHECK-NEXT: 1.50   1.50    -      -      -      -      -     1.00    -     1.00    -      -      -      -     cmpxchgl       %ecx, (%rbx)
+# CHECK-NEXT: 1.50   1.50    -      -      -      -      -     17.00   -     17.00   -      -      -      -     lock           cmpxchgl        %ecx, (%rbx)
+# CHECK-NEXT: 1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgq       %rcx, %rbx
+# CHECK-NEXT: 1.50   1.50    -      -      -      -      -     1.00    -     1.00    -      -      -      -     cmpxchgq       %rcx, (%rbx)
+# CHECK-NEXT: 1.50   1.50    -      -      -      -      -     17.00   -     17.00   -      -      -      -     lock           cmpxchgq        %rcx, (%rbx)
 # CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     cpuid
 # CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -     decb   %dil
 # CHECK-NEXT: 0.50   0.50    -      -      -      -      -     1.00    -     1.00    -      -      -      -     decb   (%rax)