From b3da570deaab8e1e94f328962ba4fe32715bdaca Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 5 Mar 2019 18:37:33 +0000 Subject: [PATCH] [X86] Enable 8-bit OR with disjoint bits to convert to LEA We already support 8-bit adds in convertToThreeAddress. But we can also support 8-bit OR if the bits are disjoint. We already do this for 16/32/64. Differential Revision: https://reviews.llvm.org/D58863 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@355423 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86InstrCompiler.td | 7 +++ lib/Target/X86/X86InstrFoldTables.cpp | 3 ++ lib/Target/X86/X86InstrInfo.cpp | 26 ++++++---- lib/Target/X86/X86InstrInfo.h | 3 +- lib/Target/X86/X86MCInstLower.cpp | 2 + lib/Target/X86/X86MacroFusion.cpp | 2 + test/CodeGen/X86/bitreverse.ll | 52 +++++++++---------- test/CodeGen/X86/bool-math.ll | 6 +-- test/CodeGen/X86/fshl.ll | 7 +-- test/CodeGen/X86/select.ll | 27 +++++++--- test/CodeGen/X86/select_const.ll | 6 +-- ...asked-merge-scalar-constmask-innerouter.ll | 14 ++--- ...-merge-scalar-constmask-interleavedbits.ll | 14 ++--- ...-scalar-constmask-interleavedbytehalves.ll | 14 ++--- ...d-masked-merge-scalar-constmask-lowhigh.ll | 14 ++--- test/CodeGen/X86/vector-bitreverse.ll | 52 +++++++++---------- 16 files changed, 146 insertions(+), 103 deletions(-) diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index fe826c4fa13..ea7453d5b4b 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -1381,6 +1381,9 @@ let SchedRW = [WriteALU] in { let isConvertibleToThreeAddress = 1, Constraints = "$src1 = $dst", Defs = [EFLAGS] in { let isCommutable = 1 in { +def ADD8rr_DB : I<0, Pseudo, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2), "", // orb/addb REG, REG [(set GR8:$dst, (or_is_add GR8:$src1, GR8:$src2))]>; def ADD16rr_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "", // orw/addw REG, REG [(set GR16:$dst, (or_is_add GR16:$src1, GR16:$src2))]>; @@ -1395,6 +1398,10 @@ def ADD64rr_DB : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), // NOTE: These are order specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. 
+def ADD8ri_DB : I<0, Pseudo, + (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2), + "", // orb/addb REG, imm8 + [(set GR8:$dst, (or_is_add GR8:$src1, imm:$src2))]>; def ADD16ri8_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "", // orw/addw REG, imm8 diff --git a/lib/Target/X86/X86InstrFoldTables.cpp b/lib/Target/X86/X86InstrFoldTables.cpp index 17ef1d67af9..5cd327ff23b 100644 --- a/lib/Target/X86/X86InstrFoldTables.cpp +++ b/lib/Target/X86/X86InstrFoldTables.cpp @@ -65,7 +65,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = { { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE }, { X86::ADD8ri, X86::ADD8mi, 0 }, { X86::ADD8ri8, X86::ADD8mi8, 0 }, + { X86::ADD8ri_DB, X86::ADD8mi, TB_NO_REVERSE }, { X86::ADD8rr, X86::ADD8mr, 0 }, + { X86::ADD8rr_DB, X86::ADD8mr, TB_NO_REVERSE }, { X86::AND16ri, X86::AND16mi, 0 }, { X86::AND16ri8, X86::AND16mi8, 0 }, { X86::AND16rr, X86::AND16mr, 0 }, @@ -1218,6 +1220,7 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::ADD64rr, X86::ADD64rm, 0 }, { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE }, { X86::ADD8rr, X86::ADD8rm, 0 }, + { X86::ADD8rr_DB, X86::ADD8rm, TB_NO_REVERSE }, { X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 }, { X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 }, { X86::ADDSDrr, X86::ADDSDrm, 0 }, diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index acc790c68c8..f17d8812a8a 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -710,11 +710,10 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA( unsigned MIOpc, MachineFunction::iterator &MFI, MachineInstr &MI, - LiveVariables *LV) const { + LiveVariables *LV, bool Is8BitOp) const { // We handle 8-bit adds and various 16-bit opcodes in the switch below. - bool Is16BitOp = !(MIOpc == X86::ADD8rr || MIOpc == X86::ADD8ri); MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo(); - assert((!Is16BitOp || RegInfo.getTargetRegisterInfo()->getRegSizeInBits( + assert((Is8BitOp || RegInfo.getTargetRegisterInfo()->getRegSizeInBits( *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) && "Unexpected type for LEA transform"); @@ -744,7 +743,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA( unsigned Src = MI.getOperand(1).getReg(); bool IsDead = MI.getOperand(0).isDead(); bool IsKill = MI.getOperand(1).isKill(); - unsigned SubReg = Is16BitOp ? X86::sub_16bit : X86::sub_8bit; + unsigned SubReg = Is8BitOp ? 
X86::sub_8bit : X86::sub_16bit; assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization"); BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA); MachineInstr *InsMI = @@ -769,6 +768,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA( addRegOffset(MIB, InRegLEA, true, -1); break; case X86::ADD8ri: + case X86::ADD8ri_DB: case X86::ADD16ri: case X86::ADD16ri8: case X86::ADD16ri_DB: @@ -776,6 +776,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA( addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm()); break; case X86::ADD8rr: + case X86::ADD8rr_DB: case X86::ADD16rr: case X86::ADD16rr_DB: { unsigned Src2 = MI.getOperand(2).getReg(); @@ -862,6 +863,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineInstr *NewMI = nullptr; bool Is64Bit = Subtarget.is64Bit(); + bool Is8BitOp = false; unsigned MIOpc = MI.getOpcode(); switch (MIOpc) { default: return nullptr; @@ -919,7 +921,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned ShAmt = getTruncatedShiftCount(MI, 2); if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; - return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV); + return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp); } case X86::INC64r: case X86::INC32r: { @@ -944,7 +946,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, break; } case X86::INC16r: - return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV); + return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp); case X86::DEC64r: case X86::DEC32r: { assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!"); @@ -969,7 +971,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, break; } case X86::DEC16r: - return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV); + return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp); case X86::ADD64rr: case X86::ADD64rr_DB: case X86::ADD32rr: @@ -1008,9 +1010,12 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, break; } case X86::ADD8rr: + case X86::ADD8rr_DB: + Is8BitOp = true; + LLVM_FALLTHROUGH; case X86::ADD16rr: case X86::ADD16rr_DB: - return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV); + return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp); case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD64ri32_DB: @@ -1044,11 +1049,14 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, break; } case X86::ADD8ri: + case X86::ADD8ri_DB: + Is8BitOp = true; + LLVM_FALLTHROUGH; case X86::ADD16ri: case X86::ADD16ri8: case X86::ADD16ri_DB: case X86::ADD16ri8_DB: - return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV); + return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp); case X86::VMOVDQU8Z128rmk: case X86::VMOVDQU8Z256rmk: case X86::VMOVDQU8Zrmk: diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index d18e93ad39a..f95681b14e2 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -592,7 +592,8 @@ private: MachineInstr *convertToThreeAddressWithLEA(unsigned MIOpc, MachineFunction::iterator &MFI, MachineInstr &MI, - LiveVariables *LV) const; + LiveVariables *LV, + bool Is8BitOp) const; /// Handles memory folding for special case instructions, for instance those /// requiring custom manipulation of the address. 
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index cc2a1a43b67..ce6bdafbc27 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -586,9 +586,11 @@ ReSimplify: // These are pseudo-ops for OR to help with the OR->ADD transformation. We do // this with an ugly goto in case the resultant OR uses EAX and needs the // short form. + case X86::ADD8rr_DB: OutMI.setOpcode(X86::OR8rr); goto ReSimplify; case X86::ADD16rr_DB: OutMI.setOpcode(X86::OR16rr); goto ReSimplify; case X86::ADD32rr_DB: OutMI.setOpcode(X86::OR32rr); goto ReSimplify; case X86::ADD64rr_DB: OutMI.setOpcode(X86::OR64rr); goto ReSimplify; + case X86::ADD8ri_DB: OutMI.setOpcode(X86::OR8ri); goto ReSimplify; case X86::ADD16ri_DB: OutMI.setOpcode(X86::OR16ri); goto ReSimplify; case X86::ADD32ri_DB: OutMI.setOpcode(X86::OR32ri); goto ReSimplify; case X86::ADD64ri32_DB: OutMI.setOpcode(X86::OR64ri32); goto ReSimplify; diff --git a/lib/Target/X86/X86MacroFusion.cpp b/lib/Target/X86/X86MacroFusion.cpp index 8c2d0fe6690..e5e80a2339a 100644 --- a/lib/Target/X86/X86MacroFusion.cpp +++ b/lib/Target/X86/X86MacroFusion.cpp @@ -140,8 +140,10 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, case X86::ADD64rr: case X86::ADD64rr_DB: case X86::ADD8ri: + case X86::ADD8ri_DB: case X86::ADD8rm: case X86::ADD8rr: + case X86::ADD8rr_DB: case X86::SUB16ri: case X86::SUB16ri8: case X86::SUB16rm: diff --git a/test/CodeGen/X86/bitreverse.ll b/test/CodeGen/X86/bitreverse.ll index ed3fdefce7a..5da95c574eb 100644 --- a/test/CodeGen/X86/bitreverse.ll +++ b/test/CodeGen/X86/bitreverse.ll @@ -340,20 +340,20 @@ define i8 @test_bitreverse_i8(i8 %a) { ; ; X64-LABEL: test_bitreverse_i8: ; X64: # %bb.0: +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: rolb $4, %dil ; X64-NEXT: movl %edi, %eax -; X64-NEXT: rolb $4, %al -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andb $51, %cl -; X64-NEXT: shlb $2, %cl -; X64-NEXT: andb $-52, %al -; X64-NEXT: shrb $2, %al -; X64-NEXT: orb %cl, %al -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andb $85, %cl -; X64-NEXT: addb %cl, %cl -; X64-NEXT: andb $-86, %al -; X64-NEXT: shrb %al -; X64-NEXT: orb %cl, %al +; X64-NEXT: andb $51, %al +; X64-NEXT: shlb $2, %al +; X64-NEXT: andb $-52, %dil +; X64-NEXT: shrb $2, %dil +; X64-NEXT: orb %al, %dil +; X64-NEXT: movl %edi, %eax +; X64-NEXT: andb $85, %al +; X64-NEXT: addb %al, %al +; X64-NEXT: andb $-86, %dil +; X64-NEXT: shrb %dil +; X64-NEXT: leal (%rdi,%rax), %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %b = call i8 @llvm.bitreverse.i8(i8 %a) @@ -384,20 +384,20 @@ define i4 @test_bitreverse_i4(i4 %a) { ; ; X64-LABEL: test_bitreverse_i4: ; X64: # %bb.0: +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: rolb $4, %dil ; X64-NEXT: movl %edi, %eax -; X64-NEXT: rolb $4, %al -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andb $51, %cl -; X64-NEXT: shlb $2, %cl -; X64-NEXT: andb $-52, %al -; X64-NEXT: shrb $2, %al -; X64-NEXT: orb %cl, %al -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andb $80, %cl -; X64-NEXT: addb %cl, %cl -; X64-NEXT: andb $-96, %al -; X64-NEXT: shrb %al -; X64-NEXT: orb %cl, %al +; X64-NEXT: andb $51, %al +; X64-NEXT: shlb $2, %al +; X64-NEXT: andb $-52, %dil +; X64-NEXT: shrb $2, %dil +; X64-NEXT: orb %al, %dil +; X64-NEXT: movl %edi, %eax +; X64-NEXT: andb $80, %al +; X64-NEXT: addb %al, %al +; X64-NEXT: andb $-96, %dil +; X64-NEXT: shrb %dil +; X64-NEXT: leal (%rdi,%rax), %eax ; X64-NEXT: shrb $4, %al ; X64-NEXT: # kill: def $al killed $al killed 
$eax ; X64-NEXT: retq diff --git a/test/CodeGen/X86/bool-math.ll b/test/CodeGen/X86/bool-math.ll index 3a7193bd631..c0a7a5bd4fb 100644 --- a/test/CodeGen/X86/bool-math.ll +++ b/test/CodeGen/X86/bool-math.ll @@ -47,9 +47,9 @@ define i32 @sub_zext_cmp_mask_wider_result(i8 %x) { define i8 @sub_zext_cmp_mask_narrower_result(i32 %x) { ; X64-LABEL: sub_zext_cmp_mask_narrower_result: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: andb $1, %al -; X64-NEXT: orb $46, %al +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: andb $1, %dil +; X64-NEXT: leal 46(%rdi), %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; diff --git a/test/CodeGen/X86/fshl.ll b/test/CodeGen/X86/fshl.ll index ccf451e0451..0e1bcb2e26d 100644 --- a/test/CodeGen/X86/fshl.ll +++ b/test/CodeGen/X86/fshl.ll @@ -381,10 +381,11 @@ define i8 @const_shift_i8(i8 %x, i8 %y) nounwind { ; ; X64-LABEL: const_shift_i8: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: shrb %sil -; X64-NEXT: shlb $7, %al -; X64-NEXT: orb %sil, %al +; X64-NEXT: shlb $7, %dil +; X64-NEXT: leal (%rdi,%rsi), %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %tmp = tail call i8 @llvm.fshl.i8(i8 %x, i8 %y, i8 7) diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll index 99c4a99b6ed..558dc7db42d 100644 --- a/test/CodeGen/X86/select.ll +++ b/test/CodeGen/X86/select.ll @@ -1088,14 +1088,25 @@ define i8 @test18(i32 %x, i8 zeroext %a, i8 zeroext %b) nounwind { } define i32 @trunc_select_miscompile(i32 %a, i1 zeroext %cc) { -; CHECK-LABEL: trunc_select_miscompile: -; CHECK: ## %bb.0: -; CHECK-NEXT: movl %esi, %ecx -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: orb $2, %cl -; CHECK-NEXT: ## kill: def $cl killed $cl killed $ecx -; CHECK-NEXT: shll %cl, %eax -; CHECK-NEXT: retq +; GENERIC-LABEL: trunc_select_miscompile: +; GENERIC: ## %bb.0: +; GENERIC-NEXT: ## kill: def $esi killed $esi def $rsi +; GENERIC-NEXT: movl %edi, %eax +; GENERIC-NEXT: leal 2(%rsi), %ecx +; GENERIC-NEXT: ## kill: def $cl killed $cl killed $ecx +; GENERIC-NEXT: shll %cl, %eax +; GENERIC-NEXT: retq +; +; ATOM-LABEL: trunc_select_miscompile: +; ATOM: ## %bb.0: +; ATOM-NEXT: ## kill: def $esi killed $esi def $rsi +; ATOM-NEXT: leal 2(%rsi), %ecx +; ATOM-NEXT: movl %edi, %eax +; ATOM-NEXT: ## kill: def $cl killed $cl killed $ecx +; ATOM-NEXT: shll %cl, %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq ; ; ATHLON-LABEL: trunc_select_miscompile: ; ATHLON: ## %bb.0: diff --git a/test/CodeGen/X86/select_const.ll b/test/CodeGen/X86/select_const.ll index 8ab603d396f..8f7989e2bed 100644 --- a/test/CodeGen/X86/select_const.ll +++ b/test/CodeGen/X86/select_const.ll @@ -328,9 +328,9 @@ define i32 @sel_neg1_1_32(i32 %x) { define i8 @select_pow2_diff(i1 zeroext %cond) { ; CHECK-LABEL: select_pow2_diff: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: shlb $4, %al -; CHECK-NEXT: orb $3, %al +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NEXT: shlb $4, %dil +; CHECK-NEXT: leal 3(%rdi), %eax ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq %sel = select i1 %cond, i8 19, i8 3 diff --git a/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll b/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll index ac554781653..8daa0a6e969 100644 --- a/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll +++ 
b/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll @@ -10,19 +10,21 @@ define i8 @out8_constmask(i8 %x, i8 %y) { ; CHECK-NOBMI-LABEL: out8_constmask: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movl %esi, %eax +; CHECK-NOBMI-NEXT: # kill: def $esi killed $esi def $rsi +; CHECK-NOBMI-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NOBMI-NEXT: andb $60, %dil -; CHECK-NOBMI-NEXT: andb $-61, %al -; CHECK-NOBMI-NEXT: orb %dil, %al +; CHECK-NOBMI-NEXT: andb $-61, %sil +; CHECK-NOBMI-NEXT: leal (%rsi,%rdi), %eax ; CHECK-NOBMI-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out8_constmask: ; CHECK-BMI: # %bb.0: -; CHECK-BMI-NEXT: movl %esi, %eax +; CHECK-BMI-NEXT: # kill: def $esi killed $esi def $rsi +; CHECK-BMI-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-BMI-NEXT: andb $60, %dil -; CHECK-BMI-NEXT: andb $-61, %al -; CHECK-BMI-NEXT: orb %dil, %al +; CHECK-BMI-NEXT: andb $-61, %sil +; CHECK-BMI-NEXT: leal (%rsi,%rdi), %eax ; CHECK-BMI-NEXT: # kill: def $al killed $al killed $eax ; CHECK-BMI-NEXT: retq %mx = and i8 %x, 60 diff --git a/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll b/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll index 95de3bb2e03..33b6b66b66d 100644 --- a/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll +++ b/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll @@ -10,19 +10,21 @@ define i8 @out8_constmask(i8 %x, i8 %y) { ; CHECK-NOBMI-LABEL: out8_constmask: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movl %esi, %eax +; CHECK-NOBMI-NEXT: # kill: def $esi killed $esi def $rsi +; CHECK-NOBMI-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NOBMI-NEXT: andb $85, %dil -; CHECK-NOBMI-NEXT: andb $-86, %al -; CHECK-NOBMI-NEXT: orb %dil, %al +; CHECK-NOBMI-NEXT: andb $-86, %sil +; CHECK-NOBMI-NEXT: leal (%rsi,%rdi), %eax ; CHECK-NOBMI-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out8_constmask: ; CHECK-BMI: # %bb.0: -; CHECK-BMI-NEXT: movl %esi, %eax +; CHECK-BMI-NEXT: # kill: def $esi killed $esi def $rsi +; CHECK-BMI-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-BMI-NEXT: andb $85, %dil -; CHECK-BMI-NEXT: andb $-86, %al -; CHECK-BMI-NEXT: orb %dil, %al +; CHECK-BMI-NEXT: andb $-86, %sil +; CHECK-BMI-NEXT: leal (%rsi,%rdi), %eax ; CHECK-BMI-NEXT: # kill: def $al killed $al killed $eax ; CHECK-BMI-NEXT: retq %mx = and i8 %x, 85 diff --git a/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll b/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll index c7579e2aa9d..bbc987d4474 100644 --- a/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll +++ b/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll @@ -10,19 +10,21 @@ define i8 @out8_constmask(i8 %x, i8 %y) { ; CHECK-NOBMI-LABEL: out8_constmask: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movl %esi, %eax +; CHECK-NOBMI-NEXT: # kill: def $esi killed $esi def $rsi +; CHECK-NOBMI-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NOBMI-NEXT: andb $15, %dil -; CHECK-NOBMI-NEXT: andb $-16, %al -; CHECK-NOBMI-NEXT: orb %dil, %al +; CHECK-NOBMI-NEXT: andb $-16, %sil +; CHECK-NOBMI-NEXT: leal (%rsi,%rdi), %eax ; CHECK-NOBMI-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out8_constmask: ; CHECK-BMI: # %bb.0: -; CHECK-BMI-NEXT: movl %esi, %eax +; CHECK-BMI-NEXT: # kill: def $esi 
killed $esi def $rsi +; CHECK-BMI-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-BMI-NEXT: andb $15, %dil -; CHECK-BMI-NEXT: andb $-16, %al -; CHECK-BMI-NEXT: orb %dil, %al +; CHECK-BMI-NEXT: andb $-16, %sil +; CHECK-BMI-NEXT: leal (%rsi,%rdi), %eax ; CHECK-BMI-NEXT: # kill: def $al killed $al killed $eax ; CHECK-BMI-NEXT: retq %mx = and i8 %x, 15 diff --git a/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll b/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll index 4a63eba04d8..78faa3ca717 100644 --- a/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll +++ b/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll @@ -10,19 +10,21 @@ define i8 @out8_constmask(i8 %x, i8 %y) { ; CHECK-NOBMI-LABEL: out8_constmask: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movl %esi, %eax +; CHECK-NOBMI-NEXT: # kill: def $esi killed $esi def $rsi +; CHECK-NOBMI-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NOBMI-NEXT: andb $15, %dil -; CHECK-NOBMI-NEXT: andb $-16, %al -; CHECK-NOBMI-NEXT: orb %dil, %al +; CHECK-NOBMI-NEXT: andb $-16, %sil +; CHECK-NOBMI-NEXT: leal (%rsi,%rdi), %eax ; CHECK-NOBMI-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out8_constmask: ; CHECK-BMI: # %bb.0: -; CHECK-BMI-NEXT: movl %esi, %eax +; CHECK-BMI-NEXT: # kill: def $esi killed $esi def $rsi +; CHECK-BMI-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-BMI-NEXT: andb $15, %dil -; CHECK-BMI-NEXT: andb $-16, %al -; CHECK-BMI-NEXT: orb %dil, %al +; CHECK-BMI-NEXT: andb $-16, %sil +; CHECK-BMI-NEXT: leal (%rsi,%rdi), %eax ; CHECK-BMI-NEXT: # kill: def $al killed $al killed $eax ; CHECK-BMI-NEXT: retq %mx = and i8 %x, 15 diff --git a/test/CodeGen/X86/vector-bitreverse.ll b/test/CodeGen/X86/vector-bitreverse.ll index df1b5041caf..a564bbc1743 100644 --- a/test/CodeGen/X86/vector-bitreverse.ll +++ b/test/CodeGen/X86/vector-bitreverse.ll @@ -14,39 +14,39 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind { ; SSE-LABEL: test_bitreverse_i8: ; SSE: # %bb.0: +; SSE-NEXT: # kill: def $edi killed $edi def $rdi +; SSE-NEXT: rolb $4, %dil ; SSE-NEXT: movl %edi, %eax -; SSE-NEXT: rolb $4, %al -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: andb $51, %cl -; SSE-NEXT: shlb $2, %cl -; SSE-NEXT: andb $-52, %al -; SSE-NEXT: shrb $2, %al -; SSE-NEXT: orb %cl, %al -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: andb $85, %cl -; SSE-NEXT: addb %cl, %cl -; SSE-NEXT: andb $-86, %al -; SSE-NEXT: shrb %al -; SSE-NEXT: orb %cl, %al +; SSE-NEXT: andb $51, %al +; SSE-NEXT: shlb $2, %al +; SSE-NEXT: andb $-52, %dil +; SSE-NEXT: shrb $2, %dil +; SSE-NEXT: orb %al, %dil +; SSE-NEXT: movl %edi, %eax +; SSE-NEXT: andb $85, %al +; SSE-NEXT: addb %al, %al +; SSE-NEXT: andb $-86, %dil +; SSE-NEXT: shrb %dil +; SSE-NEXT: leal (%rdi,%rax), %eax ; SSE-NEXT: # kill: def $al killed $al killed $eax ; SSE-NEXT: retq ; ; AVX-LABEL: test_bitreverse_i8: ; AVX: # %bb.0: +; AVX-NEXT: # kill: def $edi killed $edi def $rdi +; AVX-NEXT: rolb $4, %dil ; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: rolb $4, %al -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: andb $51, %cl -; AVX-NEXT: shlb $2, %cl -; AVX-NEXT: andb $-52, %al -; AVX-NEXT: shrb $2, %al -; AVX-NEXT: orb %cl, %al -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: andb $85, %cl -; AVX-NEXT: addb %cl, %cl -; AVX-NEXT: andb $-86, %al -; AVX-NEXT: shrb %al -; AVX-NEXT: orb %cl, %al +; AVX-NEXT: andb $51, %al +; AVX-NEXT: shlb $2, %al +; AVX-NEXT: andb $-52, %dil +; AVX-NEXT: shrb $2, %dil +; AVX-NEXT: orb %al, %dil +; AVX-NEXT: movl %edi, %eax +; 
AVX-NEXT: andb $85, %al +; AVX-NEXT: addb %al, %al +; AVX-NEXT: andb $-86, %dil +; AVX-NEXT: shrb %dil +; AVX-NEXT: leal (%rdi,%rax), %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq ; -- 2.50.1
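
Why the transformation is legal: OR and ADD compute the same result whenever the two operands have no set bits in common, because with disjoint bits the addition generates no carries. That is the property the *_DB ("disjoint bits") pseudo-instructions encode, and it is what lets the byte-sized `orb` in the tests above be re-emitted as a three-address `leal (%rdi,%rax), %eax`, so the destination no longer has to be tied to a source register. Below is a minimal standalone C++ sketch, not part of the patch, that exhaustively checks the identity for all 8-bit operand pairs:

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  // For every pair of bytes with disjoint set bits, OR and ADD agree:
  // no bit position is set in both operands, so the add never carries.
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B)
      if ((A & B) == 0)
        assert(static_cast<std::uint8_t>(A | B) ==
               static_cast<std::uint8_t>(A + B));
  std::puts("OK: a | b == a + b for all disjoint 8-bit pairs");
  return 0;
}

This is the same identity the existing 16/32/64-bit ADD*_DB pseudos already exploit; the patch adds the 8-bit variants and routes them through convertToThreeAddressWithLEA, which widens the byte operands into LEA-capable 32-bit registers (hence the extra `# kill: def $edi killed $edi def $rdi` lines in the updated tests).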