From: Sanjay Patel
Date: Tue, 20 Jun 2017 15:58:30 +0000 (+0000)
Subject: [x86] enable CGP memcmp() expansion for 2/4/8 byte sizes
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=aab686b3f7f5ac6d7e5dd815ce7dd869615a38e1;p=llvm

[x86] enable CGP memcmp() expansion for 2/4/8 byte sizes

There are a couple of potential improvements as seen in the IR and asm:
1. We're unnecessarily extending to a larger type to compare values.
2. The codegen for (select cond, -1, 1) could avoid a cmov.
(or we could change the order of the compares, so we have a select with a 0 operand)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@305802 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index eacf2e55143..8dfaf3f080e 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1662,6 +1662,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   MaxStoresPerMemcpyOptSize = 4;
   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
   MaxStoresPerMemmoveOptSize = 4;
+
+  // TODO: These control memcmp expansion in CGP and are set low to prevent
+  // altering the vector expansion for 16/32 byte memcmp in SelectionDAGBuilder.
+  MaxLoadsPerMemcmp = 1;
+  MaxLoadsPerMemcmpOptSize = 1;
+
   // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
   setPrefLoopAlignment(ExperimentalPrefLoopAlignment);

diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 1d58cccc308..f13933e9288 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2232,6 +2232,12 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller,
   return (CallerBits & CalleeBits) == CalleeBits;
 }

+bool X86TTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) {
+  // TODO: We can increase these based on available vector ops.
+  MaxLoadSize = ST->is64Bit() ? 8 : 4;
+  return true;
+}
+
 bool X86TTIImpl::enableInterleavedAccessVectorization() {
   // TODO: We expect this to be beneficial regardless of arch,
   // but there are currently some unexplained performance artifacts on Atom.
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
index b907b7556a1..375fb924c2c 100644
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -107,7 +107,7 @@ public:
   bool isLegalMaskedScatter(Type *DataType);
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const;
-
+  bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize);
   bool enableInterleavedAccessVectorization();
 private:
   int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll
index 4e2475b1c67..9d26aee2e8b 100644
--- a/test/CodeGen/X86/memcmp.ll
+++ b/test/CodeGen/X86/memcmp.ll
@@ -12,19 +12,46 @@ declare i32 @memcmp(i8*, i8*, i64)

 define i32 @length2(i8* %X, i8* %Y) nounwind {
 ; X32-LABEL: length2:
-; X32: # BB#0:
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $2
-; X32-NEXT: pushl {{[0-9]+}}(%esp)
-; X32-NEXT: pushl {{[0-9]+}}(%esp)
-; X32-NEXT: calll memcmp
-; X32-NEXT: addl $16, %esp
+; X32: # BB#0: # %loadbb
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movzwl (%ecx), %ecx
+; X32-NEXT: movzwl (%eax), %eax
+; X32-NEXT: rolw $8, %cx
+; X32-NEXT: rolw $8, %ax
+; X32-NEXT: movzwl %cx, %ecx
+; X32-NEXT: movzwl %ax, %eax
+; X32-NEXT: cmpl %eax, %ecx
+; X32-NEXT: je .LBB0_1
+; X32-NEXT: # BB#2: # %res_block
+; X32-NEXT: movl $-1, %eax
+; X32-NEXT: jb .LBB0_4
+; X32-NEXT: # BB#3: # %res_block
+; X32-NEXT: movl $1, %eax
+; X32-NEXT: .LBB0_4: # %endblock
+; X32-NEXT: retl
+; X32-NEXT: .LBB0_1:
+; X32-NEXT: xorl %eax, %eax
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: length2:
-; X64: # BB#0:
-; X64-NEXT: movl $2, %edx
-; X64-NEXT: jmp memcmp # TAILCALL
+; X64: # BB#0: # %loadbb
+; X64-NEXT: movzwl (%rdi), %eax
+; X64-NEXT: movzwl (%rsi), %ecx
+; X64-NEXT: rolw $8, %ax
+; X64-NEXT: rolw $8, %cx
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: movzwl %cx, %ecx
+; X64-NEXT: cmpq %rcx, %rax
+; X64-NEXT: je .LBB0_1
+; X64-NEXT: # BB#2: # %res_block
+; X64-NEXT: movl $-1, %ecx
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: cmovbl %ecx, %eax
+; X64-NEXT: retq
+; X64-NEXT: .LBB0_1:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind
   ret i32 %m
 }
@@ -145,19 +172,42 @@ define i1 @length3_eq(i8* %X, i8* %Y) nounwind {

 define i32 @length4(i8* %X, i8* %Y) nounwind {
 ; X32-LABEL: length4:
-; X32: # BB#0:
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $4
-; X32-NEXT: pushl {{[0-9]+}}(%esp)
-; X32-NEXT: pushl {{[0-9]+}}(%esp)
-; X32-NEXT: calll memcmp
-; X32-NEXT: addl $16, %esp
+; X32: # BB#0: # %loadbb
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl (%ecx), %ecx
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: bswapl %ecx
+; X32-NEXT: bswapl %eax
+; X32-NEXT: cmpl %eax, %ecx
+; X32-NEXT: je .LBB6_1
+; X32-NEXT: # BB#2: # %res_block
+; X32-NEXT: movl $-1, %eax
+; X32-NEXT: jb .LBB6_4
+; X32-NEXT: # BB#3: # %res_block
+; X32-NEXT: movl $1, %eax
+; X32-NEXT: .LBB6_4: # %endblock
+; X32-NEXT: retl
+; X32-NEXT: .LBB6_1:
+; X32-NEXT: xorl %eax, %eax
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: length4:
-; X64: # BB#0:
-; X64-NEXT: movl $4, %edx
-; X64-NEXT: jmp memcmp # TAILCALL
+; X64: # BB#0: # %loadbb
+; X64-NEXT: movl (%rdi), %eax
+; X64-NEXT: movl (%rsi), %ecx
+; X64-NEXT: bswapl %eax
+; X64-NEXT: bswapl %ecx
+; X64-NEXT: cmpq %rcx, %rax
+; X64-NEXT: je .LBB6_1
+; X64-NEXT: # BB#2: # %res_block
+; X64-NEXT: movl $-1, %ecx
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: cmovbl %ecx, %eax
+; X64-NEXT: retq
+; X64-NEXT: .LBB6_1:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind
   ret i32 %m
 }
@@ -259,9 +309,21 @@ define i32 @length8(i8* %X, i8* %Y) nounwind {
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: length8:
-; X64: # BB#0:
-; X64-NEXT: movl $8, %edx
-; X64-NEXT: jmp memcmp # TAILCALL
+; X64: # BB#0: # %loadbb
+; X64-NEXT: movq (%rdi), %rax
+; X64-NEXT: movq (%rsi), %rcx
+; X64-NEXT: bswapq %rax
+; X64-NEXT: bswapq %rcx
+; X64-NEXT: cmpq %rcx, %rax
+; X64-NEXT: je .LBB11_1
+; X64-NEXT: # BB#2: # %res_block
+; X64-NEXT: movl $-1, %ecx
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: cmovbl %ecx, %eax
+; X64-NEXT: retq
+; X64-NEXT: .LBB11_1:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind
   ret i32 %m
 }
diff --git a/test/Transforms/CodeGenPrepare/X86/memcmp.ll b/test/Transforms/CodeGenPrepare/X86/memcmp.ll
index 328e8cc2907..337889b34d6 100644
--- a/test/Transforms/CodeGenPrepare/X86/memcmp.ll
+++ b/test/Transforms/CodeGenPrepare/X86/memcmp.ll
@@ -6,9 +6,47 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 declare i32 @memcmp(i8* nocapture, i8* nocapture, i64)

 define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) {
-; ALL-LABEL: @cmp2(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 2)
-; ALL-NEXT: ret i32 [[CALL]]
+; X32-LABEL: @cmp2(
+; X32-NEXT: loadbb:
+; X32-NEXT: [[TMP0:%.*]] = bitcast i8* %x to i16*
+; X32-NEXT: [[TMP1:%.*]] = bitcast i8* %y to i16*
+; X32-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]]
+; X32-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]]
+; X32-NEXT: [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X32-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X32-NEXT: [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X32-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i32
+; X32-NEXT: [[TMP8:%.*]] = sub i32 [[TMP6]], [[TMP7]]
+; X32-NEXT: [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+; X32-NEXT: br i1 [[TMP9]], label %res_block, label %endblock
+; X32: res_block:
+; X32-NEXT: [[TMP10:%.*]] = icmp ult i32 [[TMP6]], [[TMP7]]
+; X32-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 -1, i32 1
+; X32-NEXT: br label %endblock
+; X32: endblock:
+; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, %loadbb ], [ [[TMP11]], %res_block ]
+; X32-NEXT: ret i32 [[PHI_RES]]
+;
+; X64-LABEL: @cmp2(
+; X64-NEXT: loadbb:
+; X64-NEXT: [[TMP0:%.*]] = bitcast i8* %x to i16*
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* %y to i16*
+; X64-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]]
+; X64-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]]
+; X64-NEXT: [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-NEXT: [[TMP6:%.*]] = zext i16 [[TMP4]] to i64
+; X64-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i64
+; X64-NEXT: [[TMP8:%.*]] = sub i64 [[TMP6]], [[TMP7]]
+; X64-NEXT: [[TMP9:%.*]] = icmp ne i64 [[TMP8]], 0
+; X64-NEXT: br i1 [[TMP9]], label %res_block, label %endblock
+; X64: res_block:
+; X64-NEXT: [[TMP10:%.*]] = icmp ult i64 [[TMP6]], [[TMP7]]
+; X64-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 -1, i32 1
+; X64-NEXT: br label %endblock
+; X64: endblock:
+; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, %loadbb ], [ [[TMP11]], %res_block ]
+; X64-NEXT: ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 2)
   ret i32 %call
@@ -24,9 +62,45 @@ define i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y) {
 }

 define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) {
-; ALL-LABEL: @cmp4(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 4)
-; ALL-NEXT: ret i32 [[CALL]]
+; X32-LABEL: @cmp4(
+; X32-NEXT: loadbb:
+; X32-NEXT: [[TMP0:%.*]] = bitcast i8* %x to i32*
+; X32-NEXT: [[TMP1:%.*]] = bitcast i8* %y to i32*
+; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
+; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
+; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X32-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[TMP5]]
+; X32-NEXT: [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; X32-NEXT: br i1 [[TMP7]], label %res_block, label %endblock
+; X32: res_block:
+; X32-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]]
+; X32-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 -1, i32 1
+; X32-NEXT: br label %endblock
+; X32: endblock:
+; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, %loadbb ], [ [[TMP9]], %res_block ]
+; X32-NEXT: ret i32 [[PHI_RES]]
+;
+; X64-LABEL: @cmp4(
+; X64-NEXT: loadbb:
+; X64-NEXT: [[TMP0:%.*]] = bitcast i8* %x to i32*
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* %y to i32*
+; X64-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
+; X64-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
+; X64-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT: [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; X64-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64
+; X64-NEXT: [[TMP8:%.*]] = sub i64 [[TMP6]], [[TMP7]]
+; X64-NEXT: [[TMP9:%.*]] = icmp ne i64 [[TMP8]], 0
+; X64-NEXT: br i1 [[TMP9]], label %res_block, label %endblock
+; X64: res_block:
+; X64-NEXT: [[TMP10:%.*]] = icmp ult i64 [[TMP6]], [[TMP7]]
+; X64-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 -1, i32 1
+; X64-NEXT: br label %endblock
+; X64: endblock:
+; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, %loadbb ], [ [[TMP11]], %res_block ]
+; X64-NEXT: ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 4)
   ret i32 %call
@@ -60,9 +134,28 @@ define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) {
 }

 define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) {
-; ALL-LABEL: @cmp8(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 8)
-; ALL-NEXT: ret i32 [[CALL]]
+; X32-LABEL: @cmp8(
+; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 8)
+; X32-NEXT: ret i32 [[CALL]]
+;
+; X64-LABEL: @cmp8(
+; X64-NEXT: loadbb:
+; X64-NEXT: [[TMP0:%.*]] = bitcast i8* %x to i64*
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* %y to i64*
+; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
+; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
+; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT: [[TMP6:%.*]] = sub i64 [[TMP4]], [[TMP5]]
+; X64-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0
+; X64-NEXT: br i1 [[TMP7]], label %res_block, label %endblock
+; X64: res_block:
+; X64-NEXT: [[TMP8:%.*]] = icmp ult i64 [[TMP4]], [[TMP5]]
+; X64-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 -1, i32 1
+; X64-NEXT: br label %endblock
+; X64: endblock:
+; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, %loadbb ], [ [[TMP9]], %res_block ]
+; X64-NEXT: ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 8)
   ret i32 %call
@@ -142,8 +235,13 @@ define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y) {

 define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) {
 ; ALL-LABEL: @cmp_eq2(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 2)
-; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* %x to i16*
+; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* %y to i16*
+; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]]
+; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]]
+; ALL-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]]
+; ALL-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
+; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; ALL-NEXT: ret i32 [[CONV]]
 ;
@@ -168,8 +266,13 @@ define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y) {

 define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) {
 ; ALL-LABEL: @cmp_eq4(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 4)
-; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* %x to i32*
+; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* %y to i32*
+; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
+; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]]
+; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
+; ALL-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
+; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; ALL-NEXT: ret i32 [[CONV]]
 ;
@@ -219,11 +322,22 @@ define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) {
 }

 define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) {
-; ALL-LABEL: @cmp_eq8(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 8)
-; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
-; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; ALL-NEXT: ret i32 [[CONV]]
+; X32-LABEL: @cmp_eq8(
+; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 8)
+; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; X32-NEXT: ret i32 [[CONV]]
+;
+; X64-LABEL: @cmp_eq8(
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* %x to i64*
+; X64-NEXT: [[TMP2:%.*]] = bitcast i8* %y to i64*
+; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
+; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
+; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
+; X64-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
+; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; X64-NEXT: ret i32 [[CONV]]
 ;
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 8)
   %cmp = icmp eq i32 %call, 0
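
For readers skimming the generated checks above, here is a rough C-level model of what the enabled expansion does for a 4-byte memcmp on little-endian x86. This is only an illustrative sketch, not code from the patch; the helper name memcmp4 and the use of the GCC/Clang __builtin_bswap32 builtin are assumptions made for the example.

// Illustrative sketch (assumed names, not part of the patch): the scalar
// equivalent of the expansion CGP now emits for a 4-byte memcmp.
#include <cstdint>
#include <cstring>

static int memcmp4(const void *X, const void *Y) {
  uint32_t A, B;
  std::memcpy(&A, X, sizeof(A));   // one 32-bit load per operand (the "loadbb")
  std::memcpy(&B, Y, sizeof(B));
  A = __builtin_bswap32(A);        // byte-swap so an unsigned integer compare
  B = __builtin_bswap32(B);        // matches memcmp's lexicographic byte order
  if (A == B)
    return 0;                      // the "endblock" phi of 0
  return A < B ? -1 : 1;           // the (select cond, -1, 1) in "res_block"
}

When only equality is needed, as in the cmp_eq2/cmp_eq4/cmp_eq8 tests, the byte swaps are unnecessary and the expansion reduces to a plain load/load/icmp-ne sequence, which is exactly what the ALL-prefixed checks show.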