From e26bd3af58c7d43bf74f658cd72febea08a915f1 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Tue, 28 Mar 2017 17:23:49 +0000
Subject: [PATCH] [x86] use VPMOVMSK to replace memcmp libcalls for 32-byte
 equality

Follow-up to:
https://reviews.llvm.org/rL298775

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@298933 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../SelectionDAG/SelectionDAGBuilder.cpp | 16 ++---
 lib/Target/X86/X86ISelLowering.cpp       |  6 +-
 test/CodeGen/X86/memcmp.ll               | 58 +++++++++++++------
 3 files changed, 52 insertions(+), 28 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 25523e52af1..9a4d44842ea 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6069,20 +6069,20 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
   // supports the MVT we'll be loading or if it is small enough (<= 4) that
   // we'll only produce a small number of byte loads.
   MVT LoadVT;
-  switch (CSize->getZExtValue()) {
+  unsigned NumBitsToCompare = CSize->getZExtValue() * 8;
+  switch (NumBitsToCompare) {
   default:
     return false;
-  case 2:
+  case 16:
     LoadVT = MVT::i16;
     break;
-  case 4:
+  case 32:
     LoadVT = MVT::i32;
     break;
-  case 8:
-    LoadVT = hasFastLoadsAndCompare(64);
-    break;
-  case 16:
-    LoadVT = hasFastLoadsAndCompare(128);
+  case 64:
+  case 128:
+  case 256:
+    LoadVT = hasFastLoadsAndCompare(NumBitsToCompare);
     break;
   }

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 88e09cd56f8..eab398ac3bb 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -4646,8 +4646,12 @@ MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
   if (NumBits == 128 && isTypeLegal(MVT::v16i8))
     return MVT::v16i8;

+  // VPMOVMSKB can handle this.
+  if (NumBits == 256 && isTypeLegal(MVT::v32i8))
+    return MVT::v32i8;
+
   // TODO: Allow 64-bit type for 32-bit target.
-  // TODO: 256- and 512-bit types should be allowed, but make sure that those
+  // TODO: 512-bit types should be allowed, but make sure that those
   //       cases are handled in combineVectorSizedSetCCEquality().
   return MVT::INVALID_SIMPLE_VALUE_TYPE;
 }
diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll
index 1da6c254e07..ce1bb3b06ce 100644
--- a/test/CodeGen/X86/memcmp.ll
+++ b/test/CodeGen/X86/memcmp.ll
@@ -249,15 +249,25 @@ define i1 @length32(i8* %x, i8* %y) nounwind {
 ; X32-NEXT:    sete %al
 ; X32-NEXT:    retl
 ;
-; X64-LABEL: length32:
-; X64:       # BB#0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $32, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
+; SSE2-LABEL: length32:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movl $32, %edx
+; SSE2-NEXT:    callq memcmp
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    sete %al
+; SSE2-NEXT:    popq %rcx
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: length32:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX2-NEXT:    vpcmpeqb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT:    vpmovmskb %ymm0, %eax
+; AVX2-NEXT:    cmpl $-1, %eax
+; AVX2-NEXT:    sete %al
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind
   %cmp = icmp eq i32 %call, 0
   ret i1 %cmp
@@ -276,16 +276,26 @@ define i1 @length32_const(i8* %X, i32* nocapture %P) nounwind {
 ; X32-NEXT:    setne %al
 ; X32-NEXT:    retl
 ;
-; X64-LABEL: length32_const:
-; X64:       # BB#0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $.L.str, %esi
-; X64-NEXT:    movl $32, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
+; SSE2-LABEL: length32_const:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movl $.L.str, %esi
+; SSE2-NEXT:    movl $32, %edx
+; SSE2-NEXT:    callq memcmp
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    popq %rcx
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: length32_const:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX2-NEXT:    vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpmovmskb %ymm0, %eax
+; AVX2-NEXT:    cmpl $-1, %eax
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind
   %c = icmp ne i32 %m, 0
   ret i1 %c
-- 
2.40.0
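P.S. For illustration only, not part of the patch: a minimal C sketch of the
kind of source this change affects. The function name is_equal32 and the build
flags are assumptions; the expected instruction sequence is taken directly
from the AVX2 test checks above.

    #include <string.h>

    /* Hypothetical example, not from the patch. When compiled for an AVX2
       target (e.g. clang -O2 -mavx2 on x86-64), the 32-byte equality test
       below can now be expanded inline to the sequence checked in the AVX2
       tests above:

         vmovdqu   (%rdi), %ymm0         # load 32 bytes of x
         vpcmpeqb  (%rsi), %ymm0, %ymm0  # bytewise compare against y
         vpmovmskb %ymm0, %eax           # collapse the 32 lanes into a mask
         cmpl      $-1, %eax             # all-ones mask <=> all bytes equal
         sete      %al

       instead of setting up arguments and calling memcmp. */
    int is_equal32(const void *x, const void *y) {
      return memcmp(x, y, 32) == 0;
    }

As the SSE2 checks show, targets without a legal v32i8 type still emit the
memcmp libcall for 32-byte lengths.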