From e26bd3af58c7d43bf74f658cd72febea08a915f1 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Tue, 28 Mar 2017 17:23:49 +0000
Subject: [PATCH] [x86] use VPMOVMSK to replace memcmp libcalls for 32-byte
 equality

Follow-up to:
https://reviews.llvm.org/rL298775

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@298933 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../SelectionDAG/SelectionDAGBuilder.cpp | 16 ++---
 lib/Target/X86/X86ISelLowering.cpp       |  6 +-
 test/CodeGen/X86/memcmp.ll               | 58 +++++++++++++------
 3 files changed, 52 insertions(+), 28 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 25523e52af1..9a4d44842ea 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6069,20 +6069,20 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
   // supports the MVT we'll be loading or if it is small enough (<= 4) that
   // we'll only produce a small number of byte loads.
   MVT LoadVT;
-  switch (CSize->getZExtValue()) {
+  unsigned NumBitsToCompare = CSize->getZExtValue() * 8;
+  switch (NumBitsToCompare) {
   default:
     return false;
-  case 2:
+  case 16:
     LoadVT = MVT::i16;
     break;
-  case 4:
+  case 32:
     LoadVT = MVT::i32;
     break;
-  case 8:
-    LoadVT = hasFastLoadsAndCompare(64);
-    break;
-  case 16:
-    LoadVT = hasFastLoadsAndCompare(128);
+  case 64:
+  case 128:
+  case 256:
+    LoadVT = hasFastLoadsAndCompare(NumBitsToCompare);
     break;
   }

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 88e09cd56f8..eab398ac3bb 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -4646,8 +4646,12 @@ MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
   if (NumBits == 128 && isTypeLegal(MVT::v16i8))
     return MVT::v16i8;

+  // VPMOVMSKB can handle this.
+  if (NumBits == 256 && isTypeLegal(MVT::v32i8))
+    return MVT::v32i8;
+
   // TODO: Allow 64-bit type for 32-bit target.
-  // TODO: 256- and 512-bit types should be allowed, but make sure that those
+  // TODO: 512-bit types should be allowed, but make sure that those
   //       cases are handled in combineVectorSizedSetCCEquality().
   return MVT::INVALID_SIMPLE_VALUE_TYPE;
 }
diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll
index 1da6c254e07..ce1bb3b06ce 100644
--- a/test/CodeGen/X86/memcmp.ll
+++ b/test/CodeGen/X86/memcmp.ll
@@ -249,15 +249,25 @@ define i1 @length32(i8* %x, i8* %y) nounwind {
 ; X32-NEXT:    sete %al
 ; X32-NEXT:    retl
 ;
-; X64-LABEL: length32:
-; X64:       # BB#0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $32, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
+; SSE2-LABEL: length32:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movl $32, %edx
+; SSE2-NEXT:    callq memcmp
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    sete %al
+; SSE2-NEXT:    popq %rcx
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: length32:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX2-NEXT:    vpcmpeqb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT:    vpmovmskb %ymm0, %eax
+; AVX2-NEXT:    cmpl $-1, %eax
+; AVX2-NEXT:    sete %al
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind
   %cmp = icmp eq i32 %call, 0
   ret i1 %cmp
@@ -276,16 +276,26 @@ define i1 @length32_const(i8* %X, i32* nocapture %P) nounwind {
 ; X32-NEXT:    setne %al
 ; X32-NEXT:    retl
 ;
-; X64-LABEL: length32_const:
-; X64:       # BB#0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $.L.str, %esi
-; X64-NEXT:    movl $32, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
+; SSE2-LABEL: length32_const:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movl $.L.str, %esi
+; SSE2-NEXT:    movl $32, %edx
+; SSE2-NEXT:    callq memcmp
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    popq %rcx
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: length32_const:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX2-NEXT:    vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpmovmskb %ymm0, %eax
+; AVX2-NEXT:    cmpl $-1, %eax
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind
   %c = icmp ne i32 %m, 0
   ret i1 %c
-- 
2.40.0
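P.S. For illustration only, not part of the patch: a minimal C sketch of the
kind of source this change affects. The function name is_equal32 and the build
flags are assumptions; the expected instruction sequence is taken directly
from the AVX2 test checks above.

    #include <string.h>

    /* Hypothetical example, not from the patch. When compiled for an AVX2
       target (e.g. clang -O2 -mavx2 on x86-64), the 32-byte equality test
       below can now be expanded inline to the sequence checked in the AVX2
       tests above:

         vmovdqu   (%rdi), %ymm0         # load 32 bytes of x
         vpcmpeqb  (%rsi), %ymm0, %ymm0  # bytewise compare against y
         vpmovmskb %ymm0, %eax           # collapse the 32 lanes into a mask
         cmpl      $-1, %eax             # all-ones mask <=> all bytes equal
         sete      %al

       instead of setting up arguments and calling memcmp. */
    int is_equal32(const void *x, const void *y) {
      return memcmp(x, y, 32) == 0;
    }

As the SSE2 checks show, targets without a legal v32i8 type still emit the
memcmp libcall for 32-byte lengths.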