From: Sanjay Patel
Date: Sat, 25 Mar 2017 16:05:33 +0000 (+0000)
Subject: [x86] use PMOVMSK to replace memcmp libcalls for 16-byte equality
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=a39e64c0aa123a11f3fe6144e29fb9c84cd6612d;p=llvm

[x86] use PMOVMSK to replace memcmp libcalls for 16-byte equality

This is the payoff for D31156 - if a target has efficient comparison
instructions for vector-sized equality, we can replace memcmp calls with
inline code that is both smaller and faster.

Differential Revision: https://reviews.llvm.org/D31290

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@298775 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 59a2275d6a3..8f4c9adb780 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -437,6 +437,15 @@ public:
     return false;
   }
 
+  /// Return the preferred operand type if the target has a quick way to compare
+  /// integer values of the given size. Assume that any legal integer type can
+  /// be compared efficiently. Targets may override this to allow illegal wide
+  /// types to return a vector type if there is support to compare that type.
+  virtual MVT hasFastEqualityCompare(unsigned NumBits) const {
+    MVT VT = MVT::getIntegerVT(NumBits);
+    return isTypeLegal(VT) ? VT : MVT::INVALID_SIMPLE_VALUE_TYPE;
+  }
+
   /// Return true if the target should transform:
   /// (X & Y) == Y ---> (~X & Y) == 0
   /// (X & Y) != Y ---> (~X & Y) != 0
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 76b80794af8..25523e52af1 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5955,13 +5955,17 @@ static bool IsOnlyUsedInZeroEqualityComparison(const Value *V) {
 }
 
 static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
-                             Type *LoadTy,
                              SelectionDAGBuilder &Builder) {
 
   // Check to see if this load can be trivially constant folded, e.g. if the
   // input is from a string literal.
   if (const Constant *LoadInput = dyn_cast<Constant>(PtrVal)) {
     // Cast pointer to the type we really want to load.
+    Type *LoadTy =
+        Type::getIntNTy(PtrVal->getContext(), LoadVT.getScalarSizeInBits());
+    if (LoadVT.isVector())
+      LoadTy = VectorType::get(LoadTy, LoadVT.getVectorNumElements());
+
     LoadInput = ConstantExpr::getBitCast(const_cast<Constant *>(LoadInput),
                                          PointerType::getUnqual(LoadTy));
@@ -6039,57 +6043,64 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
   if (!CSize || !IsOnlyUsedInZeroEqualityComparison(&I))
     return false;
 
+  // If the target has a fast compare for the given size, it will return a
+  // preferred load type for that size. Require that the load VT is legal and
+  // that the target supports unaligned loads of that type. Otherwise, return
+  // INVALID.
+  auto hasFastLoadsAndCompare = [&](unsigned NumBits) {
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    MVT LVT = TLI.hasFastEqualityCompare(NumBits);
+    if (LVT != MVT::INVALID_SIMPLE_VALUE_TYPE) {
+      // TODO: Handle 5 byte compare as 4-byte + 1 byte.
+      // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
+      // TODO: Check alignment of src and dest ptrs.
+      unsigned DstAS = LHS->getType()->getPointerAddressSpace();
+      unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
+      if (!TLI.isTypeLegal(LVT) ||
+          !TLI.allowsMisalignedMemoryAccesses(LVT, SrcAS) ||
+          !TLI.allowsMisalignedMemoryAccesses(LVT, DstAS))
+        LVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
+    }
+
+    return LVT;
+  };
+
+  // This turns into unaligned loads. We only do this if the target natively
+  // supports the MVT we'll be loading or if it is small enough (<= 4) that
+  // we'll only produce a small number of byte loads.
   MVT LoadVT;
-  Type *LoadTy;
   switch (CSize->getZExtValue()) {
   default:
     return false;
   case 2:
     LoadVT = MVT::i16;
-    LoadTy = Type::getInt16Ty(CSize->getContext());
     break;
   case 4:
     LoadVT = MVT::i32;
-    LoadTy = Type::getInt32Ty(CSize->getContext());
     break;
   case 8:
-    LoadVT = MVT::i64;
-    LoadTy = Type::getInt64Ty(CSize->getContext());
+    LoadVT = hasFastLoadsAndCompare(64);
     break;
-  /*
   case 16:
-    LoadVT = MVT::v4i32;
-    LoadTy = Type::getInt32Ty(CSize->getContext());
-    LoadTy = VectorType::get(LoadTy, 4);
+    LoadVT = hasFastLoadsAndCompare(128);
     break;
-  */
   }
 
-  // This turns into unaligned loads. We only do this if the target natively
-  // supports the MVT we'll be loading or if it is small enough (<= 4) that
-  // we'll only produce a small number of byte loads.
+  if (LoadVT == MVT::INVALID_SIMPLE_VALUE_TYPE)
+    return false;
 
-  // Require that we can find a legal MVT, and only do this if the target
-  // supports unaligned loads of that type. Expanding into byte loads would
-  // bloat the code.
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (CSize->getZExtValue() > 4) {
-    unsigned DstAS = LHS->getType()->getPointerAddressSpace();
-    unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
-    // TODO: Handle 5 byte compare as 4-byte + 1 byte.
-    // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
-    // TODO: Check alignment of src and dest ptrs.
-    if (!TLI.isTypeLegal(LoadVT) ||
-        !TLI.allowsMisalignedMemoryAccesses(LoadVT, SrcAS) ||
-        !TLI.allowsMisalignedMemoryAccesses(LoadVT, DstAS))
-      return false;
+  SDValue LoadL = getMemCmpLoad(LHS, LoadVT, *this);
+  SDValue LoadR = getMemCmpLoad(RHS, LoadVT, *this);
+
+  // Bitcast to a wide integer type if the loads are vectors.
+  if (LoadVT.isVector()) {
+    EVT CmpVT = EVT::getIntegerVT(LHS->getContext(), LoadVT.getSizeInBits());
+    LoadL = DAG.getBitcast(CmpVT, LoadL);
+    LoadR = DAG.getBitcast(CmpVT, LoadR);
   }
 
-  SDValue LHSVal = getMemCmpLoad(LHS, LoadVT, LoadTy, *this);
-  SDValue RHSVal = getMemCmpLoad(RHS, LoadVT, LoadTy, *this);
-  SDValue SetCC =
-      DAG.getSetCC(getCurSDLoc(), MVT::i1, LHSVal, RHSVal, ISD::SETNE);
-  processIntegerCallValue(I, SetCC, false);
+  SDValue Cmp = DAG.getSetCC(getCurSDLoc(), MVT::i1, LoadL, LoadR, ISD::SETNE);
+  processIntegerCallValue(I, Cmp, false);
   return true;
 }
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 4b807ebcba8..dc6bc115a16 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -4637,6 +4637,22 @@ bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
   return true;
 }
 
+MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
+  MVT VT = MVT::getIntegerVT(NumBits);
+  if (isTypeLegal(VT))
+    return VT;
+
+  // PMOVMSKB can handle this.
+  if (NumBits == 128 && isTypeLegal(MVT::v16i8))
+    return MVT::v16i8;
+
+  // TODO: Allow 64-bit type for 32-bit target.
+  // TODO: 256- and 512-bit types should be allowed, but make sure that those
+  // cases are handled in combineVectorSizedSetCCEquality().
+
+  return MVT::INVALID_SIMPLE_VALUE_TYPE;
+}
+
 /// Val is the undef sentinel value or equal to the specified value.
 static bool isUndefOrEqual(int Val, int CmpVal) {
   return ((Val == SM_SentinelUndef) || (Val == CmpVal));
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index e38724c3af5..3a3f6807374 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -815,6 +815,9 @@ namespace llvm {
 
     bool hasAndNotCompare(SDValue Y) const override;
 
+    /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
+    MVT hasFastEqualityCompare(unsigned NumBits) const override;
+
     /// Return the value type to use for ISD::SETCC.
     EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                            EVT VT) const override;
diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll
index 307dd81f764..b9c1da5248b 100644
--- a/test/CodeGen/X86/memcmp.ll
+++ b/test/CodeGen/X86/memcmp.ll
@@ -179,12 +179,12 @@ define i1 @length16(i8* %x, i8* %y) nounwind {
 ;
 ; X64-LABEL: length16:
 ; X64:       # BB#0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $16, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    movdqu (%rsi), %xmm0
+; X64-NEXT:    movdqu (%rdi), %xmm1
+; X64-NEXT:    pcmpeqb %xmm0, %xmm1
+; X64-NEXT:    pmovmskb %xmm1, %eax
+; X64-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
 ; X64-NEXT:    retq
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind
   %cmp = icmp ne i32 %call, 0
@@ -206,13 +206,11 @@ define i1 @length16_const(i8* %X, i32* nocapture %P) nounwind {
 ;
 ; X64-LABEL: length16_const:
 ; X64:       # BB#0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $.L.str, %esi
-; X64-NEXT:    movl $16, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    movdqu (%rdi), %xmm0
+; X64-NEXT:    pcmpeqb {{.*}}(%rip), %xmm0
+; X64-NEXT:    pmovmskb %xmm0, %eax
+; X64-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind
   %c = icmp eq i32 %m, 0
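---

To make the new lowering concrete: the PCMPEQB + PMOVMSKB sequence checked in
the updated test corresponds to the C++ sketch below, built on SSE2 intrinsics.
This is an illustration of the emitted code shape, not code from this patch,
and the helper name equal16 is hypothetical.

  #include <emmintrin.h> // SSE2 intrinsics
  #include <cstdio>

  // The code shape the x86 backend now emits for a 16-byte memcmp
  // equality test (illustration only).
  static bool equal16(const void *X, const void *Y) {
    // movdqu: unaligned 16-byte loads.
    __m128i VX = _mm_loadu_si128(static_cast<const __m128i *>(X));
    __m128i VY = _mm_loadu_si128(static_cast<const __m128i *>(Y));
    // pcmpeqb: a byte lane becomes 0xFF where the inputs match, else 0x00.
    __m128i Eq = _mm_cmpeq_epi8(VX, VY);
    // pmovmskb: gather the 16 lane sign bits into an integer mask;
    // all bytes are equal iff the mask is 0xFFFF.
    return _mm_movemask_epi8(Eq) == 0xFFFF;
  }

  int main() {
    char A[16] = "0123456789abcde";
    char B[16] = "0123456789abcde";
    printf("%d\n", equal16(A, B)); // prints 1
  }

So memcmp(x, y, 16) == 0 now costs two unaligned vector loads, one vector
compare, one mask extraction, and one scalar compare, instead of a libcall
plus the stack adjustment around it, as the pushq/popq-free assembly in the
updated test shows.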