}
static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
-                             Type *LoadTy,
                             SelectionDAGBuilder &Builder) {
  // Check to see if this load can be trivially constant folded, e.g. if the
  // input is from a string literal.
  if (const Constant *LoadInput = dyn_cast<Constant>(PtrVal)) {
    // Cast pointer to the type we really want to load.
+    Type *LoadTy =
+        Type::getIntNTy(PtrVal->getContext(), LoadVT.getScalarSizeInBits());
+    if (LoadVT.isVector())
+      LoadTy = VectorType::get(LoadTy, LoadVT.getVectorNumElements());
+
    LoadInput = ConstantExpr::getBitCast(const_cast<Constant *>(LoadInput),
                                         PointerType::getUnqual(LoadTy));
  if (!CSize || !IsOnlyUsedInZeroEqualityComparison(&I))
    return false;
+  // If the target has a fast compare for the given size, it will return a
+  // preferred load type for that size. Require that the load VT is legal and
+  // that the target supports unaligned loads of that type. Otherwise, return
+  // INVALID.
+  auto hasFastLoadsAndCompare = [&](unsigned NumBits) {
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    MVT LVT = TLI.hasFastEqualityCompare(NumBits);
+    if (LVT != MVT::INVALID_SIMPLE_VALUE_TYPE) {
+      // TODO: Handle 5 byte compare as 4-byte + 1 byte.
+      // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
+      // TODO: Check alignment of src and dest ptrs.
+      unsigned DstAS = LHS->getType()->getPointerAddressSpace();
+      unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
+      if (!TLI.isTypeLegal(LVT) ||
+          !TLI.allowsMisalignedMemoryAccesses(LVT, SrcAS) ||
+          !TLI.allowsMisalignedMemoryAccesses(LVT, DstAS))
+        LVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
+    }
+
+    return LVT;
+  };
+
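The lambda relies on the new TargetLowering::hasFastEqualityCompare hook: a target that has a cheap wide equality compare returns a preferred (possibly vector) load type for the given bit width, and anything else comes back as MVT::INVALID_SIMPLE_VALUE_TYPE. A minimal sketch of a target override, assuming an SSE2-style x86 subtarget where v16i8 is legal (illustrative only; the in-tree override is not shown in this hunk):

// Sketch of a target override (assumption, not part of this patch excerpt):
// report a fast 128-bit equality compare as a byte vector so the expansion
// above emits one vector load per memcmp operand.
MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
  if (NumBits == 128 && isTypeLegal(MVT::v16i8))
    return MVT::v16i8; // ends up as movdqu + pcmpeqb + pmovmskb
  return MVT::INVALID_SIMPLE_VALUE_TYPE;
}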
+  // This turns into unaligned loads. We only do this if the target natively
+  // supports the MVT we'll be loading or if it is small enough (<= 4) that
+  // we'll only produce a small number of byte loads.
  MVT LoadVT;
-  Type *LoadTy;
  switch (CSize->getZExtValue()) {
  default:
    return false;
  case 2:
    LoadVT = MVT::i16;
-    LoadTy = Type::getInt16Ty(CSize->getContext());
    break;
  case 4:
    LoadVT = MVT::i32;
-    LoadTy = Type::getInt32Ty(CSize->getContext());
    break;
  case 8:
-    LoadVT = MVT::i64;
-    LoadTy = Type::getInt64Ty(CSize->getContext());
+    LoadVT = hasFastLoadsAndCompare(64);
    break;
-  /*
  case 16:
-    LoadVT = MVT::v4i32;
-    LoadTy = Type::getInt32Ty(CSize->getContext());
-    LoadTy = VectorType::get(LoadTy, 4);
+    LoadVT = hasFastLoadsAndCompare(128);
    break;
-  */
  }
-  // This turns into unaligned loads. We only do this if the target natively
-  // supports the MVT we'll be loading or if it is small enough (<= 4) that
-  // we'll only produce a small number of byte loads.
+  if (LoadVT == MVT::INVALID_SIMPLE_VALUE_TYPE)
+    return false;
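In effect, an equality-only memcmp of constant size is replaced by two possibly-misaligned loads and a single compare. A conceptual C++ sketch of the 8-byte case (assumptions: memcmp8_ne is a made-up name, and the real lowering builds SelectionDAG nodes rather than this source):

#include <cstdint>
#include <cstring>

// Conceptual model of the expansion of memcmp(p, q, 8) != 0.
bool memcmp8_ne(const void *p, const void *q) {
  uint64_t A, B;
  std::memcpy(&A, p, 8); // one (possibly misaligned) i64 load; hence the
  std::memcpy(&B, q, 8); // allowsMisalignedMemoryAccesses checks above
  return A != B;         // a single SETNE replaces the libcall
}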
-  // Require that we can find a legal MVT, and only do this if the target
-  // supports unaligned loads of that type. Expanding into byte loads would
-  // bloat the code.
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (CSize->getZExtValue() > 4) {
-    unsigned DstAS = LHS->getType()->getPointerAddressSpace();
-    unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
-    // TODO: Handle 5 byte compare as 4-byte + 1 byte.
-    // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
-    // TODO: Check alignment of src and dest ptrs.
-    if (!TLI.isTypeLegal(LoadVT) ||
-        !TLI.allowsMisalignedMemoryAccesses(LoadVT, SrcAS) ||
-        !TLI.allowsMisalignedMemoryAccesses(LoadVT, DstAS))
-      return false;
+  SDValue LoadL = getMemCmpLoad(LHS, LoadVT, *this);
+  SDValue LoadR = getMemCmpLoad(RHS, LoadVT, *this);
+
+  // Bitcast to a wide integer type if the loads are vectors.
+  if (LoadVT.isVector()) {
+    EVT CmpVT = EVT::getIntegerVT(LHS->getContext(), LoadVT.getSizeInBits());
+    LoadL = DAG.getBitcast(CmpVT, LoadL);
+    LoadR = DAG.getBitcast(CmpVT, LoadR);
  }
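The bitcast is needed because a SETCC producing a single i1 cannot take vector operands (a vector compare would yield a vector of booleans), so the two v16i8 loads are compared as one wide integer, roughly (setne:i1 (bitcast:i128 LoadL), (bitcast:i128 LoadR)); a target can then match that pattern to a vector compare plus mask test, as the updated x86 checks below show.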
-  SDValue LHSVal = getMemCmpLoad(LHS, LoadVT, LoadTy, *this);
-  SDValue RHSVal = getMemCmpLoad(RHS, LoadVT, LoadTy, *this);
-  SDValue SetCC =
-      DAG.getSetCC(getCurSDLoc(), MVT::i1, LHSVal, RHSVal, ISD::SETNE);
-  processIntegerCallValue(I, SetCC, false);
+  SDValue Cmp = DAG.getSetCC(getCurSDLoc(), MVT::i1, LoadL, LoadR, ISD::SETNE);
+  processIntegerCallValue(I, Cmp, false);
  return true;
}
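The FileCheck updates below show the payoff for 16-byte compares on x64: the memcmp libcall is gone, replaced by two unaligned vector loads, a byte-wise equality compare, and a mask test. In intrinsics form the checked sequence corresponds roughly to the following sketch (assumption: ne16 is an illustrative name, not generated code):

#include <immintrin.h>

// Illustrative rendering of the length16 codegen checked below.
bool ne16(const void *X, const void *Y) {
  __m128i A = _mm_loadu_si128(static_cast<const __m128i *>(X)); // movdqu
  __m128i B = _mm_loadu_si128(static_cast<const __m128i *>(Y)); // movdqu
  __m128i Eq = _mm_cmpeq_epi8(A, B);                            // pcmpeqb
  return _mm_movemask_epi8(Eq) != 0xFFFF; // pmovmskb; 0xFFFF means all bytes equal
}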
;
; X64-LABEL: length16:
; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $16, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
+; X64-NEXT: movdqu (%rsi), %xmm0
+; X64-NEXT: movdqu (%rdi), %xmm1
+; X64-NEXT: pcmpeqb %xmm0, %xmm1
+; X64-NEXT: pmovmskb %xmm1, %eax
+; X64-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-NEXT: setne %al
-; X64-NEXT: popq %rcx
; X64-NEXT: retq
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind
%cmp = icmp ne i32 %call, 0
;
; X64-LABEL: length16_const:
; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $.L.str, %esi
-; X64-NEXT: movl $16, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
+; X64-NEXT: movdqu (%rdi), %xmm0
+; X64-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; X64-NEXT: pmovmskb %xmm0, %eax
+; X64-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-NEXT: sete %al
-; X64-NEXT: popq %rcx
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind
%c = icmp eq i32 %m, 0