}
static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
-                             Type *LoadTy,
                             SelectionDAGBuilder &Builder) {
  // Check to see if this load can be trivially constant folded, e.g. if the
  // input is from a string literal.
  if (const Constant *LoadInput = dyn_cast<Constant>(PtrVal)) {
    // Cast pointer to the type we really want to load.
+    Type *LoadTy =
+        Type::getIntNTy(PtrVal->getContext(), LoadVT.getScalarSizeInBits());
+    if (LoadVT.isVector())
+      LoadTy = VectorType::get(LoadTy, LoadVT.getVectorNumElements());
+
    LoadInput = ConstantExpr::getBitCast(const_cast<Constant *>(LoadInput),
                                         PointerType::getUnqual(LoadTy));
  if (!CSize || !IsOnlyUsedInZeroEqualityComparison(&I))
    return false;
+  // If the target has a fast compare for the given size, it will return a
+  // preferred load type for that size. Require that the load VT is legal and
+  // that the target supports unaligned loads of that type. Otherwise, return
+  // INVALID.
+  auto hasFastLoadsAndCompare = [&](unsigned NumBits) {
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    MVT LVT = TLI.hasFastEqualityCompare(NumBits);
+    if (LVT != MVT::INVALID_SIMPLE_VALUE_TYPE) {
+      // TODO: Handle 5 byte compare as 4-byte + 1 byte.
+      // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
+      // TODO: Check alignment of src and dest ptrs.
+      unsigned DstAS = LHS->getType()->getPointerAddressSpace();
+      unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
+      if (!TLI.isTypeLegal(LVT) ||
+          !TLI.allowsMisalignedMemoryAccesses(LVT, SrcAS) ||
+          !TLI.allowsMisalignedMemoryAccesses(LVT, DstAS))
+        LVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
+    }
+
+    return LVT;
+  };
+
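The lambda relies on the new TargetLowering::hasFastEqualityCompare hook: a target that has a cheap wide equality compare returns a preferred (possibly vector) load type for the given bit width, and anything else comes back as MVT::INVALID_SIMPLE_VALUE_TYPE. A minimal sketch of a target override, assuming an SSE2-style x86 subtarget where v16i8 is legal (illustrative only; the in-tree override is not shown in this hunk):

// Sketch of a target override (assumption, not part of this patch excerpt):
// report a fast 128-bit equality compare as a byte vector so the expansion
// above emits one vector load per memcmp operand.
MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
  if (NumBits == 128 && isTypeLegal(MVT::v16i8))
    return MVT::v16i8; // ends up as movdqu + pcmpeqb + pmovmskb
  return MVT::INVALID_SIMPLE_VALUE_TYPE;
}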
+  // This turns into unaligned loads. We only do this if the target natively
+  // supports the MVT we'll be loading or if it is small enough (<= 4) that
+  // we'll only produce a small number of byte loads.
  MVT LoadVT;
-  Type *LoadTy;
  switch (CSize->getZExtValue()) {
  default:
    return false;
  case 2:
    LoadVT = MVT::i16;
-    LoadTy = Type::getInt16Ty(CSize->getContext());
    break;
  case 4:
    LoadVT = MVT::i32;
-    LoadTy = Type::getInt32Ty(CSize->getContext());
    break;
  case 8:
-    LoadVT = MVT::i64;
-    LoadTy = Type::getInt64Ty(CSize->getContext());
+    LoadVT = hasFastLoadsAndCompare(64);
    break;
-  /*
  case 16:
-    LoadVT = MVT::v4i32;
-    LoadTy = Type::getInt32Ty(CSize->getContext());
-    LoadTy = VectorType::get(LoadTy, 4);
+    LoadVT = hasFastLoadsAndCompare(128);
    break;
-  */
  }
-  // This turns into unaligned loads. We only do this if the target natively
-  // supports the MVT we'll be loading or if it is small enough (<= 4) that
-  // we'll only produce a small number of byte loads.
+  if (LoadVT == MVT::INVALID_SIMPLE_VALUE_TYPE)
+    return false;
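In effect, an equality-only memcmp of constant size is replaced by two possibly-misaligned loads and a single compare. A conceptual C++ sketch of the 8-byte case (assumptions: memcmp8_ne is a made-up name, and the real lowering builds SelectionDAG nodes rather than this source):

#include <cstdint>
#include <cstring>

// Conceptual model of the expansion of memcmp(p, q, 8) != 0.
bool memcmp8_ne(const void *p, const void *q) {
  uint64_t A, B;
  std::memcpy(&A, p, 8); // one (possibly misaligned) i64 load; hence the
  std::memcpy(&B, q, 8); // allowsMisalignedMemoryAccesses checks above
  return A != B;         // a single SETNE replaces the libcall
}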
-  // Require that we can find a legal MVT, and only do this if the target
-  // supports unaligned loads of that type. Expanding into byte loads would
-  // bloat the code.
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (CSize->getZExtValue() > 4) {
-    unsigned DstAS = LHS->getType()->getPointerAddressSpace();
-    unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
-    // TODO: Handle 5 byte compare as 4-byte + 1 byte.
-    // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
-    // TODO: Check alignment of src and dest ptrs.
-    if (!TLI.isTypeLegal(LoadVT) ||
-        !TLI.allowsMisalignedMemoryAccesses(LoadVT, SrcAS) ||
-        !TLI.allowsMisalignedMemoryAccesses(LoadVT, DstAS))
-      return false;
+  SDValue LoadL = getMemCmpLoad(LHS, LoadVT, *this);
+  SDValue LoadR = getMemCmpLoad(RHS, LoadVT, *this);
+
+  // Bitcast to a wide integer type if the loads are vectors.
+  if (LoadVT.isVector()) {
+    EVT CmpVT = EVT::getIntegerVT(LHS->getContext(), LoadVT.getSizeInBits());
+    LoadL = DAG.getBitcast(CmpVT, LoadL);
+    LoadR = DAG.getBitcast(CmpVT, LoadR);
  }
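The bitcast is needed because a SETCC producing a single i1 cannot take vector operands (a vector compare would yield a vector of booleans), so the two v16i8 loads are compared as one wide integer, roughly (setne:i1 (bitcast:i128 LoadL), (bitcast:i128 LoadR)); a target can then match that pattern to a vector compare plus mask test, as the updated x86 checks below show.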
-  SDValue LHSVal = getMemCmpLoad(LHS, LoadVT, LoadTy, *this);
-  SDValue RHSVal = getMemCmpLoad(RHS, LoadVT, LoadTy, *this);
-  SDValue SetCC =
-      DAG.getSetCC(getCurSDLoc(), MVT::i1, LHSVal, RHSVal, ISD::SETNE);
-  processIntegerCallValue(I, SetCC, false);
+  SDValue Cmp = DAG.getSetCC(getCurSDLoc(), MVT::i1, LoadL, LoadR, ISD::SETNE);
+  processIntegerCallValue(I, Cmp, false);
  return true;
}
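The FileCheck updates below show the payoff for 16-byte compares on x64: the memcmp libcall is gone, replaced by two unaligned vector loads, a byte-wise equality compare, and a mask test. In intrinsics form the checked sequence corresponds roughly to the following sketch (assumption: ne16 is an illustrative name, not generated code):

#include <immintrin.h>

// Illustrative rendering of the length16 codegen checked below.
bool ne16(const void *X, const void *Y) {
  __m128i A = _mm_loadu_si128(static_cast<const __m128i *>(X)); // movdqu
  __m128i B = _mm_loadu_si128(static_cast<const __m128i *>(Y)); // movdqu
  __m128i Eq = _mm_cmpeq_epi8(A, B);                            // pcmpeqb
  return _mm_movemask_epi8(Eq) != 0xFFFF; // pmovmskb; 0xFFFF means all bytes equal
}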
;
; X64-LABEL: length16:
; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $16, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
+; X64-NEXT: movdqu (%rsi), %xmm0
+; X64-NEXT: movdqu (%rdi), %xmm1
+; X64-NEXT: pcmpeqb %xmm0, %xmm1
+; X64-NEXT: pmovmskb %xmm1, %eax
+; X64-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-NEXT: setne %al
-; X64-NEXT: popq %rcx
; X64-NEXT: retq
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind
%cmp = icmp ne i32 %call, 0
;
; X64-LABEL: length16_const:
; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $.L.str, %esi
-; X64-NEXT: movl $16, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
+; X64-NEXT: movdqu (%rdi), %xmm0
+; X64-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; X64-NEXT: pmovmskb %xmm0, %eax
+; X64-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-NEXT: sete %al
-; X64-NEXT: popq %rcx
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind
%c = icmp eq i32 %m, 0