[x86] eliminate unnecessary vector compare for AVX masked store

author Sanjay Patel <spatel@rotateright.com>

Tue, 12 Sep 2017 23:24:05 +0000 (23:24 +0000)

committer Sanjay Patel <spatel@rotateright.com>

Tue, 12 Sep 2017 23:24:05 +0000 (23:24 +0000)
author Sanjay Patel <spatel@rotateright.com>
Tue, 12 Sep 2017 23:24:05 +0000 (23:24 +0000)
committer Sanjay Patel <spatel@rotateright.com>
Tue, 12 Sep 2017 23:24:05 +0000 (23:24 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 725e2eac506ea39c5689308402dc74f2183ef963..942669c958c05e5c9283be51bee8339aaf75ab2b 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -33145,8 +33145,33 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
    if (Mst->isCompressingStore())
      return SDValue();
  
-  if (!Mst->isTruncatingStore())
-    return reduceMaskedStoreToScalarStore(Mst, DAG);
+  if (!Mst->isTruncatingStore()) {
+    if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
+      return ScalarStore;
+
+    // If the mask is checking (0 > X), we're creating a vector with all-zeros
+    // or all-ones elements based on the sign bits of X. AVX1 masked store only
+    // cares about the sign bit of each mask element, so eliminate the compare:
+    // mstore val, ptr, (pcmpgt 0, X) --> mstore val, ptr, X
+    // Note that by waiting to match an x86-specific PCMPGT node, we're
+    // eliminating potentially more complex matching of a setcc node which has
+    // a full range of predicates.
+    SDValue Mask = Mst->getMask();
+    if (Mask.getOpcode() == X86ISD::PCMPGT &&
+        ISD::isBuildVectorAllZeros(Mask.getOperand(0).getNode())) {
+      assert(Mask.getValueType() == Mask.getOperand(1).getValueType() &&
+             "Unexpected type for PCMPGT");
+      return DAG.getMaskedStore(
+          Mst->getChain(), SDLoc(N), Mst->getValue(), Mst->getBasePtr(),
+          Mask.getOperand(1), Mst->getMemoryVT(), Mst->getMemOperand());
+    }
+
+    // TODO: AVX512 targets should also be able to simplify something like the
+    // pattern above, but that pattern will be different. It will either need to
+    // match setcc more generally or match PCMPGTM later (in tablegen?).
+
+    return SDValue();
+  }
  
    // Resolve truncating stores.
    EVT VT = Mst->getValue().getValueType();
diff --git a/test/CodeGen/X86/masked_memop.ll b/test/CodeGen/X86/masked_memop.ll

index fa540c7643f4c9b8f4299e8c70506506d2c09bb3..b1923ebb081dd5b8e154558313b0a0b88bfb9a60 100644 (file)
--- a/test/CodeGen/X86/masked_memop.ll
+++ b/test/CodeGen/X86/masked_memop.ll
@@ -1140,21 +1140,18 @@ define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %v
    ret <8 x double> %res
  }
  
-; FIXME: The mask bit for each data element is the most significant bit of the mask operand, so a compare isn't needed.
+; The mask bit for each data element is the most significant bit of the mask operand, so a compare isn't needed.
+; FIXME: The AVX512 code should be improved to use 'vpmovd2m'. Add tests for 512-bit vectors when implementing that.
  
  define void @trunc_mask(<4 x float> %x, <4 x float>* %ptr, <4 x float> %y, <4 x i32> %mask) {
  ; AVX-LABEL: trunc_mask:
  ; AVX:       ## BB#0:
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vmaskmovps %xmm0, %xmm1, (%rdi)
+; AVX-NEXT:    vmaskmovps %xmm0, %xmm2, (%rdi)
  ; AVX-NEXT:    retq
  ;
  ; AVX512F-LABEL: trunc_mask:
  ; AVX512F:       ## BB#0:
-; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT:    vmaskmovps %xmm0, %xmm1, (%rdi)
+; AVX512F-NEXT:    vmaskmovps %xmm0, %xmm2, (%rdi)
  ; AVX512F-NEXT:    retq
  ;
  ; SKX-LABEL: trunc_mask:
author	Sanjay Patel <spatel@rotateright.com>
	Tue, 12 Sep 2017 23:24:05 +0000 (23:24 +0000)
committer	Sanjay Patel <spatel@rotateright.com>
	Tue, 12 Sep 2017 23:24:05 +0000 (23:24 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
test/CodeGen/X86/masked_memop.ll		patch \| blob \| history