From 10559c6633f42815de0523bc0ce3bfc57c36fb2e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 26 Jan 2017 14:31:12 +0000
Subject: [PATCH] [X86][SSE] Add support for combining ANDNP byte masks with
 target shuffles

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@293178 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp          | 32 ++++++++---
 .../X86/clear_upper_vector_element_bits.ll  | 54 ++++++++-----------
 2 files changed, 47 insertions(+), 39 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 6857c5c7701..8d4399cbc8b 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -4132,6 +4132,7 @@ static bool isTargetShuffleVariableMask(unsigned Opcode) {
     return true;
   // 'Faux' Target Shuffles.
   case ISD::AND:
+  case X86ISD::ANDNP:
     return true;
   }
 }
@@ -5742,11 +5743,16 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
 
   unsigned Opcode = N.getOpcode();
   switch (Opcode) {
-  case ISD::AND: {
+  case ISD::AND:
+  case X86ISD::ANDNP: {
     // Attempt to decode as a per-byte mask.
     SmallBitVector UndefElts;
     SmallVector<APInt, 64> EltBits;
-    if (!getTargetConstantBitsFromNode(N.getOperand(1), 8, UndefElts, EltBits))
+    SDValue N0 = N.getOperand(0);
+    SDValue N1 = N.getOperand(1);
+    bool IsAndN = (X86ISD::ANDNP == Opcode);
+    uint64_t ZeroMask = IsAndN ? 255 : 0;
+    if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
       return false;
     for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
       if (UndefElts[i]) {
@@ -5756,9 +5762,9 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
       uint64_t ByteBits = EltBits[i].getZExtValue();
       if (ByteBits != 0 && ByteBits != 255)
         return false;
-      Mask.push_back(ByteBits == 0 ? SM_SentinelZero : i);
+      Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
     }
-    Ops.push_back(N.getOperand(0));
+    Ops.push_back(IsAndN ? N1 : N0);
     return true;
   }
   case X86ISD::VSHLI:
@@ -33010,7 +33016,8 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
 
 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
-                            const X86Subtarget &Subtarget) {
+                            TargetLowering::DAGCombinerInfo &DCI,
+                            const X86Subtarget &Subtarget) {
   // ANDNP(0, x) -> x
   if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
     return N->getOperand(1);
@@ -33019,6 +33026,19 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
   if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
     return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
 
+  EVT VT = N->getValueType(0);
+
+  // Attempt to recursively combine a bitmask ANDNP with shuffles.
+  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
+    SDValue Op(N, 0);
+    SmallVector<int, 1> NonceMask; // Just a placeholder.
+    NonceMask.push_back(0);
+    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+                                      /*Depth*/ 1, /*HasVarMask*/ false, DAG,
+                                      DCI, Subtarget))
+      return SDValue(); // This routine will use CombineTo to replace N.
+  }
+
   return SDValue();
 }
 
@@ -34088,7 +34108,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
   case ISD::FNEG:           return combineFneg(N, DAG, Subtarget);
   case ISD::TRUNCATE:       return combineTruncate(N, DAG, Subtarget);
-  case X86ISD::ANDNP:       return combineAndnp(N, DAG, Subtarget);
+  case X86ISD::ANDNP:       return combineAndnp(N, DAG, DCI, Subtarget);
   case X86ISD::FAND:        return combineFAnd(N, DAG, Subtarget);
   case X86ISD::FANDN:       return combineFAndn(N, DAG, Subtarget);
   case X86ISD::FXOR:
diff --git a/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
index 55ff7a2abe5..3aacc480ac9 100644
--- a/test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -356,43 +356,31 @@ define <4 x i32> @_clearupper4xi32b(<4 x i32>) nounwind {
 define <8 x i16> @_clearupper8xi16b(<8 x i16>) nounwind {
 ; SSE-LABEL: _clearupper8xi16b:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE-NEXT:    pand %xmm1, %xmm0
 ; SSE-NEXT:    xorl %eax, %eax
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm3
+; SSE-NEXT:    movd %eax, %xmm2
+; SSE-NEXT:    movdqa %xmm2, %xmm3
 ; SSE-NEXT:    psllw $8, %xmm3
-; SSE-NEXT:    pandn %xmm3, %xmm2
-; SSE-NEXT:    por %xmm2, %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    movdqa %xmm1, %xmm3
+; SSE-NEXT:    pandn %xmm3, %xmm1
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm2, %xmm3
 ; SSE-NEXT:    pslld $24, %xmm3
-; SSE-NEXT:    pandn %xmm3, %xmm2
-; SSE-NEXT:    por %xmm2, %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    movdqa %xmm1, %xmm3
+; SSE-NEXT:    pandn %xmm3, %xmm1
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm2, %xmm3
 ; SSE-NEXT:    psllq $40, %xmm3
-; SSE-NEXT:    pandn %xmm3, %xmm2
-; SSE-NEXT:    por %xmm2, %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    movdqa %xmm1, %xmm3
-; SSE-NEXT:    psllq $56, %xmm3
-; SSE-NEXT:    pandn %xmm3, %xmm2
-; SSE-NEXT:    por %xmm2, %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    movdqa %xmm1, %xmm3
-; SSE-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6]
-; SSE-NEXT:    pandn %xmm3, %xmm2
-; SSE-NEXT:    por %xmm2, %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
-; SSE-NEXT:    pandn %xmm1, %xmm2
-; SSE-NEXT:    por %xmm2, %xmm0
+; SSE-NEXT:    pandn %xmm3, %xmm1
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    psllq $56, %xmm2
+; SSE-NEXT:    pandn %xmm2, %xmm1
+; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
-- 
2.50.1
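
To illustrate the core of the X86ISelLowering.cpp change: getFauxShuffleMask
now decodes a constant of all-0x00/all-0xFF bytes into a "faux" shuffle mask
for X86ISD::ANDNP as well as ISD::AND. Since ANDNP(C, X) computes ~C & X and
keeps its constant in operand 0, the sense of the mask is inverted for ANDNP
(ZeroMask = 255). What follows is only a standalone sketch of that decode,
not LLVM code; decodeByteMaskToShuffle and kSentinelZero are invented
stand-ins for the patch's getFauxShuffleMask/SM_SentinelZero machinery.

    // Sketch: decode a 0x00/0xFF byte-mask constant into a shuffle mask.
    #include <cstdint>
    #include <iostream>
    #include <optional>
    #include <vector>

    constexpr int kSentinelZero = -2; // stands in for SM_SentinelZero

    // For AND, a 0x00 byte zeroes the lane; for ANDNP the constant is
    // negated first, so a 0xFF byte is the one that zeroes the lane.
    std::optional<std::vector<int>>
    decodeByteMaskToShuffle(const std::vector<uint8_t> &MaskBytes,
                            bool IsAndN) {
      const uint8_t ZeroMask = IsAndN ? 255 : 0;
      std::vector<int> ShuffleMask;
      for (int i = 0, e = (int)MaskBytes.size(); i != e; ++i) {
        uint8_t Byte = MaskBytes[i];
        if (Byte != 0 && Byte != 255)
          return std::nullopt; // not a pure per-byte select; bail out
        ShuffleMask.push_back(Byte == ZeroMask ? kSentinelZero : i);
      }
      return ShuffleMask;
    }

    int main() {
      // ANDNP(<0,255,0,0,255,0,...>, X) keeps bytes where the constant is 0.
      std::vector<uint8_t> C = {0, 255, 0, 0, 255, 0, 0, 0,
                                0, 0,   0, 0, 0,   0, 0, 0};
      if (auto M = decodeByteMaskToShuffle(C, /*IsAndN=*/true)) {
        for (int Idx : *M)
          std::cout << Idx << ' ';
        std::cout << '\n'; // 0 -2 2 3 -2 5 6 7 8 9 10 11 12 13 14 15
      }
    }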
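
The clear_upper_vector_element_bits.ll churn shows the payoff: each
pand/pandn/por triple in _clearupper8xi16b is a per-byte select, and once
ANDNP participates in combineX86ShufflesRecursively, adjacent selects can be
merged, which is why several movdqa/pandn/por steps disappear and fold
toward the final constant pand. Below is a standalone sketch of why such
chains collapse, namely that composing two byte-select shuffle masks yields
another byte-select mask; composeMasks is a hypothetical helper, not an
LLVM API.

    // Sketch: composing two byte-select masks gives one byte-select mask.
    #include <iostream>
    #include <vector>

    constexpr int kSentinelZero = -2;

    // Apply Outer after Inner: a lane is zero if Outer zeroes it, otherwise
    // it is whatever Inner produced for the lane that Outer selects.
    std::vector<int> composeMasks(const std::vector<int> &Outer,
                                  const std::vector<int> &Inner) {
      std::vector<int> Result;
      for (int M : Outer)
        Result.push_back(M == kSentinelZero ? kSentinelZero : Inner[M]);
      return Result;
    }

    int main() {
      // Two single-byte clears, as in _clearupper8xi16b: byte 1, then byte 3.
      std::vector<int> ClearByte1, ClearByte3;
      for (int i = 0; i != 16; ++i) {
        ClearByte1.push_back(i == 1 ? kSentinelZero : i);
        ClearByte3.push_back(i == 3 ? kSentinelZero : i);
      }
      // The combined mask zeroes both bytes at once, i.e. a single constant
      // pand can replace the two-step chain.
      for (int M : composeMasks(ClearByte3, ClearByte1))
        std::cout << M << ' ';
      std::cout << '\n'; // 0 -2 2 -2 4 5 6 7 8 9 10 11 12 13 14 15
    }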