From e8b72983257c60e7e53a970ad13d65c76098c954 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 31 Jan 2017 13:51:10 +0000
Subject: [PATCH] [X86][SSE] Add support for combining PINSRW into a target
 shuffle.

Also add the ability to recognise PINSR(Vec, 0, Idx).

Target shuffle combines won't replace multiple insertions with a bit mask
until a depth of 3 or more, so we avoid codesize bloat.

The unnecessary vpblendw in clearupper8xi16a will be fixed in an upcoming
patch.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@293627 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp              | 33 +++++++++++++++++--
 .../X86/clear_upper_vector_element_bits.ll      |  9 ++---
 2 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index a05c7f58e9e..6b235d08ce3 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5770,12 +5770,21 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
     return true;
   }
   case X86ISD::PINSRW: {
-    // Attempt to recognise a PINSRW(ASSERTZEXT(PEXTRW)) shuffle pattern.
-    // TODO: Expand this to support PINSRB/INSERT_VECTOR_ELT/etc.
     SDValue InVec = N.getOperand(0);
     SDValue InScl = N.getOperand(1);
     uint64_t InIdx = N.getConstantOperandVal(2);
     assert(InIdx < NumElts && "Illegal insertion index");
+
+    // Attempt to recognise a PINSRW(VEC, 0, Idx) shuffle pattern.
+    if (X86::isZeroNode(InScl)) {
+      Ops.push_back(InVec);
+      for (unsigned i = 0; i != NumElts; ++i)
+        Mask.push_back(i == InIdx ? SM_SentinelZero : i);
+      return true;
+    }
+
+    // Attempt to recognise a PINSRW(ASSERTZEXT(PEXTRW)) shuffle pattern.
+    // TODO: Expand this to support PINSRB/INSERT_VECTOR_ELT/etc.
     if (InScl.getOpcode() != ISD::AssertZext ||
         InScl.getOperand(0).getOpcode() != X86ISD::PEXTRW)
       return false;
@@ -30597,6 +30606,24 @@ static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget &Subtarget) {
+  unsigned Opcode = N->getOpcode();
+  assert(((X86ISD::PINSRB == Opcode && N->getValueType(0) == MVT::v16i8) ||
+          (X86ISD::PINSRW == Opcode && N->getValueType(0) == MVT::v8i16)) &&
+         "Unexpected vector insertion");
+
+  // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
+  SDValue Op(N, 0);
+  SmallVector<int, 4> NonceMask; // Just a placeholder.
+  NonceMask.push_back(0);
+  combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+                                /*Depth*/ 1, /*HasVarMask*/ false, DAG,
+                                DCI, Subtarget);
+  return SDValue();
+}
+
 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
 /// OR -> CMPNEQSS.
@@ -34159,6 +34186,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::VSRLI: return combineVectorShift(N, DAG, DCI, Subtarget);
   case X86ISD::VSEXT:
   case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
+  case X86ISD::PINSRB:
+  case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
   case X86ISD::SHUFP: // Handle all target specific shuffles
   case X86ISD::INSERTPS:
   case X86ISD::PALIGNR:
diff --git a/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
index 9256717f155..d22c2e3d98c 100644
--- a/test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -94,7 +94,8 @@ define <8 x i16> @_clearupper8xi16a(<8 x i16>) nounwind {
 ;
 ; AVX-LABEL: _clearupper8xi16a:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %x0 = extractelement <8 x i16> %0, i32 0
   %x1 = extractelement <8 x i16> %0, i32 1
@@ -317,11 +318,7 @@ define <2 x i64> @_clearupper2xi64b(<2 x i64>) nounwind {
 define <4 x i32> @_clearupper4xi32b(<4 x i32>) nounwind {
 ; SSE-LABEL: _clearupper4xi32b:
 ; SSE:       # BB#0:
-; SSE-NEXT:    xorl %eax, %eax
-; SSE-NEXT:    pinsrw $1, %eax, %xmm0
-; SSE-NEXT:    pinsrw $3, %eax, %xmm0
-; SSE-NEXT:    pinsrw $5, %eax, %xmm0
-; SSE-NEXT:    pinsrw $7, %eax, %xmm0
+; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: _clearupper4xi32b:
-- 
2.50.1