WidenVT, N->getOperand(0));
}
+// Return true if this is a node that could have two SETCCs as operands.
+static inline bool isLogicalMaskOp(unsigned Opcode) {
+ switch (Opcode) {
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ return true;
+ }
+ return false;
+}
+
+// This is used just for the assert in convertMask(). Check that this is
+// either a SETCC or a SETCC that has already been converted by convertMask().
+#ifndef NDEBUG
+static inline bool isSETCCorConvertedSETCC(SDValue N) {
+ if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
+ N = N.getOperand(0);
+ else if (N.getOpcode() == ISD::CONCAT_VECTORS) {
+ for (unsigned i = 1; i < N->getNumOperands(); ++i)
+ if (!N->getOperand(i)->isUndef())
+ return false;
+ N = N.getOperand(0);
+ }
+
+ if (N.getOpcode() == ISD::TRUNCATE)
+ N = N.getOperand(0);
+ else if (N.getOpcode() == ISD::SIGN_EXTEND)
+ N = N.getOperand(0);
+
+ return (N.getOpcode() == ISD::SETCC);
+}
+#endif
+
+// Return a mask of vector type MaskVT to replace InMask. If needed, the
+// result is then adjusted to ToMaskVT with a vector extension or truncation
+// (for the element size) and an extract or concat (for the element count).
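+// For example (illustrative types): converting a SETCC with MaskVT v4i32
+// towards a ToMaskVT of v8i32 rebuilds the SETCC with a v4i32 result, needs
+// no extension or truncation (the element sizes already match), and then
+// widens it to v8i32 with a CONCAT_VECTORS whose second operand is UNDEF.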
+SDValue DAGTypeLegalizer::convertMask(SDValue InMask, EVT MaskVT,
+ EVT ToMaskVT) {
+ LLVMContext &Ctx = *DAG.getContext();
+
+ // Currently, only a SETCC or an AND/OR/XOR with two SETCCs is handled.
+ unsigned InMaskOpc = InMask->getOpcode();
+ assert((InMaskOpc == ISD::SETCC ||
+ (isLogicalMaskOp(InMaskOpc) &&
+ isSETCCorConvertedSETCC(InMask->getOperand(0)) &&
+ isSETCCorConvertedSETCC(InMask->getOperand(1)))) &&
+ "Unexpected mask argument.");
+
+ // Make a new Mask node, with a legal result VT.
+ SmallVector<SDValue, 4> Ops;
+ for (unsigned i = 0; i < InMask->getNumOperands(); ++i)
+ Ops.push_back(InMask->getOperand(i));
+ SDValue Mask = DAG.getNode(InMaskOpc, SDLoc(InMask), MaskVT, Ops);
+
+ // If MaskVT has smaller or bigger elements than ToMaskVT, a vector sign
+ // extend or truncate is needed.
+ unsigned MaskScalarBits = MaskVT.getScalarSizeInBits();
+ unsigned ToMaskScalBits = ToMaskVT.getScalarSizeInBits();
+ if (MaskScalarBits < ToMaskScalBits) {
+ EVT ExtVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(),
+ MaskVT.getVectorNumElements());
+ Mask = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Mask), ExtVT, Mask);
+ } else if (MaskScalarBits > ToMaskScalBits) {
+ EVT TruncVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(),
+ MaskVT.getVectorNumElements());
+ Mask = DAG.getNode(ISD::TRUNCATE, SDLoc(Mask), TruncVT, Mask);
+ }
+
+ assert(Mask->getValueType(0).getScalarSizeInBits() ==
+ ToMaskVT.getScalarSizeInBits() &&
+ "Mask should have the right element size by now.");
+
+ // Adjust Mask to the right number of elements.
+ unsigned CurrMaskNumEls = Mask->getValueType(0).getVectorNumElements();
+ if (CurrMaskNumEls > ToMaskVT.getVectorNumElements()) {
+ MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
+ SDValue ZeroIdx = DAG.getConstant(0, SDLoc(Mask), IdxTy);
+ Mask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Mask), ToMaskVT, Mask,
+ ZeroIdx);
+ } else if (CurrMaskNumEls < ToMaskVT.getVectorNumElements()) {
+ unsigned NumSubVecs = (ToMaskVT.getVectorNumElements() / CurrMaskNumEls);
+ EVT SubVT = Mask->getValueType(0);
+ SmallVector<SDValue, 16> SubConcatOps(NumSubVecs);
+ SubConcatOps[0] = Mask;
+ for (unsigned i = 1; i < NumSubVecs; ++i)
+ SubConcatOps[i] = DAG.getUNDEF(SubVT);
+ Mask =
+ DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Mask), ToMaskVT, SubConcatOps);
+ }
+
+ assert((Mask->getValueType(0) == ToMaskVT) &&
+ "A mask of ToMaskVT should have been produced by now.");
+
+ return Mask;
+}
+
+// Get the target mask VT, and widen if needed.
+EVT DAGTypeLegalizer::getSETCCWidenedResultTy(SDValue SetCC) {
+ assert(SetCC->getOpcode() == ISD::SETCC);
+ LLVMContext &Ctx = *DAG.getContext();
+ EVT MaskVT = getSetCCResultType(SetCC->getOperand(0).getValueType());
+ if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
+ MaskVT = TLI.getTypeToTransformTo(Ctx, MaskVT);
+ return MaskVT;
+}
+
+// This method tries to handle a VSELECT and its condition mask by legalizing
+// the operands (which may require widening) and, if needed, adjusting the
+// mask vector type to match that of the VSELECT. Without this, many cases
+// end up scalarizing the SETCC, with many unnecessary instructions.
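+// For example (an illustrative case, not tied to a specific target): a
+// VSELECT of v8i8 values whose condition is a SETCC on v8i32 operands can
+// have its operands widened to v16i8, the SETCC rebuilt with a v8i32 result,
+// truncated to v8i8 and concatenated with UNDEF up to v16i8, giving a vector
+// mask that feeds the widened VSELECT directly instead of being scalarized.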
+SDValue DAGTypeLegalizer::WidenVSELECTAndMask(SDNode *N) {
+ LLVMContext &Ctx = *DAG.getContext();
+ SDValue Cond = N->getOperand(0);
+
+ if (N->getOpcode() != ISD::VSELECT)
+ return SDValue();
+
+ if (Cond->getOpcode() != ISD::SETCC && !isLogicalMaskOp(Cond->getOpcode()))
+ return SDValue();
+
+ // If this is a split VSELECT that was previously handled, do nothing.
+ if (Cond->getValueType(0).getScalarSizeInBits() != 1)
+ return SDValue();
+
+ EVT VSelVT = N->getValueType(0);
+ // Only handle vector types whose size in bits is a power of 2.
+ if (!isPowerOf2_64(VSelVT.getSizeInBits()))
+ return SDValue();
+
+ // Don't touch if this will be scalarized.
+ EVT FinalVT = VSelVT;
+ while (getTypeAction(FinalVT) == TargetLowering::TypeSplitVector)
+ FinalVT = EVT::getVectorVT(Ctx, FinalVT.getVectorElementType(),
+ FinalVT.getVectorNumElements() / 2);
+ if (FinalVT.getVectorNumElements() == 1)
+ return SDValue();
+
+ // If there is support for an i1 vector mask, don't touch.
+ if (Cond.getOpcode() == ISD::SETCC) {
+ EVT SetCCOpVT = Cond->getOperand(0).getValueType();
+ while (TLI.getTypeAction(Ctx, SetCCOpVT) != TargetLowering::TypeLegal)
+ SetCCOpVT = TLI.getTypeToTransformTo(Ctx, SetCCOpVT);
+ EVT SetCCResVT = getSetCCResultType(SetCCOpVT);
+ if (SetCCResVT.getScalarSizeInBits() == 1)
+ return SDValue();
+ }
+
+ // Get the VT and operands for VSELECT, and widen if needed.
+ SDValue VSelOp1 = N->getOperand(1);
+ SDValue VSelOp2 = N->getOperand(2);
+ if (getTypeAction(VSelVT) == TargetLowering::TypeWidenVector) {
+ VSelVT = TLI.getTypeToTransformTo(Ctx, VSelVT);
+ VSelOp1 = GetWidenedVector(VSelOp1);
+ VSelOp2 = GetWidenedVector(VSelOp2);
+ }
+
+ // The mask of the VSELECT should have integer elements.
+ EVT ToMaskVT = VSelVT;
+ if (!ToMaskVT.getScalarType().isInteger())
+ ToMaskVT = ToMaskVT.changeVectorElementTypeToInteger();
+
+ SDValue Mask;
+ if (Cond->getOpcode() == ISD::SETCC) {
+ EVT MaskVT = getSETCCWidenedResultTy(Cond);
+ Mask = convertMask(Cond, MaskVT, ToMaskVT);
+ } else if (isLogicalMaskOp(Cond->getOpcode()) &&
+ Cond->getOperand(0).getOpcode() == ISD::SETCC &&
+ Cond->getOperand(1).getOpcode() == ISD::SETCC) {
+ // Cond is (AND/OR/XOR (SETCC, SETCC))
+ SDValue SETCC0 = Cond->getOperand(0);
+ SDValue SETCC1 = Cond->getOperand(1);
+ EVT VT0 = getSETCCWidenedResultTy(SETCC0);
+ EVT VT1 = getSETCCWidenedResultTy(SETCC1);
+ unsigned ScalarBits0 = VT0.getScalarSizeInBits();
+ unsigned ScalarBits1 = VT1.getScalarSizeInBits();
+ unsigned ScalarBits_ToMask = ToMaskVT.getScalarSizeInBits();
+ EVT MaskVT;
+ // If the two SETCCs have different VTs, extend/truncate one of them to
+ // the other's VT if that VT lies "towards" ToMaskVT; if ToMaskVT lies
+ // strictly in between, truncate one and extend the other to ToMaskVT.
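+ // E.g. (illustrative VTs): with VT0 = v8i8, VT1 = v8i16 and ToMaskVT =
+ // v8i32, WideVT (v8i16) is picked, so only the v8i8 SETCC gets extended.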
+ if (ScalarBits0 != ScalarBits1) {
+ EVT NarrowVT = ((ScalarBits0 < ScalarBits1) ? VT0 : VT1);
+ EVT WideVT = ((NarrowVT == VT0) ? VT1 : VT0);
+ if (ScalarBits_ToMask >= WideVT.getScalarSizeInBits())
+ MaskVT = WideVT;
+ else if (ScalarBits_ToMask <= NarrowVT.getScalarSizeInBits())
+ MaskVT = NarrowVT;
+ else
+ MaskVT = ToMaskVT;
+ } else
+ // If the two SETCCs have the same VT, don't change it.
+ MaskVT = VT0;
+
+ // Make new SETCCs and logical nodes.
+ SETCC0 = convertMask(SETCC0, VT0, MaskVT);
+ SETCC1 = convertMask(SETCC1, VT1, MaskVT);
+ Cond = DAG.getNode(Cond->getOpcode(), SDLoc(Cond), MaskVT, SETCC0, SETCC1);
+
+ // Convert the logical op for VSELECT if needed.
+ Mask = convertMask(Cond, MaskVT, ToMaskVT);
+ } else
+ return SDValue();
+
+ return DAG.getNode(ISD::VSELECT, SDLoc(N), VSelVT, Mask, VSelOp1, VSelOp2);
+}
+
SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
unsigned WidenNumElts = WidenVT.getVectorNumElements();
SDValue Cond1 = N->getOperand(0);
EVT CondVT = Cond1.getValueType();
if (CondVT.isVector()) {
+ if (SDValue Res = WidenVSELECTAndMask(N))
+ return Res;
+
EVT CondEltVT = CondVT.getVectorElementType();
EVT CondWidenVT = EVT::getVectorVT(*DAG.getContext(),
CondEltVT, WidenNumElts);
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
-; SSE2-NEXT: psllw $15, %xmm4
-; SSE2-NEXT: psraw $15, %xmm4
; SSE2-NEXT: psubd %xmm2, %xmm1
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
; SSSE3-NEXT: pshufb %xmm5, %xmm6
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0]
-; SSSE3-NEXT: psllw $15, %xmm6
-; SSSE3-NEXT: psraw $15, %xmm6
; SSSE3-NEXT: psubd %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm5, %xmm0
; SSSE3-NEXT: pshufb %xmm5, %xmm1
define void @test14(i8* nocapture %head, i32* nocapture %w) nounwind {
; SSE2-LABEL: test14:
; SSE2: ## BB#0: ## %vector.ph
-; SSE2-NEXT: movdqu (%rdi), %xmm1
+; SSE2-NEXT: movdqu (%rdi), %xmm0
; SSE2-NEXT: movdqu (%rsi), %xmm8
; SSE2-NEXT: movdqu 16(%rsi), %xmm9
; SSE2-NEXT: movdqu 32(%rsi), %xmm10
-; SSE2-NEXT: movdqu 48(%rsi), %xmm6
-; SSE2-NEXT: pxor %xmm11, %xmm11
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15]
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm1, %xmm7
-; SSE2-NEXT: psubd %xmm6, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm6
-; SSE2-NEXT: pxor %xmm4, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm6
-; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
-; SSE2-NEXT: movdqa %xmm5, %xmm7
-; SSE2-NEXT: psubd %xmm10, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm10
-; SSE2-NEXT: pxor %xmm4, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm10
-; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm10[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm6[0]
-; SSE2-NEXT: psllw $15, %xmm7
-; SSE2-NEXT: psraw $15, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm10, %xmm7
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psubd %xmm9, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm9
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm9
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: movdqa %xmm8, %xmm6
-; SSE2-NEXT: pxor %xmm4, %xmm6
+; SSE2-NEXT: movdqu 48(%rsi), %xmm7
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psubd %xmm7, %xmm0
+; SSE2-NEXT: pxor %xmm3, %xmm7
+; SSE2-NEXT: pxor %xmm3, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255]
+; SSE2-NEXT: pand %xmm5, %xmm7
+; SSE2-NEXT: movdqa %xmm6, %xmm4
+; SSE2-NEXT: psubd %xmm10, %xmm6
+; SSE2-NEXT: pxor %xmm3, %xmm10
; SSE2-NEXT: pxor %xmm3, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
-; SSE2-NEXT: psllw $15, %xmm4
-; SSE2-NEXT: psraw $15, %xmm4
-; SSE2-NEXT: pand %xmm10, %xmm4
-; SSE2-NEXT: packuswb %xmm7, %xmm4
-; SSE2-NEXT: psllw $7, %xmm4
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
-; SSE2-NEXT: pcmpgtb %xmm4, %xmm11
-; SSE2-NEXT: psubd %xmm8, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm5
-; SSE2-NEXT: packuswb %xmm1, %xmm5
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: packuswb %xmm0, %xmm3
-; SSE2-NEXT: packuswb %xmm5, %xmm3
-; SSE2-NEXT: pandn %xmm3, %xmm11
-; SSE2-NEXT: movdqu %xmm11, (%rdi)
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm10
+; SSE2-NEXT: pand %xmm5, %xmm10
+; SSE2-NEXT: packuswb %xmm7, %xmm10
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psubd %xmm9, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm9
+; SSE2-NEXT: pxor %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm5, %xmm9
+; SSE2-NEXT: movdqa %xmm8, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: packuswb %xmm9, %xmm4
+; SSE2-NEXT: packuswb %xmm10, %xmm4
+; SSE2-NEXT: psubd %xmm8, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: packuswb %xmm0, %xmm6
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: packuswb %xmm1, %xmm2
+; SSE2-NEXT: packuswb %xmm6, %xmm2
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: movdqu %xmm4, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test14:
; SSSE3: ## BB#0: ## %vector.ph
-; SSSE3-NEXT: movdqu (%rdi), %xmm1
+; SSSE3-NEXT: movdqu (%rdi), %xmm0
; SSSE3-NEXT: movdqu (%rsi), %xmm8
; SSSE3-NEXT: movdqu 16(%rsi), %xmm9
; SSSE3-NEXT: movdqu 32(%rsi), %xmm10
-; SSSE3-NEXT: movdqu 48(%rsi), %xmm4
-; SSSE3-NEXT: pxor %xmm0, %xmm0
+; SSSE3-NEXT: movdqu 48(%rsi), %xmm7
+; SSSE3-NEXT: pxor %xmm3, %xmm3
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSSE3-NEXT: movdqa %xmm1, %xmm7
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm1, %xmm5
-; SSSE3-NEXT: psubd %xmm4, %xmm1
-; SSSE3-NEXT: pxor %xmm6, %xmm4
-; SSSE3-NEXT: pxor %xmm6, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
-; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm11, %xmm4
-; SSSE3-NEXT: movdqa %xmm7, %xmm5
-; SSSE3-NEXT: psubd %xmm10, %xmm7
-; SSSE3-NEXT: pxor %xmm6, %xmm10
-; SSSE3-NEXT: pxor %xmm6, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm10
-; SSSE3-NEXT: pshufb %xmm11, %xmm10
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm4[0]
-; SSSE3-NEXT: psllw $15, %xmm10
-; SSSE3-NEXT: psraw $15, %xmm10
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT: pshufb %xmm4, %xmm10
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: psubd %xmm9, %xmm2
-; SSSE3-NEXT: pxor %xmm6, %xmm9
-; SSSE3-NEXT: pxor %xmm6, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm9
-; SSSE3-NEXT: pshufb %xmm11, %xmm9
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm6
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm0, %xmm5
+; SSSE3-NEXT: psubd %xmm7, %xmm0
+; SSSE3-NEXT: pxor %xmm3, %xmm7
+; SSSE3-NEXT: pxor %xmm3, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT: pshufb %xmm5, %xmm7
+; SSSE3-NEXT: movdqa %xmm6, %xmm4
+; SSSE3-NEXT: psubd %xmm10, %xmm6
+; SSSE3-NEXT: pxor %xmm3, %xmm10
+; SSSE3-NEXT: pxor %xmm3, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm10
+; SSSE3-NEXT: pshufb %xmm5, %xmm10
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1]
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: psubd %xmm9, %xmm1
+; SSSE3-NEXT: pxor %xmm3, %xmm9
+; SSSE3-NEXT: pxor %xmm3, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm9
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT: pshufb %xmm4, %xmm9
; SSSE3-NEXT: movdqa %xmm8, %xmm5
-; SSSE3-NEXT: pxor %xmm6, %xmm5
-; SSSE3-NEXT: pxor %xmm3, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5
-; SSSE3-NEXT: pshufb %xmm11, %xmm5
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm9[0]
-; SSSE3-NEXT: psllw $15, %xmm5
-; SSSE3-NEXT: psraw $15, %xmm5
+; SSSE3-NEXT: pxor %xmm3, %xmm5
+; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
; SSSE3-NEXT: pshufb %xmm4, %xmm5
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm10[0]
-; SSSE3-NEXT: psllw $7, %xmm5
-; SSSE3-NEXT: pand {{.*}}(%rip), %xmm5
-; SSSE3-NEXT: pcmpgtb %xmm5, %xmm0
-; SSSE3-NEXT: psubd %xmm8, %xmm3
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pand %xmm4, %xmm7
-; SSSE3-NEXT: packuswb %xmm1, %xmm7
-; SSSE3-NEXT: pand %xmm4, %xmm2
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: packuswb %xmm2, %xmm3
-; SSSE3-NEXT: packuswb %xmm7, %xmm3
-; SSSE3-NEXT: pandn %xmm3, %xmm0
-; SSSE3-NEXT: movdqu %xmm0, (%rdi)
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1]
+; SSSE3-NEXT: movsd {{.*#+}} xmm10 = xmm5[0],xmm10[1]
+; SSSE3-NEXT: psubd %xmm8, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSSE3-NEXT: pand %xmm3, %xmm0
+; SSSE3-NEXT: pand %xmm3, %xmm6
+; SSSE3-NEXT: packuswb %xmm0, %xmm6
+; SSSE3-NEXT: pand %xmm3, %xmm1
+; SSSE3-NEXT: pand %xmm3, %xmm2
+; SSSE3-NEXT: packuswb %xmm1, %xmm2
+; SSSE3-NEXT: packuswb %xmm6, %xmm2
+; SSSE3-NEXT: andnpd %xmm2, %xmm10
+; SSSE3-NEXT: movupd %xmm10, (%rdi)
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test14:
; AVX1-NEXT: vpxor %xmm6, %xmm10, %xmm7
; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm4
; AVX1-NEXT: vpcmpgtd %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm11, %xmm3, %xmm12
-; AVX1-NEXT: vpxor %xmm6, %xmm9, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpxor %xmm6, %xmm4, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm6, %xmm8, %xmm7
+; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm11
+; AVX1-NEXT: vpxor %xmm6, %xmm9, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
+; AVX1-NEXT: vpxor %xmm6, %xmm7, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm6, %xmm8, %xmm4
; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpacksswb %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vpshufb %xmm11, %xmm3, %xmm3
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm12[0]
-; AVX1-NEXT: vpsllw $7, %xmm3, %xmm3
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm6, %xmm4
+; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpacksswb %xmm11, %xmm3, %xmm3
; AVX1-NEXT: vpsubd %xmm0, %xmm8, %xmm0
-; AVX1-NEXT: vpsubd %xmm4, %xmm9, %xmm4
+; AVX1-NEXT: vpsubd %xmm7, %xmm9, %xmm4
; AVX1-NEXT: vpsubd %xmm1, %xmm10, %xmm1
; AVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpcmpgtd %ymm5, %ymm6, %ymm5
; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX2-NEXT: vpacksswb %xmm6, %xmm5, %xmm5
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm5
-; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm7
+; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm6
; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm4
-; AVX2-NEXT: vpcmpgtd %ymm7, %ymm4, %ymm4
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm7
-; AVX2-NEXT: vpacksswb %xmm7, %xmm4, %xmm4
-; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm4
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; AVX2-NEXT: vpsllw $7, %xmm4, %xmm4
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4
+; AVX2-NEXT: vpcmpgtd %ymm6, %ymm4, %ymm4
+; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
+; AVX2-NEXT: vpacksswb %xmm6, %xmm4, %xmm4
+; AVX2-NEXT: vpacksswb %xmm5, %xmm4, %xmm4
; AVX2-NEXT: vpsubd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpsubd %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
-; SSE2-NEXT: psllw $15, %xmm4
-; SSE2-NEXT: psraw $15, %xmm4
; SSE2-NEXT: psubd %xmm2, %xmm1
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
; SSSE3-NEXT: pcmpgtd %xmm6, %xmm3
; SSSE3-NEXT: pshufb %xmm4, %xmm3
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
-; SSSE3-NEXT: psllw $15, %xmm3
-; SSSE3-NEXT: psraw $15, %xmm3
; SSSE3-NEXT: psubd %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm4, %xmm0
; SSSE3-NEXT: pshufb %xmm4, %xmm1
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
-; SSE2-NEXT: psllw $15, %xmm4
-; SSE2-NEXT: psraw $15, %xmm4
; SSE2-NEXT: psubd %xmm2, %xmm1
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
; SSSE3-NEXT: pcmpgtd %xmm6, %xmm3
; SSSE3-NEXT: pshufb %xmm4, %xmm3
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
-; SSSE3-NEXT: psllw $15, %xmm3
-; SSSE3-NEXT: psraw $15, %xmm3
; SSSE3-NEXT: psubd %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm4, %xmm0
; SSSE3-NEXT: pshufb %xmm4, %xmm1