From: Simon Pilgrim
Date: Sun, 19 Feb 2017 14:12:25 +0000 (+0000)
Subject: [X86][SSE] Add domain crossing support for target shuffle combines.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=395e4206ab18e394621030f8f68a5d533848df72;p=llvm

[X86][SSE] Add domain crossing support for target shuffle combines.

Add the infrastructure to flag whether float and/or int domains are
permissible. A future patch will enable domain crossing based on shuffle
depth and the value types of the source vectors.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@295604 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 5a8dd5bfdcd..1ff30445716 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -26369,8 +26369,8 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
 // instructions.
 // TODO: Investigate sharing more of this with shuffle lowering.
 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
-                                    bool FloatDomain, SDValue &V1, SDLoc &DL,
-                                    SelectionDAG &DAG,
+                                    bool AllowFloatDomain, bool AllowIntDomain,
+                                    SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget,
                                     unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
   unsigned NumMaskElts = Mask.size();
@@ -26387,8 +26387,8 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
 
   // Match against a VZEXT instruction.
   // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
-  if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
-                       (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
+  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
+                         (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
     unsigned MaxScale = 64 / MaskEltSize;
     for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
       bool Match = true;
@@ -26413,7 +26413,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
   // Check if we have SSE3 which will let us use MOVDDUP etc. The
   // instructions are no slower than UNPCKLPD but has the option to
   // fold the input operand into even an unaligned memory load.
-  if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) {
+  if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
     if (isTargetShuffleEquivalent(Mask, {0, 0})) {
       Shuffle = X86ISD::MOVDDUP;
       SrcVT = DstVT = MVT::v2f64;
@@ -26431,7 +26431,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
     }
   }
 
-  if (MaskVT.is256BitVector() && FloatDomain) {
+  if (MaskVT.is256BitVector() && AllowFloatDomain) {
     assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
       Shuffle = X86ISD::MOVDDUP;
@@ -26450,7 +26450,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
     }
   }
 
-  if (MaskVT.is512BitVector() && FloatDomain) {
+  if (MaskVT.is512BitVector() && AllowFloatDomain) {
     assert(Subtarget.hasAVX512() &&
            "AVX512 required for 512-bit vector shuffles");
     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
@@ -26489,7 +26489,8 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
 // permute instructions.
 // TODO: Investigate sharing more of this with shuffle lowering.
 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
-                                           bool FloatDomain,
+                                           bool AllowFloatDomain,
+                                           bool AllowIntDomain,
                                            const X86Subtarget &Subtarget,
                                            unsigned &Shuffle, MVT &ShuffleVT,
                                            unsigned &PermuteImm) {
@@ -26505,8 +26506,8 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
 
   // Attempt to match against byte/bit shifts.
   // FIXME: Add 512-bit support.
-  if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
-                       (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+                         (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
     int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
                                              MaskVT.getScalarSizeInBits(),
                                              Mask, 0, Zeroable, Subtarget);
@@ -26569,19 +26570,21 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
 
   // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
   // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
-  if (FloatDomain && !Subtarget.hasAVX())
+  if ((AllowFloatDomain && !AllowIntDomain) && !Subtarget.hasAVX())
     return false;
 
   // Pre-AVX2 we must use float shuffles on 256-bit vectors.
-  if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
-    FloatDomain = true;
+  if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) {
+    AllowFloatDomain = true;
+    AllowIntDomain = false;
+  }
 
   // Check for lane crossing permutes.
   if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
     // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
     if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) {
       Shuffle = X86ISD::VPERMI;
-      ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
+      ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
       PermuteImm = getV4X86ShuffleImm(Mask);
       return true;
     }
@@ -26589,7 +26592,7 @@
     SmallVector<int, 4> RepeatedMask;
     if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
       Shuffle = X86ISD::VPERMI;
-      ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
+      ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
       PermuteImm = getV4X86ShuffleImm(RepeatedMask);
       return true;
     }
@@ -26598,7 +26601,7 @@
   }
 
   // VPERMILPD can permute with a non-repeating shuffle.
-  if (FloatDomain && MaskScalarSizeInBits == 64) {
+  if (AllowFloatDomain && MaskScalarSizeInBits == 64) {
     Shuffle = X86ISD::VPERMILPI;
     ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
     PermuteImm = 0;
@@ -26622,8 +26625,8 @@
   if (MaskScalarSizeInBits == 64)
     scaleShuffleMask(2, RepeatedMask, WordMask);
 
-  Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
-  ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32);
+  Shuffle = (AllowFloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
+  ShuffleVT = (AllowFloatDomain ? MVT::f32 : MVT::i32);
   ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
   PermuteImm = getV4X86ShuffleImm(WordMask);
   return true;
@@ -26633,35 +26636,36 @@
 // shuffle instructions.
 // TODO: Investigate sharing more of this with shuffle lowering.
 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
-                                     bool FloatDomain, SDValue &V1, SDValue &V2,
-                                     SDLoc &DL, SelectionDAG &DAG,
+                                     bool AllowFloatDomain, bool AllowIntDomain,
+                                     SDValue &V1, SDValue &V2, SDLoc &DL,
+                                     SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget,
                                      unsigned &Shuffle, MVT &ShuffleVT,
                                      bool IsUnary) {
   unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
 
   if (MaskVT.is128BitVector()) {
-    if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) {
+    if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
       V2 = V1;
       Shuffle = X86ISD::MOVLHPS;
       ShuffleVT = MVT::v4f32;
       return true;
     }
-    if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) {
+    if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
       V2 = V1;
       Shuffle = X86ISD::MOVHLPS;
       ShuffleVT = MVT::v4f32;
       return true;
     }
     if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
-        (FloatDomain || !Subtarget.hasSSE41())) {
+        (AllowFloatDomain || !Subtarget.hasSSE41())) {
       std::swap(V1, V2);
       Shuffle = X86ISD::MOVSD;
       ShuffleVT = MaskVT;
       return true;
     }
     if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
-        (FloatDomain || !Subtarget.hasSSE41())) {
+        (AllowFloatDomain || !Subtarget.hasSSE41())) {
       Shuffle = X86ISD::MOVSS;
       ShuffleVT = MaskVT;
       return true;
@@ -26687,17 +26691,17 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
 }
 
 static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
-                                            bool FloatDomain,
-                                            SDValue &V1, SDValue &V2,
-                                            SDLoc &DL, SelectionDAG &DAG,
+                                            bool AllowIntDomain, SDValue &V1,
+                                            SDValue &V2, SDLoc &DL,
+                                            SelectionDAG &DAG,
                                             const X86Subtarget &Subtarget,
                                             unsigned &Shuffle, MVT &ShuffleVT,
                                             unsigned &PermuteImm) {
   unsigned NumMaskElts = Mask.size();
 
   // Attempt to match against PALIGNR byte rotate.
-  if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
-                       (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
+                         (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
     int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
     if (0 < ByteRotation) {
       Shuffle = X86ISD::PALIGNR;
@@ -26958,6 +26962,11 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   MVT ShuffleSrcVT, ShuffleVT;
   unsigned Shuffle, PermuteImm;
 
+  // Which shuffle domains are permitted?
+  // TODO - Allow either domain after a threshold depth.
+  bool AllowFloatDomain = FloatDomain;
+  bool AllowIntDomain = !FloatDomain;
+
   if (UnaryShuffle) {
     // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
     // directly if we don't shuffle the lower element and we shuffle the upper
@@ -26974,8 +26983,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     }
   }
 
-  if (matchUnaryVectorShuffle(MaskVT, Mask, FloatDomain, V1, DL, DAG,
-                              Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT)) {
+  if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
+                              V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
+                              ShuffleVT)) {
     if (Depth == 1 && Root.getOpcode() == Shuffle)
       return false; // Nothing to do!
     if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
@@ -26989,8 +26999,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     return true;
   }
 
-  if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget,
-                                     Shuffle, ShuffleVT, PermuteImm)) {
+  if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain,
+                                     AllowIntDomain, Subtarget, Shuffle,
+                                     ShuffleVT, PermuteImm)) {
     if (Depth == 1 && Root.getOpcode() == Shuffle)
       return false; // Nothing to do!
     if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
@@ -27006,8 +27017,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     }
   }
 
-  if (matchBinaryVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, DL, DAG,
-                               Subtarget, Shuffle, ShuffleVT, UnaryShuffle)) {
+  if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
+                               V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT,
+                               UnaryShuffle)) {
    if (Depth == 1 && Root.getOpcode() == Shuffle)
      return false; // Nothing to do!
    if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
@@ -27023,7 +27035,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     return true;
   }
 
-  if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, DL,
+  if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, AllowIntDomain, V1, V2, DL,
                                       DAG, Subtarget, Shuffle, ShuffleVT,
                                       PermuteImm)) {
     if (Depth == 1 && Root.getOpcode() == Shuffle)
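
The core of the patch is the seeding in combineX86ShuffleChain: the single
FloatDomain bool becomes an AllowFloatDomain/AllowIntDomain pair, still
seeded so that exactly one domain is permitted, with the depth-based
crossing deferred to a future patch. A minimal standalone C++ sketch of
that seeding follows; AllowedDomains, seedDomains and kDomainCrossDepth
are illustrative names and a placeholder value, not LLVM API, and the
threshold behaviour is only what the in-code TODO hints at.

    // Illustrative sketch only -- not LLVM code.
    struct AllowedDomains {
      bool Float; // float-domain shuffles allowed (MOVDDUP, VPERMILPI, ...)
      bool Int;   // int-domain shuffles allowed (PSHUFD, PALIGNR, ...)
    };

    AllowedDomains seedDomains(bool FloatDomain, unsigned Depth) {
      // As of this patch: exactly one domain, mirroring the old bool.
      AllowedDomains A = {FloatDomain, !FloatDomain};
      // Sketch of the deferred TODO: past some combine depth, domain
      // crossing becomes cheaper than a long shuffle chain, so permit
      // both domains.
      const unsigned kDomainCrossDepth = 3; // placeholder threshold
      if (Depth >= kDomainCrossDepth) {
        A.Float = true;
        A.Int = true;
      }
      return A;
    }

Splitting the bool into two independent permissions is what lets a later
patch offer both domains' patterns to the match* helpers for the same
mask (e.g. considering both PSHUFD and VPERMILPI), rather than being
locked to the domain of the source shuffle.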