From: Simon Pilgrim
Date: Sun, 1 Sep 2019 16:04:28 +0000 (+0000)
Subject: [X86][AVX] Rename + cleanup lowerShuffleAsLanePermuteAndBlend. NFCI.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=a9668dd1796e762201af31d33295539cadb14b45;p=llvm

[X86][AVX] Rename + cleanup lowerShuffleAsLanePermuteAndBlend. NFCI.

Rename to lowerShuffleAsLanePermuteAndShuffle to make it clear that
not just blends are performed.

Cleanup the in-lane shuffle mask generation to make it more obvious
what's going on.

Some prep work noticed while investigating the poor shuffle code
mentioned in D66004.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@370613 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 2fba796956c..0dd84ab658c 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -14673,16 +14673,14 @@ static SDValue lowerShuffleAsLanePermuteAndPermute(
   return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT), PermMask);
 }
 
-/// Lower a vector shuffle crossing multiple 128-bit lanes as
-/// a permutation and blend of those lanes.
+/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
+/// source with a lane permutation.
 ///
-/// This essentially blends the out-of-lane inputs to each lane into the lane
-/// from a permuted copy of the vector. This lowering strategy results in four
-/// instructions in the worst case for a single-input cross lane shuffle which
-/// is lower than any other fully general cross-lane shuffle strategy I'm aware
-/// of. Special cases for each particular shuffle pattern should be handled
-/// prior to trying this lowering.
-static SDValue lowerShuffleAsLanePermuteAndBlend(
+/// This lowering strategy results in four instructions in the worst case for a
+/// single-input cross lane shuffle which is lower than any other fully general
+/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
+/// shuffle pattern should be handled prior to trying this lowering.
+static SDValue lowerShuffleAsLanePermuteAndShuffle(
     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
     SelectionDAG &DAG, const X86Subtarget &Subtarget) {
   // FIXME: This should probably be generalized for 512-bit vectors as well.
@@ -14709,24 +14707,28 @@ static SDValue lowerShuffleAsLanePermuteAndBlend(
     return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
   }
 
+  // TODO - we could support shuffling V2 in the Flipped input.
   assert(V2.isUndef() &&
          "This last part of this routine only works on single input shuffles");
 
-  SmallVector<int, 32> FlippedBlendMask(Size);
-  for (int i = 0; i < Size; ++i)
-    FlippedBlendMask[i] =
-        Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
-                                ? Mask[i]
-                                : Mask[i] % LaneSize +
-                                      (i / LaneSize) * LaneSize + Size);
+  SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end());
+  for (int i = 0; i < Size; ++i) {
+    int &M = InLaneMask[i];
+    if (M < 0)
+      continue;
+    if (((M % Size) / LaneSize) != (i / LaneSize))
+      M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
+  }
+  assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
+         "In-lane shuffle mask expected");
 
-  // Flip the vector, and blend the results which should now be in-lane.
+  // Flip the lanes, and shuffle the results which should now be in-lane.
   MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
   SDValue Flipped = DAG.getBitcast(PVT, V1);
-  Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT),
-                                 { 2, 3, 0, 1 });
+  Flipped =
+      DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
   Flipped = DAG.getBitcast(VT, Flipped);
-  return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
+  return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
 }
 
 /// Handle lowering 2-lane 128-bit shuffles.
@@ -15487,8 +15489,8 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
       return V;
 
     // Otherwise, fall back.
-    return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, DAG,
-                                             Subtarget);
+    return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
+                                               DAG, Subtarget);
   }
 
   // Use dedicated unpack instructions for masks that match their pattern.
@@ -15704,8 +15706,8 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
       return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
 
     // Otherwise, fall back.
-    return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
-                                             DAG, Subtarget);
+    return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
+                                               DAG, Subtarget);
   }
 
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
@@ -15912,8 +15914,8 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
             DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
       return V;
 
-    return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, Mask,
-                                             DAG, Subtarget);
+    return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
+                                               DAG, Subtarget);
   }
 
   SmallVector<int, 8> RepeatedMask;
@@ -16011,8 +16013,8 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
            DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
      return V;
 
-    return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, DAG,
-                                             Subtarget);
+    return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
+                                               DAG, Subtarget);
  }
 
  if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,