return true;
}
+/// Checks if the vector of instructions can be represented as a shuffle, like:
+/// %x0 = extractelement <4 x i8> %x, i32 0
+/// %x3 = extractelement <4 x i8> %x, i32 3
+/// %y1 = extractelement <4 x i8> %y, i32 1
+/// %y2 = extractelement <4 x i8> %y, i32 2
+/// %x0x0 = mul i8 %x0, %x0
+/// %x3x3 = mul i8 %x3, %x3
+/// %y1y1 = mul i8 %y1, %y1
+/// %y2y2 = mul i8 %y2, %y2
+/// %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0
+/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
+/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
+/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
+/// ret <4 x i8> %ins4
+/// can be transformed into:
+/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
+/// i32 6>
+/// %2 = mul <4 x i8> %1, %1
+/// ret <4 x i8> %2
+/// We convert this initially to something like:
+/// %x0 = extractelement <4 x i8> %x, i32 0
+/// %x3 = extractelement <4 x i8> %x, i32 3
+/// %y1 = extractelement <4 x i8> %y, i32 1
+/// %y2 = extractelement <4 x i8> %y, i32 2
+/// %1 = insertelement <4 x i8> undef, i8 %x0, i32 0
+/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
+/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
+/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
+/// %5 = mul <4 x i8> %4, %4
+/// %6 = extractelement <4 x i8> %5, i32 0
+/// %ins1 = insertelement <4 x i8> undef, i8 %6, i32 0
+/// %7 = extractelement <4 x i8> %5, i32 1
+/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
+/// %8 = extractelement <4 x i8> %5, i32 2
+/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
+/// %9 = extractelement <4 x i8> %5, i32 3
+/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
+/// ret <4 x i8> %ins4
+/// InstCombiner transforms this into a shuffle and vector mul
+static Optional<TargetTransformInfo::ShuffleKind>
+isShuffle(ArrayRef<Value *> VL) {
+ auto *EI0 = cast<ExtractElementInst>(VL[0]);
+ unsigned Size = EI0->getVectorOperandType()->getVectorNumElements();
+ Value *Vec1 = nullptr;
+ Value *Vec2 = nullptr;
+ enum ShuffleMode {Unknown, FirstAlternate, SecondAlternate, Permute};
+ ShuffleMode CommonShuffleMode = Unknown;
+ for (unsigned I = 0, E = VL.size(); I < E; ++I) {
+ auto *EI = cast<ExtractElementInst>(VL[I]);
+ auto *Vec = EI->getVectorOperand();
+ // All vector operands must have the same number of vector elements.
+ if (Vec->getType()->getVectorNumElements() != Size)
+ return None;
+ auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
+ if (!Idx)
+ return None;
+ // Undefined behavior if Idx is negative or >= Size.
+ if (Idx->getValue().uge(Size))
+ continue;
+ unsigned IntIdx = Idx->getValue().getZExtValue();
+ // We can extractelement from undef vector.
+ if (isa<UndefValue>(Vec))
+ continue;
+ // For correct shuffling we have to have at most 2 different vector operands
+ // in all extractelement instructions.
+ if (Vec1 && Vec2 && Vec != Vec1 && Vec != Vec2)
+ return None;
+ if (CommonShuffleMode == Permute)
+ continue;
+ // If the extract index is not the same as the operation number, it is a
+ // permutation.
+ if (IntIdx != I) {
+ CommonShuffleMode = Permute;
+ continue;
+ }
+ // Check the shuffle mode for the current operation.
+ if (!Vec1)
+ Vec1 = Vec;
+ else if (Vec != Vec1)
+ Vec2 = Vec;
+ // Example: shufflevector A, B, <0,5,2,7>
+ // I is odd and IntIdx for A == I - FirstAlternate shuffle.
+ // I is even and IntIdx for B == I - FirstAlternate shuffle.
+ // Example: shufflevector A, B, <4,1,6,3>
+ // I is even and IntIdx for A == I - SecondAlternate shuffle.
+ // I is odd and IntIdx for B == I - SecondAlternate shuffle.
+ const bool IIsEven = I & 1;
+ const bool CurrVecIsA = Vec == Vec1;
+ const bool IIsOdd = !IIsEven;
+ const bool CurrVecIsB = !CurrVecIsA;
+ ShuffleMode CurrentShuffleMode =
+ ((IIsOdd && CurrVecIsA) || (IIsEven && CurrVecIsB)) ? FirstAlternate
+ : SecondAlternate;
+ // Common mode is not set or the same as the shuffle mode of the current
+ // operation - alternate.
+ if (CommonShuffleMode == Unknown)
+ CommonShuffleMode = CurrentShuffleMode;
+ // Common shuffle mode is not the same as the shuffle mode of the current
+ // operation - permutation.
+ if (CommonShuffleMode != CurrentShuffleMode)
+ CommonShuffleMode = Permute;
+ }
+ // If we're not crossing lanes in different vectors, consider it as blending.
+ if ((CommonShuffleMode == FirstAlternate ||
+ CommonShuffleMode == SecondAlternate) &&
+ Vec2)
+ return TargetTransformInfo::SK_Alternate;
+ // If Vec2 was never used, we have a permutation of a single vector, otherwise
+ // we have permutation of 2 vectors.
+ return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
+ : TargetTransformInfo::SK_PermuteSingleSrc;
+}
+
///\returns Opcode that can be clubbed with \p Op to create an alternate
/// sequence which can later be merged as a ShuffleVector instruction.
static unsigned getAltOpcode(unsigned Op) {
if (isSplat(VL)) {
return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
}
+ if (getSameOpcode(VL) == Instruction::ExtractElement) {
+ Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
+ if (ShuffleKind.hasValue()) {
+ int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
+ for (auto *V : VL) {
+ // If all users of instruction are going to be vectorized and this
+ // instruction itself is not going to be vectorized, consider this
+ // instruction as dead and remove its cost from the final cost of the
+ // vectorized tree.
+ if (areAllUsersVectorized(cast<Instruction>(V)) &&
+ !ScalarToTreeEntry.count(V)) {
+ auto *IO = cast<ConstantInt>(
+ cast<ExtractElementInst>(V)->getIndexOperand());
+ Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
+ IO->getZExtValue());
+ }
+ }
+ return Cost;
+ }
+ }
return getGatherCost(E->Scalars);
}
unsigned Opcode = getSameOpcode(VL);
define <2 x i8> @g(<2 x i8> %x, <2 x i8> %y) {
; CHECK-LABEL: @g(
-; CHECK-NEXT: [[X0:%.*]] = extractelement <2 x i8> [[X:%.*]], i32 0
-; CHECK-NEXT: [[Y1:%.*]] = extractelement <2 x i8> [[Y:%.*]], i32 1
-; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
-; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
-; CHECK-NEXT: [[INS1:%.*]] = insertelement <2 x i8> undef, i8 [[X0X0]], i32 0
-; CHECK-NEXT: [[INS2:%.*]] = insertelement <2 x i8> [[INS1]], i8 [[Y1Y1]], i32 1
-; CHECK-NEXT: ret <2 x i8> [[INS2]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i8> [[TMP1]], [[TMP1]]
+; CHECK-NEXT: ret <2 x i8> [[TMP2]]
;
%x0 = extractelement <2 x i8> %x, i32 0
%y1 = extractelement <2 x i8> %y, i32 1
define <4 x i8> @h(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: @h(
-; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
-; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
-; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
-; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
-; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
-; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
-; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
-; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
-; CHECK-NEXT: [[INS1:%.*]] = insertelement <4 x i8> undef, i8 [[X0X0]], i32 0
-; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x i8> [[INS1]], i8 [[X3X3]], i32 1
-; CHECK-NEXT: [[INS3:%.*]] = insertelement <4 x i8> [[INS2]], i8 [[Y1Y1]], i32 2
-; CHECK-NEXT: [[INS4:%.*]] = insertelement <4 x i8> [[INS3]], i8 [[Y2Y2]], i32 3
-; CHECK-NEXT: ret <4 x i8> [[INS4]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 0, i32 3, i32 5, i32 6>
+; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]]
+; CHECK-NEXT: ret <4 x i8> [[TMP2]]
;
%x0 = extractelement <4 x i8> %x, i32 0
%x3 = extractelement <4 x i8> %x, i32 3
define <4 x i8> @h_undef(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: @h_undef(
-; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 3
-; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
-; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
-; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
-; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
-; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
-; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x i8> undef, i8 [[X3X3]], i32 1
-; CHECK-NEXT: [[INS3:%.*]] = insertelement <4 x i8> [[INS2]], i8 [[Y1Y1]], i32 2
-; CHECK-NEXT: [[INS4:%.*]] = insertelement <4 x i8> [[INS3]], i8 [[Y2Y2]], i32 3
-; CHECK-NEXT: ret <4 x i8> [[INS4]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 undef, i32 3, i32 5, i32 6>
+; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]]
+; CHECK-NEXT: ret <4 x i8> [[TMP2]]
;
%x0 = extractelement <4 x i8> undef, i32 0
%x3 = extractelement <4 x i8> %x, i32 3
define i8 @i(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: @i(
-; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
-; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
-; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
-; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
-; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
-; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
-; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
-; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
-; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
-; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]]
-; CHECK-NEXT: [[TMP3:%.*]] = add i8 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 0, i32 3, i32 5, i32 6>
+; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i8> [[TMP2]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i8> [[BIN_RDX]], <4 x i8> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i8> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[BIN_RDX2]], i32 0
; CHECK-NEXT: ret i8 [[TMP3]]
;
%x0 = extractelement <4 x i8> %x, i32 0
define i8 @j(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: @j(
-; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
-; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
-; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
-; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
-; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
-; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
-; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
-; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
-; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
-; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]]
-; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
-; CHECK-NEXT: ret i8 [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <2 x i32> <i32 0, i32 5>
+; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i8> [[TMP1]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[X]], <4 x i8> [[Y]], <2 x i32> <i32 3, i32 6>
+; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
+; CHECK-NEXT: ret i8 [[TMP8]]
;
%x0 = extractelement <4 x i8> %x, i32 0
%x3 = extractelement <4 x i8> %x, i32 3
define i8 @k(<4 x i8> %x) {
; CHECK-LABEL: @k(
-; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
-; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
-; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1
-; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2
-; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
-; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
-; CHECK-NEXT: [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
-; CHECK-NEXT: [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
-; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
-; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
-; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
-; CHECK-NEXT: ret i8 [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> undef, <2 x i32> <i32 3, i32 2>
+; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
+; CHECK-NEXT: ret i8 [[TMP8]]
;
%x0 = extractelement <4 x i8> %x, i32 0
%x3 = extractelement <4 x i8> %x, i32 3