From 05f671ecfa202087d165037516f4db151afd86c5 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Mon, 15 May 2017 20:59:32 +0000 Subject: [PATCH] Revert r302678 "[AArch64] Enable use of reduction intrinsics." This caused PR33053. Original commit message: > The new experimental reduction intrinsics can now be used, so I'm enabling this > for AArch64. We will need this for SVE anyway, so it makes sense to do this for > NEON reductions as well. > > The existing code to match shufflevector patterns are replaced with a direct > lowering of the reductions to AArch64-specific nodes. Tests updated with the > new, simpler, representation. > > Differential Revision: https://reviews.llvm.org/D32247 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@303115 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AArch64/AArch64ISelLowering.cpp | 341 +++++++++++--- lib/Target/AArch64/AArch64ISelLowering.h | 1 - .../AArch64/AArch64TargetTransformInfo.cpp | 23 - .../AArch64/AArch64TargetTransformInfo.h | 3 - test/CodeGen/AArch64/aarch64-addv.ll | 63 ++- test/CodeGen/AArch64/aarch64-minmaxv.ll | 424 +++++++++++++++--- test/CodeGen/AArch64/arm64-vabs.ll | 42 +- .../AArch64/reduction-small-size.ll | 26 +- .../SLPVectorizer/AArch64/gather-root.ll | 40 +- 9 files changed, 750 insertions(+), 213 deletions(-) diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 1af36086ad9..4f7c2e12239 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -553,6 +553,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; @@ -658,19 +659,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); - // Vector reductions - for (MVT VT : MVT::integer_valuetypes()) { - setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); - setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); - setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); - setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); - setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); - } - for (MVT VT : MVT::fp_valuetypes()) { - setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); - setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); - } - setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal); setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); // Likewise, narrowing and extending vector loads/stores aren't handled @@ -2618,14 +2606,6 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerMUL(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); - case ISD::VECREDUCE_ADD: - case ISD::VECREDUCE_SMAX: - case ISD::VECREDUCE_SMIN: - case ISD::VECREDUCE_UMAX: - case ISD::VECREDUCE_UMIN: - case ISD::VECREDUCE_FMAX: - case ISD::VECREDUCE_FMIN: - return LowerVECREDUCE(Op, DAG); } } @@ -7148,47 +7128,6 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, return Cmp; } -static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, - SelectionDAG &DAG) { - SDValue VecOp = ScalarOp.getOperand(0); - auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx, - DAG.getConstant(0, DL, MVT::i64)); -} - -SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op, - SelectionDAG &DAG) const { - SDLoc dl(Op); - switch (Op.getOpcode()) { - case ISD::VECREDUCE_ADD: - return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG); - case ISD::VECREDUCE_SMAX: - return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG); - case ISD::VECREDUCE_SMIN: - return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG); - case ISD::VECREDUCE_UMAX: - return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG); - case ISD::VECREDUCE_UMIN: - return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG); - case ISD::VECREDUCE_FMAX: { - assert(Op->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag"); - return DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), - DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32), - Op.getOperand(0)); - } - case ISD::VECREDUCE_FMIN: { - assert(Op->getFlags().hasNoNaNs() && "fmin vector reduction needs NoNaN flag"); - return DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), - DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32), - Op.getOperand(0)); - } - default: - llvm_unreachable("Unhandled reduction"); - } -} - /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment /// specified in the intrinsic calls. @@ -9551,6 +9490,266 @@ static SDValue performSTORECombine(SDNode *N, return SDValue(); } +/// This function handles the log2-shuffle pattern produced by the +/// LoopVectorizer for the across vector reduction. It consists of +/// log2(NumVectorElements) steps and, in each step, 2^(s) elements +/// are reduced, where s is an induction variable from 0 to +/// log2(NumVectorElements). +static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV, + unsigned Op, + SelectionDAG &DAG) { + EVT VTy = OpV->getOperand(0).getValueType(); + if (!VTy.isVector()) + return SDValue(); + + int NumVecElts = VTy.getVectorNumElements(); + if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { + if (NumVecElts != 4) + return SDValue(); + } else { + if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16) + return SDValue(); + } + + int NumExpectedSteps = APInt(8, NumVecElts).logBase2(); + SDValue PreOp = OpV; + // Iterate over each step of the across vector reduction. + for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) { + SDValue CurOp = PreOp.getOperand(0); + SDValue Shuffle = PreOp.getOperand(1); + if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) { + // Try to swap the 1st and 2nd operand as add and min/max instructions + // are commutative. + CurOp = PreOp.getOperand(1); + Shuffle = PreOp.getOperand(0); + if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) + return SDValue(); + } + + // Check if the input vector is fed by the operator we want to handle, + // except the last step; the very first input vector is not necessarily + // the same operator we are handling. + if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1))) + return SDValue(); + + // Check if it forms one step of the across vector reduction. + // E.g., + // %cur = add %1, %0 + // %shuffle = vector_shuffle %cur, <2, 3, u, u> + // %pre = add %cur, %shuffle + if (Shuffle.getOperand(0) != CurOp) + return SDValue(); + + int NumMaskElts = 1 << CurStep; + ArrayRef Mask = cast(Shuffle)->getMask(); + // Check mask values in each step. + // We expect the shuffle mask in each step follows a specific pattern + // denoted here by the form, where M is a sequence of integers + // starting from NumMaskElts, increasing by 1, and the number integers + // in M should be NumMaskElts. U is a sequence of UNDEFs and the number + // of undef in U should be NumVecElts - NumMaskElts. + // E.g., for <8 x i16>, mask values in each step should be : + // step 0 : <1,u,u,u,u,u,u,u> + // step 1 : <2,3,u,u,u,u,u,u> + // step 2 : <4,5,6,7,u,u,u,u> + for (int i = 0; i < NumVecElts; ++i) + if ((i < NumMaskElts && Mask[i] != (NumMaskElts + i)) || + (i >= NumMaskElts && !(Mask[i] < 0))) + return SDValue(); + + PreOp = CurOp; + } + unsigned Opcode; + bool IsIntrinsic = false; + + switch (Op) { + default: + llvm_unreachable("Unexpected operator for across vector reduction"); + case ISD::ADD: + Opcode = AArch64ISD::UADDV; + break; + case ISD::SMAX: + Opcode = AArch64ISD::SMAXV; + break; + case ISD::UMAX: + Opcode = AArch64ISD::UMAXV; + break; + case ISD::SMIN: + Opcode = AArch64ISD::SMINV; + break; + case ISD::UMIN: + Opcode = AArch64ISD::UMINV; + break; + case ISD::FMAXNUM: + Opcode = Intrinsic::aarch64_neon_fmaxnmv; + IsIntrinsic = true; + break; + case ISD::FMINNUM: + Opcode = Intrinsic::aarch64_neon_fminnmv; + IsIntrinsic = true; + break; + } + SDLoc DL(N); + + return IsIntrinsic + ? DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0), + DAG.getConstant(Opcode, DL, MVT::i32), PreOp) + : DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), + DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp), + DAG.getConstant(0, DL, MVT::i64)); +} + +/// Target-specific DAG combine for the across vector min/max reductions. +/// This function specifically handles the final clean-up step of the vector +/// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle +/// pattern, which narrows down and finds the final min/max value from all +/// elements of the vector. +/// For example, for a <16 x i8> vector : +/// svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> +/// %smax0 = smax %arr, svn0 +/// %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u> +/// %smax1 = smax %smax0, %svn1 +/// %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +/// %smax2 = smax %smax1, svn2 +/// %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +/// %sc = setcc %smax2, %svn3, gt +/// %n0 = extract_vector_elt %sc, #0 +/// %n1 = extract_vector_elt %smax2, #0 +/// %n2 = extract_vector_elt $smax2, #1 +/// %result = select %n0, %n1, n2 +/// becomes : +/// %1 = smaxv %0 +/// %result = extract_vector_elt %1, 0 +static SDValue +performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue IfTrue = N->getOperand(1); + SDValue IfFalse = N->getOperand(2); + + // Check if the SELECT merges up the final result of the min/max + // from a vector. + if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + // Expect N0 is fed by SETCC. + SDValue SetCC = N0.getOperand(0); + EVT SetCCVT = SetCC.getValueType(); + if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() || + SetCCVT.getVectorElementType() != MVT::i1) + return SDValue(); + + SDValue VectorOp = SetCC.getOperand(0); + unsigned Op = VectorOp->getOpcode(); + // Check if the input vector is fed by the operator we want to handle. + if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN && + Op != ISD::UMIN && Op != ISD::FMAXNUM && Op != ISD::FMINNUM) + return SDValue(); + + EVT VTy = VectorOp.getValueType(); + if (!VTy.isVector()) + return SDValue(); + + if (VTy.getSizeInBits() < 64) + return SDValue(); + + EVT EltTy = VTy.getVectorElementType(); + if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { + if (EltTy != MVT::f32) + return SDValue(); + } else { + if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) + return SDValue(); + } + + // Check if extracting from the same vector. + // For example, + // %sc = setcc %vector, %svn1, gt + // %n0 = extract_vector_elt %sc, #0 + // %n1 = extract_vector_elt %vector, #0 + // %n2 = extract_vector_elt $vector, #1 + if (!(VectorOp == IfTrue->getOperand(0) && + VectorOp == IfFalse->getOperand(0))) + return SDValue(); + + // Check if the condition code is matched with the operator type. + ISD::CondCode CC = cast(SetCC->getOperand(2))->get(); + if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) || + (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) || + (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) || + (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE) || + (Op == ISD::FMAXNUM && CC != ISD::SETOGT && CC != ISD::SETOGE && + CC != ISD::SETUGT && CC != ISD::SETUGE && CC != ISD::SETGT && + CC != ISD::SETGE) || + (Op == ISD::FMINNUM && CC != ISD::SETOLT && CC != ISD::SETOLE && + CC != ISD::SETULT && CC != ISD::SETULE && CC != ISD::SETLT && + CC != ISD::SETLE)) + return SDValue(); + + // Expect to check only lane 0 from the vector SETCC. + if (!isNullConstant(N0.getOperand(1))) + return SDValue(); + + // Expect to extract the true value from lane 0. + if (!isNullConstant(IfTrue.getOperand(1))) + return SDValue(); + + // Expect to extract the false value from lane 1. + if (!isOneConstant(IfFalse.getOperand(1))) + return SDValue(); + + return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG); +} + +/// Target-specific DAG combine for the across vector add reduction. +/// This function specifically handles the final clean-up step of the vector +/// add reduction produced by the LoopVectorizer. It is the log2-shuffle +/// pattern, which adds all elements of a vector together. +/// For example, for a <4 x i32> vector : +/// %1 = vector_shuffle %0, <2,3,u,u> +/// %2 = add %0, %1 +/// %3 = vector_shuffle %2, <1,u,u,u> +/// %4 = add %2, %3 +/// %result = extract_vector_elt %4, 0 +/// becomes : +/// %0 = uaddv %0 +/// %result = extract_vector_elt %0, 0 +static SDValue +performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // Check if the input vector is fed by the ADD. + if (N0->getOpcode() != ISD::ADD) + return SDValue(); + + // The vector extract idx must constant zero because we only expect the final + // result of the reduction is placed in lane 0. + if (!isNullConstant(N1)) + return SDValue(); + + EVT VTy = N0.getValueType(); + if (!VTy.isVector()) + return SDValue(); + + EVT EltTy = VTy.getVectorElementType(); + if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) + return SDValue(); + + if (VTy.getSizeInBits() < 64) + return SDValue(); + + return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG); +} /// Target-specific DAG combine function for NEON load/store intrinsics /// to merge base address updates. @@ -10229,8 +10428,12 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performBitcastCombine(N, DCI, DAG); case ISD::CONCAT_VECTORS: return performConcatVectorsCombine(N, DCI, DAG); - case ISD::SELECT: - return performSelectCombine(N, DCI); + case ISD::SELECT: { + SDValue RV = performSelectCombine(N, DCI); + if (!RV.getNode()) + RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget); + return RV; + } case ISD::VSELECT: return performVSelectCombine(N, DCI.DAG); case ISD::LOAD: @@ -10252,6 +10455,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performNVCASTCombine(N); case ISD::INSERT_VECTOR_ELT: return performPostLD1Combine(N, DCI, true); + case ISD::EXTRACT_VECTOR_ELT: + return performAcrossLaneAddReductionCombine(N, DAG, Subtarget); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast(N->getOperand(1))->getZExtValue()) { @@ -10471,14 +10676,6 @@ void AArch64TargetLowering::ReplaceNodeResults( case ISD::BITCAST: ReplaceBITCASTResults(N, Results, DAG); return; - case ISD::VECREDUCE_ADD: - case ISD::VECREDUCE_SMAX: - case ISD::VECREDUCE_SMIN: - case ISD::VECREDUCE_UMAX: - case ISD::VECREDUCE_UMIN: - Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG)); - return; - case AArch64ISD::SADDV: ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV); return; diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index ecc2517fb28..89db566c219 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -568,7 +568,6 @@ private: SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const; SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector *Created) const override; diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index f41f3ddc819..7c6f55c06bc 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -769,26 +769,3 @@ unsigned AArch64TTIImpl::getMinPrefetchStride() { unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() { return ST->getMaxPrefetchIterationsAhead(); } - -bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty, - TTI::ReductionFlags Flags) const { - assert(isa(Ty) && "Expected Ty to be a vector type"); - switch (Opcode) { - case Instruction::FAdd: - case Instruction::FMul: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - case Instruction::Mul: - return false; - case Instruction::Add: - return Ty->getScalarSizeInBits() * Ty->getVectorNumElements() >= 128; - case Instruction::ICmp: - return Ty->getScalarSizeInBits() < 64; - case Instruction::FCmp: - return Flags.NoNaN; - default: - llvm_unreachable("Unhandled reduction opcode"); - } - return false; -} diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h index c48f24a7363..39258115dcb 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -141,9 +141,6 @@ public: bool shouldExpandReduction(const IntrinsicInst *II) const { return false; } - - bool useReductionIntrinsic(unsigned Opcode, Type *Ty, - TTI::ReductionFlags Flags) const; /// @} }; diff --git a/test/CodeGen/AArch64/aarch64-addv.ll b/test/CodeGen/AArch64/aarch64-addv.ll index e65992e9913..91797c062b8 100644 --- a/test/CodeGen/AArch64/aarch64-addv.ll +++ b/test/CodeGen/AArch64/aarch64-addv.ll @@ -1,16 +1,18 @@ ; RUN: llc < %s -mtriple=aarch64-eabi -aarch64-neon-syntax=generic | FileCheck %s -; Function Attrs: nounwind readnone -declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64>) -declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32>) -declare i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>) -declare i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8>) - define i8 @add_B(<16 x i8>* %arr) { ; CHECK-LABEL: add_B ; CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b %bin.rdx = load <16 x i8>, <16 x i8>* %arr - %r = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> %bin.rdx) + %rdx.shuf0 = shufflevector <16 x i8> %bin.rdx, <16 x i8> undef, <16 x i32> + %bin.rdx0 = add <16 x i8> %bin.rdx, %rdx.shuf0 + %rdx.shuf = shufflevector <16 x i8> %bin.rdx0, <16 x i8> undef, <16 x i32> + %bin.rdx11 = add <16 x i8> %bin.rdx0, %rdx.shuf + %rdx.shuf12 = shufflevector <16 x i8> %bin.rdx11, <16 x i8> undef, <16 x i32> + %bin.rdx13 = add <16 x i8> %bin.rdx11, %rdx.shuf12 + %rdx.shuf13 = shufflevector <16 x i8> %bin.rdx13, <16 x i8> undef, <16 x i32> + %bin.rdx14 = add <16 x i8> %bin.rdx13, %rdx.shuf13 + %r = extractelement <16 x i8> %bin.rdx14, i32 0 ret i8 %r } @@ -18,7 +20,13 @@ define i16 @add_H(<8 x i16>* %arr) { ; CHECK-LABEL: add_H ; CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h %bin.rdx = load <8 x i16>, <8 x i16>* %arr - %r = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> %bin.rdx) + %rdx.shuf = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> + %bin.rdx11 = add <8 x i16> %bin.rdx, %rdx.shuf + %rdx.shuf12 = shufflevector <8 x i16> %bin.rdx11, <8 x i16> undef, <8 x i32> + %bin.rdx13 = add <8 x i16> %bin.rdx11, %rdx.shuf12 + %rdx.shuf13 = shufflevector <8 x i16> %bin.rdx13, <8 x i16> undef, <8 x i32> + %bin.rdx14 = add <8 x i16> %bin.rdx13, %rdx.shuf13 + %r = extractelement <8 x i16> %bin.rdx14, i32 0 ret i16 %r } @@ -26,7 +34,11 @@ define i32 @add_S( <4 x i32>* %arr) { ; CHECK-LABEL: add_S ; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s %bin.rdx = load <4 x i32>, <4 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %bin.rdx) + %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> + %bin.rdx11 = add <4 x i32> %bin.rdx, %rdx.shuf + %rdx.shuf12 = shufflevector <4 x i32> %bin.rdx11, <4 x i32> undef, <4 x i32> + %bin.rdx13 = add <4 x i32> %bin.rdx11, %rdx.shuf12 + %r = extractelement <4 x i32> %bin.rdx13, i32 0 ret i32 %r } @@ -34,12 +46,12 @@ define i64 @add_D(<2 x i64>* %arr) { ; CHECK-LABEL: add_D ; CHECK-NOT: addv %bin.rdx = load <2 x i64>, <2 x i64>* %arr - %r = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %bin.rdx) + %rdx.shuf0 = shufflevector <2 x i64> %bin.rdx, <2 x i64> undef, <2 x i32> + %bin.rdx0 = add <2 x i64> %bin.rdx, %rdx.shuf0 + %r = extractelement <2 x i64> %bin.rdx0, i32 0 ret i64 %r } -declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>) - define i32 @oversized_ADDV_256(i8* noalias nocapture readonly %arg1, i8* noalias nocapture readonly %arg2) { ; CHECK-LABEL: oversized_ADDV_256 ; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s @@ -54,16 +66,33 @@ entry: %7 = icmp slt <8 x i32> %6, zeroinitializer %8 = sub nsw <8 x i32> zeroinitializer, %6 %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6 - %r = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %9) - ret i32 %r + %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> + %bin.rdx = add <8 x i32> %9, %rdx.shuf + %rdx.shuf1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> + %bin.rdx2 = add <8 x i32> %bin.rdx, %rdx.shuf1 + %rdx.shuf3 = shufflevector <8 x i32> %bin.rdx2, <8 x i32> undef, <8 x i32> + %bin.rdx4 = add <8 x i32> %bin.rdx2, %rdx.shuf3 + %10 = extractelement <8 x i32> %bin.rdx4, i32 0 + ret i32 %10 } -declare i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32>) - define i32 @oversized_ADDV_512(<16 x i32>* %arr) { ; CHECK-LABEL: oversized_ADDV_512 ; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s %bin.rdx = load <16 x i32>, <16 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> %bin.rdx) + + %rdx.shuf0 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> + %bin.rdx0 = add <16 x i32> %bin.rdx, %rdx.shuf0 + + %rdx.shuf = shufflevector <16 x i32> %bin.rdx0, <16 x i32> undef, <16 x i32> + %bin.rdx11 = add <16 x i32> %bin.rdx0, %rdx.shuf + + %rdx.shuf12 = shufflevector <16 x i32> %bin.rdx11, <16 x i32> undef, <16 x i32> + %bin.rdx13 = add <16 x i32> %bin.rdx11, %rdx.shuf12 + + %rdx.shuf13 = shufflevector <16 x i32> %bin.rdx13, <16 x i32> undef, <16 x i32> + %bin.rdx14 = add <16 x i32> %bin.rdx13, %rdx.shuf13 + + %r = extractelement <16 x i32> %bin.rdx14, i32 0 ret i32 %r } diff --git a/test/CodeGen/AArch64/aarch64-minmaxv.ll b/test/CodeGen/AArch64/aarch64-minmaxv.ll index 760a8f8419f..9a56cd6ae7c 100644 --- a/test/CodeGen/AArch64/aarch64-minmaxv.ll +++ b/test/CodeGen/AArch64/aarch64-minmaxv.ll @@ -2,148 +2,344 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" -declare i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8>) -declare i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32>) -declare i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8>) -declare i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32>) - -declare i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8>) -declare i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32>) -declare i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8>) -declare i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32>) - -declare float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float>) -declare float @llvm.experimental.vector.reduce.fmin.f32.v4f32(<4 x float>) - ; CHECK-LABEL: smax_B ; CHECK: smaxv {{b[0-9]+}}, {{v[0-9]+}}.16b define i8 @smax_B(<16 x i8>* nocapture readonly %arr) { %arr.load = load <16 x i8>, <16 x i8>* %arr - %r = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> %arr.load) + %rdx.shuf = shufflevector <16 x i8> %arr.load, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp sgt <16 x i8> %arr.load, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %arr.load, <16 x i8> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp sgt <16 x i8> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp sgt <16 x i8> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp sgt <16 x i8> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt ret i8 %r } ; CHECK-LABEL: smax_H ; CHECK: smaxv {{h[0-9]+}}, {{v[0-9]+}}.8h define i16 @smax_H(<8 x i16>* nocapture readonly %arr) { - %arr.load = load <8 x i16>, <8 x i16>* %arr - %r = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> %arr.load) + %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr + %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp23 = icmp sgt <8 x i16> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf + %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp26 = icmp sgt <8 x i16> %rdx.minmax.select24, %rdx.shuf25 + %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25 + %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp29 = icmp sgt <8 x i16> %rdx.minmax.select27, %rdx.shuf28 + %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 + %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0 + %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1 + %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt ret i16 %r } ; CHECK-LABEL: smax_S ; CHECK: smaxv {{s[0-9]+}}, {{v[0-9]+}}.4s define i32 @smax_S(<4 x i32> * nocapture readonly %arr) { - %arr.load = load <4 x i32>, <4 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> %arr.load) + %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr + %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp18 = icmp sgt <4 x i32> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf + %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp21 = icmp sgt <4 x i32> %rdx.minmax.select19, %rdx.shuf20 + %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0 + %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0 + %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1 + %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt ret i32 %r } +; CHECK-LABEL: smax_D +; CHECK-NOT: smaxv +define i64 @smax_D(<2 x i64>* nocapture readonly %arr) { + %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr + %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> + %rdx.minmax.cmp18 = icmp sgt <2 x i64> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0 + %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0 + %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1 + %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt + ret i64 %r +} + + ; CHECK-LABEL: umax_B ; CHECK: umaxv {{b[0-9]+}}, {{v[0-9]+}}.16b define i8 @umax_B(<16 x i8>* nocapture readonly %arr) { - %arr.load = load <16 x i8>, <16 x i8>* %arr - %r = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> %arr.load) + %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr + %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp ugt <16 x i8> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %rdx.minmax.select, <16 x i8> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp ugt <16 x i8> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp ugt <16 x i8> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp ugt <16 x i8> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt ret i8 %r } ; CHECK-LABEL: umax_H ; CHECK: umaxv {{h[0-9]+}}, {{v[0-9]+}}.8h define i16 @umax_H(<8 x i16>* nocapture readonly %arr) { - %arr.load = load <8 x i16>, <8 x i16>* %arr - %r = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> %arr.load) + %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr + %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp23 = icmp ugt <8 x i16> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf + %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp26 = icmp ugt <8 x i16> %rdx.minmax.select24, %rdx.shuf25 + %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25 + %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp29 = icmp ugt <8 x i16> %rdx.minmax.select27, %rdx.shuf28 + %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 + %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0 + %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1 + %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt ret i16 %r } ; CHECK-LABEL: umax_S ; CHECK: umaxv {{s[0-9]+}}, {{v[0-9]+}}.4s define i32 @umax_S(<4 x i32>* nocapture readonly %arr) { - %arr.load = load <4 x i32>, <4 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> %arr.load) + %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr + %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp18 = icmp ugt <4 x i32> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf + %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp21 = icmp ugt <4 x i32> %rdx.minmax.select19, %rdx.shuf20 + %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0 + %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0 + %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1 + %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt ret i32 %r } +; CHECK-LABEL: umax_D +; CHECK-NOT: umaxv +define i64 @umax_D(<2 x i64>* nocapture readonly %arr) { + %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr + %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> + %rdx.minmax.cmp18 = icmp ugt <2 x i64> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0 + %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0 + %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1 + %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt + ret i64 %r +} + + ; CHECK-LABEL: smin_B ; CHECK: sminv {{b[0-9]+}}, {{v[0-9]+}}.16b define i8 @smin_B(<16 x i8>* nocapture readonly %arr) { - %arr.load = load <16 x i8>, <16 x i8>* %arr - %r = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> %arr.load) + %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr + %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp slt <16 x i8> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %rdx.minmax.select, <16 x i8> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp slt <16 x i8> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp slt <16 x i8> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp slt <16 x i8> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt ret i8 %r } ; CHECK-LABEL: smin_H ; CHECK: sminv {{h[0-9]+}}, {{v[0-9]+}}.8h define i16 @smin_H(<8 x i16>* nocapture readonly %arr) { - %arr.load = load <8 x i16>, <8 x i16>* %arr - %r = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> %arr.load) + %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr + %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp23 = icmp slt <8 x i16> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf + %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp26 = icmp slt <8 x i16> %rdx.minmax.select24, %rdx.shuf25 + %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25 + %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp29 = icmp slt <8 x i16> %rdx.minmax.select27, %rdx.shuf28 + %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 + %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0 + %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1 + %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt ret i16 %r } ; CHECK-LABEL: smin_S ; CHECK: sminv {{s[0-9]+}}, {{v[0-9]+}}.4s define i32 @smin_S(<4 x i32>* nocapture readonly %arr) { - %arr.load = load <4 x i32>, <4 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> %arr.load) + %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr + %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp18 = icmp slt <4 x i32> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf + %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp21 = icmp slt <4 x i32> %rdx.minmax.select19, %rdx.shuf20 + %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0 + %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0 + %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1 + %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt ret i32 %r } +; CHECK-LABEL: smin_D +; CHECK-NOT: sminv +define i64 @smin_D(<2 x i64>* nocapture readonly %arr) { + %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr + %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> + %rdx.minmax.cmp18 = icmp slt <2 x i64> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0 + %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0 + %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1 + %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt + ret i64 %r +} + + ; CHECK-LABEL: umin_B ; CHECK: uminv {{b[0-9]+}}, {{v[0-9]+}}.16b define i8 @umin_B(<16 x i8>* nocapture readonly %arr) { - %arr.load = load <16 x i8>, <16 x i8>* %arr - %r = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> %arr.load) + %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr + %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp ult <16 x i8> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %rdx.minmax.select, <16 x i8> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp ult <16 x i8> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp ult <16 x i8> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp ult <16 x i8> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt ret i8 %r } ; CHECK-LABEL: umin_H ; CHECK: uminv {{h[0-9]+}}, {{v[0-9]+}}.8h define i16 @umin_H(<8 x i16>* nocapture readonly %arr) { - %arr.load = load <8 x i16>, <8 x i16>* %arr - %r = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> %arr.load) + %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr + %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp23 = icmp ult <8 x i16> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf + %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp26 = icmp ult <8 x i16> %rdx.minmax.select24, %rdx.shuf25 + %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25 + %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp29 = icmp ult <8 x i16> %rdx.minmax.select27, %rdx.shuf28 + %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 + %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0 + %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1 + %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt ret i16 %r } ; CHECK-LABEL: umin_S ; CHECK: uminv {{s[0-9]+}}, {{v[0-9]+}}.4s define i32 @umin_S(<4 x i32>* nocapture readonly %arr) { - %arr.load = load <4 x i32>, <4 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> %arr.load) + %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr + %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp18 = icmp ult <4 x i32> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf + %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp21 = icmp ult <4 x i32> %rdx.minmax.select19, %rdx.shuf20 + %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0 + %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0 + %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1 + %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt ret i32 %r } +; CHECK-LABEL: umin_D +; CHECK-NOT: uminv +define i64 @umin_D(<2 x i64>* nocapture readonly %arr) { + %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr + %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> + %rdx.minmax.cmp18 = icmp ult <2 x i64> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0 + %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0 + %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1 + %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt + ret i64 %r +} + ; CHECK-LABEL: fmaxnm_S ; CHECK: fmaxnmv define float @fmaxnm_S(<4 x float>* nocapture readonly %arr) { - %arr.load = load <4 x float>, <4 x float>* %arr - %r = call nnan float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float> %arr.load) + %rdx.minmax.select = load <4 x float>, <4 x float>* %arr + %rdx.shuf = shufflevector <4 x float> %rdx.minmax.select, <4 x float> undef, <4 x i32> + %rdx.minmax.cmp = fcmp fast oge <4 x float> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select1 = select <4 x i1> %rdx.minmax.cmp, <4 x float> %rdx.minmax.select, <4 x float> %rdx.shuf + %rdx.shuf1 = shufflevector <4 x float> %rdx.minmax.select1, <4 x float> undef, <4 x i32> + %rdx.minmax.cmp1 = fcmp fast oge <4 x float> %rdx.minmax.select1, %rdx.shuf1 + %rdx.minmax.cmp1.elt = extractelement <4 x i1> %rdx.minmax.cmp1, i32 0 + %rdx.minmax.select1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 0 + %rdx.shuf1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 1 + %r = select i1 %rdx.minmax.cmp1.elt, float %rdx.minmax.select1.elt, float %rdx.shuf1.elt ret float %r } ; CHECK-LABEL: fminnm_S ; CHECK: fminnmv define float @fminnm_S(<4 x float>* nocapture readonly %arr) { - %arr.load = load <4 x float>, <4 x float>* %arr - %r = call nnan float @llvm.experimental.vector.reduce.fmin.f32.v4f32(<4 x float> %arr.load) + %rdx.minmax.select = load <4 x float>, <4 x float>* %arr + %rdx.shuf = shufflevector <4 x float> %rdx.minmax.select, <4 x float> undef, <4 x i32> + %rdx.minmax.cmp = fcmp fast ole <4 x float> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select1 = select <4 x i1> %rdx.minmax.cmp, <4 x float> %rdx.minmax.select, <4 x float> %rdx.shuf + %rdx.shuf1 = shufflevector <4 x float> %rdx.minmax.select1, <4 x float> undef, <4 x i32> + %rdx.minmax.cmp1 = fcmp fast ole <4 x float> %rdx.minmax.select1, %rdx.shuf1 + %rdx.minmax.cmp1.elt = extractelement <4 x i1> %rdx.minmax.cmp1, i32 0 + %rdx.minmax.select1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 0 + %rdx.shuf1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 1 + %r = select i1 %rdx.minmax.cmp1.elt, float %rdx.minmax.select1.elt, float %rdx.shuf1.elt ret float %r } -declare i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16>) - define i16 @oversized_umax_256(<16 x i16>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_umax_256 ; CHECK: umax [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h ; CHECK: umaxv {{h[0-9]+}}, [[V0]] - %arr.load = load <16 x i16>, <16 x i16>* %arr - %r = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> %arr.load) + %rdx.minmax.select = load <16 x i16>, <16 x i16>* %arr + %rdx.shuf = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp ugt <16 x i16> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp ugt <16 x i16> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp ugt <16 x i16> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp ugt <16 x i16> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt ret i16 %r } -declare i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32>) - define i32 @oversized_umax_512(<16 x i32>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_umax_512 ; CHECK: umax v @@ -151,23 +347,47 @@ define i32 @oversized_umax_512(<16 x i32>* nocapture readonly %arr) { ; CHECK-NEXT: umax [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s ; CHECK-NEXT: umaxv {{s[0-9]+}}, [[V0]] %arr.load = load <16 x i32>, <16 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> %arr.load) + %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp ugt <16 x i32> %arr.load, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp ugt <16 x i32> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp ugt <16 x i32> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp ugt <16 x i32> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt ret i32 %r } -declare i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16>) - define i16 @oversized_umin_256(<16 x i16>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_umin_256 ; CHECK: umin [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h ; CHECK: uminv {{h[0-9]+}}, [[V0]] - %arr.load = load <16 x i16>, <16 x i16>* %arr - %r = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> %arr.load) + %rdx.minmax.select = load <16 x i16>, <16 x i16>* %arr + %rdx.shuf = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp ult <16 x i16> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp ult <16 x i16> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp ult <16 x i16> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp ult <16 x i16> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt ret i16 %r } -declare i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32>) - define i32 @oversized_umin_512(<16 x i32>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_umin_512 ; CHECK: umin v @@ -175,23 +395,47 @@ define i32 @oversized_umin_512(<16 x i32>* nocapture readonly %arr) { ; CHECK-NEXT: umin [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s ; CHECK-NEXT: uminv {{s[0-9]+}}, [[V0]] %arr.load = load <16 x i32>, <16 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> %arr.load) + %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp ult <16 x i32> %arr.load, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp ult <16 x i32> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp ult <16 x i32> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp ult <16 x i32> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt ret i32 %r } -declare i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16>) - define i16 @oversized_smax_256(<16 x i16>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_smax_256 ; CHECK: smax [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h ; CHECK: smaxv {{h[0-9]+}}, [[V0]] %arr.load = load <16 x i16>, <16 x i16>* %arr - %r = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> %arr.load) + %rdx.shuf = shufflevector <16 x i16> %arr.load, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp sgt <16 x i16> %arr.load, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %arr.load, <16 x i16> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp sgt <16 x i16> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp sgt <16 x i16> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp sgt <16 x i16> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt ret i16 %r } -declare i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32>) - define i32 @oversized_smax_512(<16 x i32>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_smax_512 ; CHECK: smax v @@ -199,23 +443,47 @@ define i32 @oversized_smax_512(<16 x i32>* nocapture readonly %arr) { ; CHECK-NEXT: smax [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s ; CHECK-NEXT: smaxv {{s[0-9]+}}, [[V0]] %arr.load = load <16 x i32>, <16 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> %arr.load) + %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp sgt <16 x i32> %arr.load, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp sgt <16 x i32> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp sgt <16 x i32> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp sgt <16 x i32> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt ret i32 %r } -declare i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16>) - define i16 @oversized_smin_256(<16 x i16>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_smin_256 ; CHECK: smin [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h ; CHECK: sminv {{h[0-9]+}}, [[V0]] - %arr.load = load <16 x i16>, <16 x i16>* %arr - %r = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> %arr.load) + %rdx.minmax.select = load <16 x i16>, <16 x i16>* %arr + %rdx.shuf = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp slt <16 x i16> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp slt <16 x i16> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp slt <16 x i16> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp slt <16 x i16> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt ret i16 %r } -declare i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32>) - define i32 @oversized_smin_512(<16 x i32>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_smin_512 ; CHECK: smin v @@ -223,6 +491,20 @@ define i32 @oversized_smin_512(<16 x i32>* nocapture readonly %arr) { ; CHECK-NEXT: smin [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s ; CHECK-NEXT: sminv {{s[0-9]+}}, [[V0]] %arr.load = load <16 x i32>, <16 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> %arr.load) + %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp slt <16 x i32> %arr.load, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp slt <16 x i32> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp slt <16 x i32> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp slt <16 x i32> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt ret i32 %r } diff --git a/test/CodeGen/AArch64/arm64-vabs.ll b/test/CodeGen/AArch64/arm64-vabs.ll index ff7a0a8300e..c7b0c33550d 100644 --- a/test/CodeGen/AArch64/arm64-vabs.ll +++ b/test/CodeGen/AArch64/arm64-vabs.ll @@ -134,10 +134,8 @@ define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind { ret <2 x i64> %tmp4 } -declare i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>) - -define i16 @uabdl8h_rdx(<16 x i8>* %a, <16 x i8>* %b) { -; CHECK-LABEL: uabdl8h_rdx +define i16 @uabdl8h_log2_shuffle(<16 x i8>* %a, <16 x i8>* %b) { +; CHECK-LABEL: uabdl8h_log2_shuffle ; CHECK: uabdl2.8h ; CHECK: uabdl.8h %aload = load <16 x i8>, <16 x i8>* %a, align 1 @@ -148,14 +146,20 @@ define i16 @uabdl8h_rdx(<16 x i8>* %a, <16 x i8>* %b) { %abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer %ababs = sub nsw <16 x i16> zeroinitializer, %abdiff %absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff - %reduced_v = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> %absel) + %rdx.shuf = shufflevector <16 x i16> %absel, <16 x i16> undef, <16 x i32> + %bin1.rdx = add <16 x i16> %absel, %rdx.shuf + %rdx.shufx = shufflevector <16 x i16> %bin1.rdx, <16 x i16> undef, <16 x i32> + %bin.rdx = add <16 x i16> %bin1.rdx, %rdx.shufx + %rdx.shuf136 = shufflevector <16 x i16> %bin.rdx, <16 x i16> undef, <16 x i32> + %bin.rdx137 = add <16 x i16> %bin.rdx, %rdx.shuf136 + %rdx.shuf138 = shufflevector <16 x i16> %bin.rdx137, <16 x i16> undef, <16 x i32> + %bin.rdx139 = add <16 x i16> %bin.rdx137, %rdx.shuf138 + %reduced_v = extractelement <16 x i16> %bin.rdx139, i16 0 ret i16 %reduced_v } -declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>) - -define i32 @uabdl4s_rdx(<8 x i16>* %a, <8 x i16>* %b) { -; CHECK-LABEL: uabdl4s_rdx +define i32 @uabdl4s_log2_shuffle(<8 x i16>* %a, <8 x i16>* %b) { +; CHECK-LABEL: uabdl4s_log2_shuffle ; CHECK: uabdl2.4s ; CHECK: uabdl.4s %aload = load <8 x i16>, <8 x i16>* %a, align 1 @@ -166,14 +170,18 @@ define i32 @uabdl4s_rdx(<8 x i16>* %a, <8 x i16>* %b) { %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff - %reduced_v = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %absel) + %rdx.shuf = shufflevector <8 x i32> %absel, <8 x i32> undef, <8 x i32> + %bin.rdx = add <8 x i32> %absel, %rdx.shuf + %rdx.shuf136 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> + %bin.rdx137 = add <8 x i32> %bin.rdx, %rdx.shuf136 + %rdx.shuf138 = shufflevector <8 x i32> %bin.rdx137, <8 x i32> undef, <8 x i32> + %bin.rdx139 = add <8 x i32> %bin.rdx137, %rdx.shuf138 + %reduced_v = extractelement <8 x i32> %bin.rdx139, i32 0 ret i32 %reduced_v } -declare i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64>) - -define i64 @uabdl2d_rdx(<4 x i32>* %a, <4 x i32>* %b, i32 %h) { -; CHECK: uabdl2d_rdx +define i64 @uabdl2d_log2_shuffle(<4 x i32>* %a, <4 x i32>* %b, i32 %h) { +; CHECK: uabdl2d_log2_shuffle ; CHECK: uabdl2.2d ; CHECK: uabdl.2d %aload = load <4 x i32>, <4 x i32>* %a, align 1 @@ -184,7 +192,11 @@ define i64 @uabdl2d_rdx(<4 x i32>* %a, <4 x i32>* %b, i32 %h) { %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff - %reduced_v = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> %absel) + %rdx.shuf136 = shufflevector <4 x i64> %absel, <4 x i64> undef, <4 x i32> + %bin.rdx137 = add <4 x i64> %absel, %rdx.shuf136 + %rdx.shuf138 = shufflevector <4 x i64> %bin.rdx137, <4 x i64> undef, <4 x i32> + %bin.rdx139 = add <4 x i64> %bin.rdx137, %rdx.shuf138 + %reduced_v = extractelement <4 x i64> %bin.rdx139, i16 0 ret i64 %reduced_v } diff --git a/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll b/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll index 9d9aea00e9a..be08a63b212 100644 --- a/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll +++ b/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll @@ -20,7 +20,15 @@ target triple = "aarch64--linux-gnu" ; CHECK: add <16 x i8> ; ; CHECK: middle.block: -; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> +; CHECK: shufflevector <16 x i8> +; CHECK: add <16 x i8> +; CHECK: shufflevector <16 x i8> +; CHECK: add <16 x i8> +; CHECK: shufflevector <16 x i8> +; CHECK: add <16 x i8> +; CHECK: shufflevector <16 x i8> +; CHECK: add <16 x i8> +; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = extractelement <16 x i8> ; CHECK: zext i8 [[Rdx]] to i32 ; define i8 @reduction_i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) { @@ -75,7 +83,13 @@ for.body: ; CHECK: add <8 x i16> ; ; CHECK: middle.block: -; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> +; CHECK: shufflevector <8 x i16> +; CHECK: add <8 x i16> +; CHECK: shufflevector <8 x i16> +; CHECK: add <8 x i16> +; CHECK: shufflevector <8 x i16> +; CHECK: add <8 x i16> +; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = extractelement <8 x i16> ; CHECK: zext i16 [[Rdx]] to i32 ; define i16 @reduction_i16_1(i16* nocapture readonly %a, i16* nocapture readonly %b, i32 %n) { @@ -132,7 +146,13 @@ for.body: ; CHECK: add <8 x i16> ; ; CHECK: middle.block: -; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> +; CHECK: shufflevector <8 x i16> +; CHECK: add <8 x i16> +; CHECK: shufflevector <8 x i16> +; CHECK: add <8 x i16> +; CHECK: shufflevector <8 x i16> +; CHECK: add <8 x i16> +; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = extractelement <8 x i16> ; CHECK: zext i16 [[Rdx]] to i32 ; define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) { diff --git a/test/Transforms/SLPVectorizer/AArch64/gather-root.ll b/test/Transforms/SLPVectorizer/AArch64/gather-root.ll index 68d6ebd27a5..b7fa5452f25 100644 --- a/test/Transforms/SLPVectorizer/AArch64/gather-root.ll +++ b/test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -11,8 +11,14 @@ target triple = "aarch64--linux-gnu" ; DEFAULT-LABEL: @PR28330( ; DEFAULT: %tmp17 = phi i32 [ %bin.extra, %for.body ], [ 0, %entry ] ; DEFAULT: %[[S0:.+]] = select <8 x i1> %1, <8 x i32> , <8 x i32> -; DEFAULT: %[[Rdx:.+]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %[[S0]]) -; DEFAULT: %bin.extra = add i32 %[[Rdx]], %tmp17 +; DEFAULT: %[[R0:.+]] = shufflevector <8 x i32> %[[S0]], <8 x i32> undef, <8 x i32> +; DEFAULT: %[[R1:.+]] = add <8 x i32> %[[S0]], %[[R0]] +; DEFAULT: %[[R2:.+]] = shufflevector <8 x i32> %[[R1]], <8 x i32> undef, <8 x i32> +; DEFAULT: %[[R3:.+]] = add <8 x i32> %[[R1]], %[[R2]] +; DEFAULT: %[[R4:.+]] = shufflevector <8 x i32> %[[R3]], <8 x i32> undef, <8 x i32> +; DEFAULT: %[[R5:.+]] = add <8 x i32> %[[R3]], %[[R4]] +; DEFAULT: %[[R6:.+]] = extractelement <8 x i32> %[[R5]], i32 0 +; DEFAULT: %bin.extra = add i32 %[[R6]], %tmp17 ; ; GATHER-LABEL: @PR28330( ; GATHER: %tmp17 = phi i32 [ %bin.extra, %for.body ], [ 0, %entry ] @@ -32,8 +38,14 @@ target triple = "aarch64--linux-gnu" ; GATHER: %[[I5:.+]] = insertelement <8 x i32> %[[I4]], i32 %tmp29, i32 5 ; GATHER: %[[I6:.+]] = insertelement <8 x i32> %[[I5]], i32 %tmp31, i32 6 ; GATHER: %[[I7:.+]] = insertelement <8 x i32> %[[I6]], i32 %tmp33, i32 7 -; GATHER: %[[Rdx:.+]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %[[I7]]) -; GATHER: %bin.extra = add i32 %[[Rdx]], %tmp17 +; GATHER: %[[R0:.+]] = shufflevector <8 x i32> %[[I7]], <8 x i32> undef, <8 x i32> +; GATHER: %[[R1:.+]] = add <8 x i32> %[[I7]], %[[R0]] +; GATHER: %[[R2:.+]] = shufflevector <8 x i32> %[[R1]], <8 x i32> undef, <8 x i32> +; GATHER: %[[R3:.+]] = add <8 x i32> %[[R1]], %[[R2]] +; GATHER: %[[R4:.+]] = shufflevector <8 x i32> %[[R3]], <8 x i32> undef, <8 x i32> +; GATHER: %[[R5:.+]] = add <8 x i32> %[[R3]], %[[R4]] +; GATHER: %[[R6:.+]] = extractelement <8 x i32> %[[R5]], i32 0 +; GATHER: %bin.extra = add i32 %[[R6]], %tmp17 ; ; MAX-COST-LABEL: @PR28330( ; MAX-COST-NOT: shufflevector @@ -95,8 +107,14 @@ define void @PR32038(i32 %n) { ; DEFAULT-NEXT: [[TMP28:%.*]] = add i32 [[TMP26]], undef ; DEFAULT-NEXT: [[TMP30:%.*]] = add i32 [[TMP28]], undef ; DEFAULT-NEXT: [[TMP32:%.*]] = add i32 [[TMP30]], undef -; DEFAULT-NEXT: [[Rdx:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP2]]) -; DEFAULT-NEXT: [[BIN_EXTRA]] = add i32 [[Rdx]], -5 +; DEFAULT-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> +; DEFAULT-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP2]], [[RDX_SHUF]] +; DEFAULT-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; DEFAULT-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; DEFAULT-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; DEFAULT-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; DEFAULT-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; DEFAULT-NEXT: [[BIN_EXTRA]] = add i32 [[TMP3]], -5 ; DEFAULT-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], undef ; DEFAULT-NEXT: br label [[FOR_BODY]] ; @@ -144,8 +162,14 @@ define void @PR32038(i32 %n) { ; GATHER-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP29]], i32 5 ; GATHER-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP31]], i32 6 ; GATHER-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP33]], i32 7 -; GATHER-NEXT: [[Rdx:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP7]]) -; GATHER-NEXT: [[BIN_EXTRA]] = add i32 [[Rdx]], -5 +; GATHER-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> undef, <8 x i32> +; GATHER-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP7]], [[RDX_SHUF]] +; GATHER-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; GATHER-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; GATHER-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; GATHER-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; GATHER-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; GATHER-NEXT: [[BIN_EXTRA]] = add i32 [[TMP8]], -5 ; GATHER-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], [[TMP33]] ; GATHER-NEXT: br label [[FOR_BODY]] ; -- 2.40.0