STATISTIC(NumTailCalls, "Number of tail calls");
+cl::opt<bool> ExperimentalVectorWideningLegalization(
+ "x86-experimental-vector-widening-legalization", cl::init(true),
+ cl::desc("Enable an experimental vector type legalization through widening "
+ "rather than promotion."),
+ cl::Hidden);
+
static cl::opt<int> ExperimentalPrefLoopAlignment(
"x86-experimental-pref-loop-alignment", cl::init(4),
cl::desc("Sets the preferable loop alignment for experiments "
setOperationAction(ISD::UREM, VT, Custom);
}
+ if (!ExperimentalVectorWideningLegalization) {
+ setOperationAction(ISD::MUL, MVT::v2i16, Custom);
+ setOperationAction(ISD::MUL, MVT::v2i32, Custom);
+ setOperationAction(ISD::MUL, MVT::v4i16, Custom);
+ }
+
setOperationAction(ISD::MUL, MVT::v2i8, Custom);
setOperationAction(ISD::MUL, MVT::v4i8, Custom);
setOperationAction(ISD::MUL, MVT::v8i8, Custom);
setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
+ if (!ExperimentalVectorWideningLegalization) {
+ // Use widening instead of promotion.
+ for (auto VT : { MVT::v8i8, MVT::v4i8, MVT::v2i8,
+ MVT::v4i16, MVT::v2i16 }) {
+ setOperationAction(ISD::UADDSAT, VT, Custom);
+ setOperationAction(ISD::SADDSAT, VT, Custom);
+ setOperationAction(ISD::USUBSAT, VT, Custom);
+ setOperationAction(ISD::SSUBSAT, VT, Custom);
+ }
+ }
+
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+ // Provide custom widening for v2f32 setcc. This is really for VLX when
+ // the setcc result type is v2i1/v4i1 for v2f32/v4f32, leading to type
+ // legalization changing the result type to v4i1 during widening.
+ // It works fine for SSE2 and is probably faster so no need to qualify with
+ // VLX support.
+ if (!ExperimentalVectorWideningLegalization)
+ setOperationAction(ISD::SETCC, MVT::v2i32, Custom);
+
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
+ // We support custom legalizing of sext and anyext loads for specific
+ // memory vector types which we can load as a scalar (or sequence of
+ // scalars) and extend in-register to a legal 128-bit vector type. For sext
+ // loads these must work with a single scalar load.
+ if (!ExperimentalVectorWideningLegalization) {
+ for (MVT VT : MVT::integer_vector_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
+ }
+ }
+
for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
+ if (ExperimentalVectorWideningLegalization) {
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
+ } else {
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
+ }
// In the customized shift lowering, the legal v4i32/v2i64 cases
// in AVX2 will be recognized.
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
}
+ if (!ExperimentalVectorWideningLegalization) {
+ // Avoid narrow result types when widening. The legal types are listed
+ // in the next loop.
+ for (MVT VT : MVT::integer_vector_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
+ }
+ }
+
// SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
+ if (!ExperimentalVectorWideningLegalization)
+ setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
- // Need to custom widen this if we don't have AVX512BW.
- setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom);
+ if (ExperimentalVectorWideningLegalization) {
+ // Need to custom widen this if we don't have AVX512BW.
+ setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom);
+ }
for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FFLOOR, VT, Legal);
if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
return TypeSplitVector;
- if (VT.getVectorNumElements() != 1 &&
+ if (ExperimentalVectorWideningLegalization &&
+ VT.getVectorNumElements() != 1 &&
VT.getVectorElementType() != MVT::i1)
return TypeWidenVector;
// Custom legalize v8i8->v8i64 on CPUs without avx512bw.
if (InVT == MVT::v8i8) {
- if (VT != MVT::v8i64)
+ if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64)
return SDValue();
In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
+ if (!ExperimentalVectorWideningLegalization) {
+ // Without vector widening we need to manually construct X86 specific
+ // nodes and an unpcklqdq.
+ Lo = DAG.getNode(X86ISD::VTRUNC, DL, VT, Lo);
+ Hi = DAG.getNode(X86ISD::VTRUNC, DL, VT, Hi);
+
+ // Manually concat the truncates using a shuffle.
+ unsigned NumElts = VT.getVectorNumElements();
+ SmallVector<int, 16> ShufMask(NumElts);
+ for (unsigned i = 0; i != NumElts / 2; ++i)
+ ShufMask[i] = i;
+ for (unsigned i = NumElts / 2; i != NumElts; ++i)
+ ShufMask[i] = i + (NumElts / 2);
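+ // For example, with NumElts == 4 this produces <0, 1, 4, 5>, i.e. the low
+ // half of Lo followed by the low half of Hi.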
+ return DAG.getVectorShuffle(VT, DL, Lo, Hi, ShufMask);
+ }
+
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
// Custom legalize v8i8->v8i64 on CPUs without avx512bw.
if (InVT == MVT::v8i8) {
- if (VT != MVT::v8i64)
+ if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64)
return SDValue();
In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
return SDValue();
}
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
"Unexpected VT");
- assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
- TargetLowering::TypeWidenVector && "Unexpected type action!");
+ if (DAG.getTargetLoweringInfo().getTypeAction(*DAG.getContext(), StoreVT) !=
+ TargetLowering::TypeWidenVector)
+ return SDValue();
- EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
+ MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(),
+ StoreVT.getVectorNumElements() * 2);
StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
DAG.getUNDEF(StoreVT));
LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
SDLoc dl(Ld);
+ EVT MemVT = Ld->getMemoryVT();
// Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
if (RegVT.getVectorElementType() == MVT::i1) {
- assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
+ assert(EVT(RegVT) == MemVT && "Expected non-extending load");
assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
"Expected AVX512F without AVX512DQI");
return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
}
- return SDValue();
+ if (ExperimentalVectorWideningLegalization)
+ return SDValue();
+
+ // Nothing useful we can do without SSE2 shuffles.
+ assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned RegSz = RegVT.getSizeInBits();
+
+ ISD::LoadExtType Ext = Ld->getExtensionType();
+
+ assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
+ && "Only anyext and sext are currently implemented.");
+ assert(MemVT != RegVT && "Cannot extend to the same type");
+ assert(MemVT.isVector() && "Must load a vector from memory");
+
+ unsigned NumElems = RegVT.getVectorNumElements();
+ unsigned MemSz = MemVT.getSizeInBits();
+ assert(RegSz > MemSz && "Register size must be greater than the mem size");
+
+ if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
+ // The only way in which we have a legal 256-bit vector result but not the
+ // integer 256-bit operations needed to directly lower a sextload is if we
+ // have AVX1 but not AVX2. In that case, we can always emit a sextload to
+ // a 128-bit vector and a normal sign_extend to 256-bits that should get
+ // correctly legalized. We do this late to allow the canonical form of
+ // sextload to persist throughout the rest of the DAG combiner -- it wants
+ // to fold together any extensions it can, and so will fuse a sign_extend
+ // of an sextload into a sextload targeting a wider value.
+ SDValue Load;
+ if (MemSz == 128) {
+ // Just switch this to a normal load.
+ assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
+ "it must be a legal 128-bit vector "
+ "type!");
+ Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags());
+ } else {
+ assert(MemSz < 128 &&
+ "Can't extend a type wider than 128 bits to a 256 bit vector!");
+ // Do an sext load to a 128-bit vector type. We want to use the same
+ // number of elements, but elements half as wide. This will end up being
+ // recursively lowered by this routine, but will succeed as we definitely
+ // have all the necessary features if we're using AVX1.
+ EVT HalfEltVT =
+ EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
+ EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
+ Load =
+ DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags());
+ }
+
+ // Replace chain users with the new chain.
+ assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
+
+ // Finally, do a normal sign-extend to the desired register.
+ SDValue SExt = DAG.getSExtOrTrunc(Load, dl, RegVT);
+ return DAG.getMergeValues({SExt, Load.getValue(1)}, dl);
+ }
+
+ // All sizes must be a power of two.
+ assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
+ "Non-power-of-two elements are not custom lowered!");
+
+ // Attempt to load the original value using scalar loads.
+ // Find the largest scalar type that divides the total loaded size.
+ MVT SclrLoadTy = MVT::i8;
+ for (MVT Tp : MVT::integer_valuetypes()) {
+ if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
+ SclrLoadTy = Tp;
+ }
+ }
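+ // e.g. a 64-bit MemVT picks i64 on 64-bit targets; on 32-bit targets it
+ // picks i32, and the f64 check below takes over.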
+
+ // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to F64.
+ if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
+ (64 <= MemSz))
+ SclrLoadTy = MVT::f64;
+
+ // Calculate the number of scalar loads that we need to perform
+ // in order to load our vector from memory.
+ unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
+
+ assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
+ "Can only lower sext loads with a single scalar load!");
+
+ unsigned loadRegSize = RegSz;
+ if (Ext == ISD::SEXTLOAD && RegSz >= 256)
+ loadRegSize = 128;
+
+ // If we don't have BWI we won't be able to create the shuffle needed for
+ // v8i8->v8i64.
+ if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
+ MemVT == MVT::v8i8)
+ loadRegSize = 128;
+
+ // Represent our vector as a sequence of elements which are the
+ // largest scalar that we can load.
+ EVT LoadUnitVecVT = EVT::getVectorVT(
+ *DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits());
+
+ // Represent the data using the same element type that is stored in
+ // memory. In practice, we "widen" MemVT.
+ EVT WideVecVT =
+ EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+ loadRegSize / MemVT.getScalarSizeInBits());
+
+ assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
+ "Invalid vector type");
+
+ // We can't shuffle using an illegal type.
+ assert(TLI.isTypeLegal(WideVecVT) &&
+ "We only lower types that form legal widened vector types");
+
+ SmallVector<SDValue, 8> Chains;
+ SDValue Ptr = Ld->getBasePtr();
+ unsigned OffsetInc = SclrLoadTy.getSizeInBits() / 8;
+ SDValue Increment = DAG.getConstant(OffsetInc, dl,
+ TLI.getPointerTy(DAG.getDataLayout()));
+ SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
+
+ unsigned Offset = 0;
+ for (unsigned i = 0; i < NumLoads; ++i) {
+ unsigned NewAlign = MinAlign(Ld->getAlignment(), Offset);
+
+ // Perform a single load.
+ SDValue ScalarLoad =
+ DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr,
+ Ld->getPointerInfo().getWithOffset(Offset),
+ NewAlign, Ld->getMemOperand()->getFlags());
+ Chains.push_back(ScalarLoad.getValue(1));
+ // Create the first element type using SCALAR_TO_VECTOR in order to avoid
+ // another round of DAGCombining.
+ if (i == 0)
+ Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
+ else
+ Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
+ ScalarLoad, DAG.getIntPtrConstant(i, dl));
+
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+ Offset += OffsetInc;
+ }
+
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+
+ // Bitcast the loaded value to a vector of the original element type, in
+ // the size of the target vector type.
+ SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
+ unsigned SizeRatio = RegSz / MemSz;
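+ // e.g. a 128-bit RegVT with a 32-bit MemVT gives SizeRatio == 4.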
+
+ if (Ext == ISD::SEXTLOAD) {
+ SDValue Sext = getExtendInVec(ISD::SIGN_EXTEND, dl, RegVT, SlicedVec, DAG);
+ return DAG.getMergeValues({Sext, TF}, dl);
+ }
+
+ if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
+ MemVT == MVT::v8i8) {
+ SDValue Sext = getExtendInVec(ISD::ZERO_EXTEND, dl, RegVT, SlicedVec, DAG);
+ return DAG.getMergeValues({Sext, TF}, dl);
+ }
+
+ // Redistribute the loaded elements into the different locations.
+ SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i * SizeRatio] = i;
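+ // e.g. with NumElems == 4 and SizeRatio == 4 the mask is
+ // <0,-1,-1,-1, 1,-1,-1,-1, 2,-1,-1,-1, 3,-1,-1,-1>, placing each loaded
+ // element at the start of its widened slot.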
+
+ SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+ DAG.getUNDEF(WideVecVT), ShuffleVec);
+
+ // Bitcast to the requested type.
+ Shuff = DAG.getBitcast(RegVT, Shuff);
+ return DAG.getMergeValues({Shuff, TF}, dl);
}
/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
SDValue Chain = N->getChain();
SDValue BasePtr = N->getBasePtr();
- if (VT == MVT::v2f32 || VT == MVT::v2i32) {
+ if (VT == MVT::v2f32) {
assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
// If the index is v2i64 and we have VLX we can use xmm for data and index.
if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
- Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
+ DAG.getUNDEF(MVT::v2f32));
SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
return SDValue();
}
+ if (VT == MVT::v2i32) {
+ assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
+ DAG.getUNDEF(MVT::v2i32));
+ // If the index is v2i64 and we have VLX we can use xmm for data and index.
+ if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
+ SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
+ SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
+ SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
+ VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
+ return SDValue(NewScatter.getNode(), 1);
+ }
+ // Custom widen all the operands to avoid promotion.
+ EVT NewIndexVT = EVT::getVectorVT(
+ *DAG.getContext(), Index.getValueType().getVectorElementType(), 4);
+ Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
+ DAG.getUNDEF(Index.getValueType()));
+ Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
+ DAG.getConstant(0, dl, MVT::v2i1));
+ SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
+ return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), N->getMemoryVT(), dl,
+ Ops, N->getMemOperand(), N->getIndexType());
+ }
+
MVT IndexVT = Index.getSimpleValueType();
MVT MaskVT = Mask.getSimpleValueType();
}
case ISD::MUL: {
EVT VT = N->getValueType(0);
- assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
- VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
- // Pre-promote these to vXi16 to avoid op legalization thinking all 16
- // elements are needed.
- MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
- SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
- SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
- SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
- Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
- unsigned NumConcats = 16 / VT.getVectorNumElements();
- SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
- ConcatOps[0] = Res;
- Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
- Results.push_back(Res);
+ assert(VT.isVector() && "Unexpected VT");
+ if (getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger &&
+ VT.getVectorNumElements() == 2) {
+ // Promote to a pattern that will be turned into PMULUDQ.
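+ // PMULUDQ multiplies the low 32 bits of each 64-bit lane; the low 32 bits
+ // of that product are the same as for an i32 multiply, so the TRUNCATE
+ // below recovers the v2i32 result.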
+ SDValue N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64,
+ N->getOperand(0));
+ SDValue N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64,
+ N->getOperand(1));
+ SDValue Mul = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, N0, N1);
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, VT, Mul));
+ } else if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
+ VT.getVectorElementType() == MVT::i8) {
+ // Pre-promote these to vXi16 to avoid op legalization thinking all 16
+ // elements are needed.
+ MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
+ SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
+ SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
+ SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
+ Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ unsigned NumConcats = 16 / VT.getVectorNumElements();
+ SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
+ ConcatOps[0] = Res;
+ Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
+ Results.push_back(Res);
+ }
return;
}
+ case ISD::UADDSAT:
+ case ISD::SADDSAT:
+ case ISD::USUBSAT:
+ case ISD::SSUBSAT:
case X86ISD::VPMADDWD:
case X86ISD::AVG: {
// Legalize types for ISD::UADDSAT/SADDSAT/USUBSAT/SSUBSAT and
EVT InVT = N->getOperand(0).getValueType();
assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
"Expected a VT that divides into 128 bits.");
- assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
- "Unexpected type action!");
unsigned NumConcat = 128 / InVT.getSizeInBits();
EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
+ if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
+ DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
return;
}
Results.push_back(Hi);
return;
}
+ case ISD::SETCC: {
+ // Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when
+ // setCC result type is v2i1 because type legalzation will end up with
+ // a v4i1 setcc plus an extend.
+ assert(N->getValueType(0) == MVT::v2i32 && "Unexpected type");
+ if (N->getOperand(0).getValueType() != MVT::v2f32 ||
+ getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector)
+ return;
+ SDValue UNDEF = DAG.getUNDEF(MVT::v2f32);
+ SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+ N->getOperand(0), UNDEF);
+ SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+ N->getOperand(1), UNDEF);
+ SDValue Res = DAG.getNode(ISD::SETCC, dl, MVT::v4i32, LHS, RHS,
+ N->getOperand(2));
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
+ DAG.getIntPtrConstant(0, dl));
+ Results.push_back(Res);
+ return;
+ }
// We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
case X86ISD::FMINC:
case X86ISD::FMIN:
case ISD::SREM:
case ISD::UREM: {
EVT VT = N->getValueType(0);
- if (VT.isVector()) {
- assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
- "Unexpected type action!");
+ if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector) {
// If this RHS is a constant splat vector we can widen this and let
// division/remainder by constant optimize it.
// TODO: Can we do something for non-splat?
return;
}
+ if (VT == MVT::v2i32) {
+ // Legalize v2i32 div/rem by unrolling. Otherwise we would promote to
+ // v2i64 and unroll later, but that creates i64 scalar ops which
+ // might be slow in 64-bit mode or require a libcall in 32-bit mode.
+ Results.push_back(DAG.UnrollVectorOp(N));
+ return;
+ }
+
+ if (VT.isVector())
+ return;
+
LLVM_FALLTHROUGH;
}
case ISD::SDIVREM:
}
case ISD::TRUNCATE: {
MVT VT = N->getSimpleValueType(0);
- assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
- "Unexpected type action!");
+ if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
+ return;
// The generic legalizer will try to widen the input type to the same
// number of elements as the widened result type. But this isn't always
}
return;
}
+ case ISD::SIGN_EXTEND_VECTOR_INREG: {
+ if (ExperimentalVectorWideningLegalization)
+ return;
+
+ EVT VT = N->getValueType(0);
+ SDValue In = N->getOperand(0);
+ EVT InVT = In.getValueType();
+ if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
+ (InVT == MVT::v16i16 || InVT == MVT::v32i8)) {
+ // Custom split this so we can extend i8/i16->i32 invec. This is better
+ // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
+ // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
+ // we allow the sra from the extend to i32 to be shared by the split.
+ EVT ExtractVT = EVT::getVectorVT(*DAG.getContext(),
+ InVT.getVectorElementType(),
+ InVT.getVectorNumElements() / 2);
+ MVT ExtendVT = MVT::getVectorVT(MVT::i32,
+ VT.getVectorNumElements());
+ In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ExtractVT,
+ In, DAG.getIntPtrConstant(0, dl));
+ In = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, MVT::v4i32, In);
+
+ // Fill a vector with sign bits for each element.
+ SDValue Zero = DAG.getConstant(0, dl, ExtendVT);
+ SDValue SignBits = DAG.getSetCC(dl, ExtendVT, Zero, In, ISD::SETGT);
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+
+ // Create an unpackl and unpackh to interleave the sign bits then bitcast
+ // to vXi64.
+ SDValue Lo = getUnpackl(DAG, dl, ExtendVT, In, SignBits);
+ Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
+ SDValue Hi = getUnpackh(DAG, dl, ExtendVT, In, SignBits);
+ Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
+
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+ Results.push_back(Res);
+ return;
+ }
+ return;
+ }
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND: {
EVT VT = N->getValueType(0);
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
- (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
- assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
- "Unexpected type action!");
+ (InVT == MVT::v4i16 || InVT == MVT::v4i8) &&
+ getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector) {
assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
// Custom split this so we can extend i8/i16->i32 invec. This is better
// since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
SDValue Src = N->getOperand(0);
EVT SrcVT = Src.getValueType();
+ // Promote these manually to avoid over-promotion to v2i64. Type
+ // legalization will revisit the v2i32 operation for more cleanup.
+ if ((VT == MVT::v2i8 || VT == MVT::v2i16) &&
+ getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger) {
+ // AVX512DQ provides instructions that produce a v2i64 result.
+ if (Subtarget.hasDQI())
+ return;
+
+ SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v2i32, Src);
+ Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext
+ : ISD::AssertSext,
+ dl, MVT::v2i32, Res,
+ DAG.getValueType(VT.getVectorElementType()));
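+ // Assert[SZ]ext records that the v2i32 value is already a zero/sign
+ // extension of the narrow element type, so the TRUNCATE below loses no
+ // information.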
+ Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ Results.push_back(Res);
+ return;
+ }
+
if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
- assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
- "Unexpected type action!");
+ if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
+ return;
// Try to create a 128 bit vector, but don't exceed a 32 bit element.
unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
assert((IsSigned || Subtarget.hasAVX512()) &&
"Can only handle signed conversion without AVX512");
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
- assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
- "Unexpected type action!");
+ bool Widenv2i32 =
+ getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector;
if (Src.getValueType() == MVT::v2f64) {
+ unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
if (!IsSigned && !Subtarget.hasVLX()) {
- // If we have VLX we can emit a target specific FP_TO_UINT node,
- // otherwise we can defer to the generic legalizer which will widen
- // the input as well. This will be further widened during op
- // legalization to v8i32<-v8f64.
- return;
+ // If v2i32 is widened, we can defer to the generic legalizer.
+ if (Widenv2i32)
+ return;
+ // Custom widen by doubling to a legal vector width. ISel will
+ // further widen to v8f64.
+ Opc = ISD::FP_TO_UINT;
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64,
+ Src, DAG.getUNDEF(MVT::v2f64));
}
- unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
SDValue Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
+ if (!Widenv2i32)
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
+ DAG.getIntPtrConstant(0, dl));
+ Results.push_back(Res);
+ return;
+ }
+ if (SrcVT == MVT::v2f32 &&
+ getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) {
+ SDValue Idx = DAG.getIntPtrConstant(0, dl);
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
+ DAG.getUNDEF(MVT::v2f32));
+ Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
+ : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
Results.push_back(Res);
return;
}
return;
}
- assert(!VT.isVector() && "Vectors should have been handled above!");
-
if (Subtarget.hasDQI() && VT == MVT::i64 &&
(SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
assert(!Subtarget.is64Bit() && "i64 should be legal");
return;
}
- if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
- assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
- "Unexpected type action!");
+ if (DstVT.isVector() && SrcVT == MVT::x86mmx &&
+ getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector) {
EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, WideVT, N->getOperand(0));
Results.push_back(Res);
return;
}
+ if (SrcVT != MVT::f64 ||
+ (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8) ||
+ getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector)
+ return;
+
+ unsigned NumElts = DstVT.getVectorNumElements();
+ EVT SVT = DstVT.getVectorElementType();
+ EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
+ SDValue Res;
+ Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, N->getOperand(0));
+ Res = DAG.getBitcast(WiderVT, Res);
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, Res,
+ DAG.getIntPtrConstant(0, dl));
+ Results.push_back(Res);
return;
}
case ISD::MGATHER: {
EVT VT = N->getValueType(0);
- if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
- (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
+ if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
auto *Gather = cast<MaskedGatherSDNode>(N);
SDValue Index = Gather->getIndex();
if (Index.getValueType() != MVT::v2i64)
return;
- assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
- "Unexpected type action!");
- EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
SDValue Mask = Gather->getMask();
assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
- SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
+ SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
Gather->getPassThru(),
- DAG.getUNDEF(VT));
+ DAG.getUNDEF(MVT::v2f32));
if (!Subtarget.hasVLX()) {
// We need to widen the mask, but the instruction will only use 2
// of its elements. So we can use undef.
SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
Gather->getBasePtr(), Index, Gather->getScale() };
SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- DAG.getVTList(WideVT, Mask.getValueType(), MVT::Other), Ops, dl,
+ DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,
Gather->getMemoryVT(), Gather->getMemOperand());
Results.push_back(Res);
Results.push_back(Res.getValue(2));
return;
}
+ if (VT == MVT::v2i32) {
+ auto *Gather = cast<MaskedGatherSDNode>(N);
+ SDValue Index = Gather->getIndex();
+ SDValue Mask = Gather->getMask();
+ assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
+ SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,
+ Gather->getPassThru(),
+ DAG.getUNDEF(MVT::v2i32));
+ // If the index is v2i64 we can use it directly.
+ if (Index.getValueType() == MVT::v2i64 &&
+ (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
+ if (!Subtarget.hasVLX()) {
+ // We need to widen the mask, but the instruction will only use 2
+ // of its elements. So we can use undef.
+ Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
+ DAG.getUNDEF(MVT::v2i1));
+ Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
+ }
+ SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
+ Gather->getBasePtr(), Index, Gather->getScale() };
+ SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
+ DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
+ Gather->getMemoryVT(), Gather->getMemOperand());
+ SDValue Chain = Res.getValue(2);
+ if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
+ DAG.getIntPtrConstant(0, dl));
+ Results.push_back(Res);
+ Results.push_back(Chain);
+ return;
+ }
+ if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) {
+ EVT IndexVT = Index.getValueType();
+ EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
+ IndexVT.getScalarType(), 4);
+ // Otherwise we need to custom widen everything to avoid promotion.
+ Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
+ DAG.getUNDEF(IndexVT));
+ Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
+ DAG.getConstant(0, dl, MVT::v2i1));
+ SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
+ Gather->getBasePtr(), Index, Gather->getScale() };
+ SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
+ Gather->getMemoryVT(), dl, Ops,
+ Gather->getMemOperand(),
+ Gather->getIndexType());
+ SDValue Chain = Res.getValue(1);
+ if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
+ DAG.getIntPtrConstant(0, dl));
+ Results.push_back(Res);
+ Results.push_back(Chain);
+ return;
+ }
+ }
return;
}
case ISD::LOAD: {
// cast since type legalization will try to use an i64 load.
MVT VT = N->getSimpleValueType(0);
assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
- assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
- "Unexpected type action!");
+ if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
+ return;
if (!ISD::isNON_EXTLoad(N))
return;
auto *Ld = cast<LoadSDNode>(N);
Ld->getPointerInfo(), Ld->getAlignment(),
Ld->getMemOperand()->getFlags());
SDValue Chain = Res.getValue(1);
- MVT VecVT = MVT::getVectorVT(LdVT, 2);
- Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
- EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
- Res = DAG.getBitcast(WideVT, Res);
+ MVT WideVT = MVT::getVectorVT(LdVT, 2);
+ Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res);
+ MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(),
+ VT.getVectorNumElements() * 2);
+ Res = DAG.getBitcast(CastVT, Res);
Results.push_back(Res);
Results.push_back(Chain);
return;
return HAddSub;
}
+ // During Type Legalization, when promoting illegal vector types,
+ // the backend might introduce new shuffle dag nodes and bitcasts.
+ //
+ // This code performs the following transformation:
+ // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
+ // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
+ //
+ // We do this only if both the bitcast and the BINOP dag nodes have
+ // one use. Also, perform this transformation only if the new binary
+ // operation is legal. This is to avoid introducing dag nodes that
+ // potentially need to be further expanded (or custom lowered) into a
+ // less optimal sequence of dag nodes.
+ if (!ExperimentalVectorWideningLegalization &&
+ !DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
+ N->getOpcode() == ISD::VECTOR_SHUFFLE &&
+ N->getOperand(0).getOpcode() == ISD::BITCAST &&
+ N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ SDValue BC0 = N0.getOperand(0);
+ EVT SVT = BC0.getValueType();
+ unsigned Opcode = BC0.getOpcode();
+ unsigned NumElts = VT.getVectorNumElements();
+
+ if (BC0.hasOneUse() && SVT.isVector() &&
+ SVT.getVectorNumElements() * 2 == NumElts &&
+ TLI.isOperationLegal(Opcode, VT)) {
+ bool CanFold = false;
+ switch (Opcode) {
+ default : break;
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ // isOperationLegal lies for integer ops on floating point types.
+ CanFold = VT.isInteger();
+ break;
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ // isOperationLegal lies for floating point ops on integer types.
+ CanFold = VT.isFloatingPoint();
+ break;
+ }
+
+ unsigned SVTNumElts = SVT.getVectorNumElements();
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
+ CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
+ for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
+ CanFold = SVOp->getMaskElt(i) < 0;
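+ // i.e. the mask must be <0, 2, 4, ..., -1, -1, ...>: the even lanes of the
+ // bitcast in order, with the upper half of the result undef.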
+
+ if (CanFold) {
+ SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
+ SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
+ SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
+ return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
+ }
+ }
+ }
+
// Attempt to combine into a vector load/broadcast.
if (SDValue LD = combineToConsecutiveLoads(VT, N, dl, DAG, Subtarget, true))
return LD;
}
}
+
+ // Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the
+ // operands is an extend from v2i32 to v2i64. Turn it into a pmulld.
+ // FIXME: This can probably go away once we default to widening legalization.
+ if (!ExperimentalVectorWideningLegalization &&
+ Subtarget.hasSSE41() && VT == MVT::v4i32 &&
+ N->getOpcode() == ISD::VECTOR_SHUFFLE &&
+ N->getOperand(0).getOpcode() == ISD::BITCAST &&
+ N->getOperand(0).getOperand(0).getOpcode() == X86ISD::PMULUDQ) {
+ SDValue BC = N->getOperand(0);
+ SDValue MULUDQ = BC.getOperand(0);
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ ArrayRef<int> Mask = SVOp->getMask();
+ if (BC.hasOneUse() && MULUDQ.hasOneUse() &&
+ Mask[0] == 0 && Mask[1] == 2 && Mask[2] == -1 && Mask[3] == -1) {
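+ // A mask of <0, 2, -1, -1> selects the low 32 bits of each 64-bit product.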
+ SDValue Op0 = MULUDQ.getOperand(0);
+ SDValue Op1 = MULUDQ.getOperand(1);
+ if (Op0.getOpcode() == ISD::BITCAST &&
+ Op0.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
+ Op0.getOperand(0).getValueType() == MVT::v4i32) {
+ ShuffleVectorSDNode *SVOp0 =
+ cast<ShuffleVectorSDNode>(Op0.getOperand(0));
+ ArrayRef<int> Mask2 = SVOp0->getMask();
+ if (Mask2[0] == 0 && Mask2[1] == -1 &&
+ Mask2[2] == 1 && Mask2[3] == -1) {
+ Op0 = SVOp0->getOperand(0);
+ Op1 = DAG.getBitcast(MVT::v4i32, Op1);
+ Op1 = DAG.getVectorShuffle(MVT::v4i32, dl, Op1, Op1, Mask);
+ return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1);
+ }
+ }
+ if (Op1.getOpcode() == ISD::BITCAST &&
+ Op1.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
+ Op1.getOperand(0).getValueType() == MVT::v4i32) {
+ ShuffleVectorSDNode *SVOp1 =
+ cast<ShuffleVectorSDNode>(Op1.getOperand(0));
+ ArrayRef<int> Mask2 = SVOp1->getMask();
+ if (Mask2[0] == 0 && Mask2[1] == -1 &&
+ Mask2[2] == 1 && Mask2[3] == -1) {
+ Op0 = DAG.getBitcast(MVT::v4i32, Op0);
+ Op0 = DAG.getVectorShuffle(MVT::v4i32, dl, Op0, Op0, Mask);
+ Op1 = SVOp1->getOperand(0);
+ return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1);
+ }
+ }
+ }
+ }
+
return SDValue();
}
SDLoc DL(ExtElt);
- if (VecVT == MVT::v8i8) {
+ if (ExperimentalVectorWideningLegalization && VecVT == MVT::v8i8) {
// Pad with undef.
Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
DAG.getUNDEF(VecVT));
// Since SKX these selects have a proper lowering.
if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
CondVT.getVectorElementType() == MVT::i1 &&
+ (ExperimentalVectorWideningLegalization ||
+ VT.getVectorNumElements() > 4) &&
(VT.getVectorElementType() == MVT::i8 ||
VT.getVectorElementType() == MVT::i16)) {
Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
if ((NumElts % 2) != 0)
return SDValue();
+ unsigned RegSize = 128;
+ MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
// Shrink the operands of mul.
SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
- // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
- // lower part is needed.
- SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
- if (Mode == MULU8 || Mode == MULS8)
- return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
- DL, VT, MulLo);
+ if (ExperimentalVectorWideningLegalization ||
+ NumElts >= OpsVT.getVectorNumElements()) {
+ // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
+ // lower part is needed.
+ SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
+ if (Mode == MULU8 || Mode == MULS8)
+ return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
+ DL, VT, MulLo);
+
+ MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
+ // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
+ // the higher part is also needed.
+ SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
+ ReducedVT, NewN0, NewN1);
+
+ // Repack the lower part and higher part result of mul into a wider
+ // result.
+ // Generate shuffle functioning as punpcklwd.
+ SmallVector<int, 16> ShuffleMask(NumElts);
+ for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
+ ShuffleMask[2 * i] = i;
+ ShuffleMask[2 * i + 1] = i + NumElts;
+ }
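+ // e.g. for NumElts == 8 this is <0,8,1,9,2,10,3,11>, interleaving the low
+ // halves of MulLo and MulHi like punpcklwd.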
+ SDValue ResLo =
+ DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
+ ResLo = DAG.getBitcast(ResVT, ResLo);
+ // Generate shuffle functioning as punpckhwd.
+ for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
+ ShuffleMask[2 * i] = i + NumElts / 2;
+ ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
+ }
+ SDValue ResHi =
+ DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
+ ResHi = DAG.getBitcast(ResVT, ResHi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
+ }
+
+ // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
+ // to legalize the mul explicitly because implicit legalization for type
+ // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
+ // instructions which will not exist when we explicitly legalize it by
+ // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
+ // <4 x i16> undef).
+ //
+ // Legalize the operands of mul.
+ // FIXME: We may be able to handle non-concatenated vectors by insertion.
+ unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
+ if ((RegSize % ReducedSizeInBits) != 0)
+ return SDValue();
+
+ SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
+ DAG.getUNDEF(ReducedVT));
+ Ops[0] = NewN0;
+ NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
+ Ops[0] = NewN1;
+ NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
+
+ if (Mode == MULU8 || Mode == MULS8) {
+ // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
+ // part is needed.
+ SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
+
+ // Convert the type of the mul result to VT.
+ MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
+ SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
+ : ISD::SIGN_EXTEND_VECTOR_INREG,
+ DL, ResVT, Mul);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+ }
- MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
- // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
- // the higher part is also needed.
+ // Generate the lower and higher parts of the mul: pmullw and
+ // pmulhw/pmulhuw. For MULU16/MULS16, both parts are needed.
+ SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
- ReducedVT, NewN0, NewN1);
+ OpsVT, NewN0, NewN1);
// Repack the lower part and higher part result of mul into a wider
- // result.
- // Generate shuffle functioning as punpcklwd.
- SmallVector<int, 16> ShuffleMask(NumElts);
- for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
- ShuffleMask[2 * i] = i;
- ShuffleMask[2 * i + 1] = i + NumElts;
- }
- SDValue ResLo =
- DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
- ResLo = DAG.getBitcast(ResVT, ResLo);
- // Generate shuffle functioning as punpckhwd.
- for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
- ShuffleMask[2 * i] = i + NumElts / 2;
- ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
- }
- SDValue ResHi =
- DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
- ResHi = DAG.getBitcast(ResVT, ResHi);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
+ // result. Make sure the type of mul result is VT.
+ MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
+ SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi);
+ Res = DAG.getBitcast(ResVT, Res);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
}
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
// Make sure the vXi16 type is legal. This covers the AVX512 without BWI case.
// Also allow v2i32 if it will be widened.
MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
- if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(WVT))
+ if (!((ExperimentalVectorWideningLegalization && VT == MVT::v2i32) ||
+ DAG.getTargetLoweringInfo().isTypeLegal(WVT)))
return SDValue();
SDValue N0 = N->getOperand(0);
return Blend;
}
- return SDValue();
+ if (ExperimentalVectorWideningLegalization)
+ return SDValue();
+
+ if (Mld->getExtensionType() != ISD::EXTLOAD)
+ return SDValue();
+
+ // Resolve extending loads.
+ EVT VT = Mld->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+ EVT LdVT = Mld->getMemoryVT();
+ SDLoc dl(Mld);
+
+ assert(LdVT != VT && "Cannot extend to the same type");
+ unsigned ToSz = VT.getScalarSizeInBits();
+ unsigned FromSz = LdVT.getScalarSizeInBits();
+ // From/To sizes and ElemCount must be pow of two.
+ assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
+ "Unexpected size for extending masked load");
+
+ unsigned SizeRatio = ToSz / FromSz;
+ assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
+
+ // Create a type on which we perform the shuffle.
+ EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+ LdVT.getScalarType(), NumElems*SizeRatio);
+ assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
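+ // e.g. an extending masked load from v4i16 memory into v4i32 gives
+ // WideVecVT == v8i16.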
+
+ // Convert PassThru value.
+ SDValue WidePassThru = DAG.getBitcast(WideVecVT, Mld->getPassThru());
+ if (!Mld->getPassThru().isUndef()) {
+ SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
+
+ // Can't shuffle using an illegal type.
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
+ "WideVecVT should be legal");
+ WidePassThru = DAG.getVectorShuffle(WideVecVT, dl, WidePassThru,
+ DAG.getUNDEF(WideVecVT), ShuffleVec);
+ }
+
+ // Prepare the new mask.
+ SDValue NewMask;
+ SDValue Mask = Mld->getMask();
+ if (Mask.getValueType() == VT) {
+ // Mask and original value have the same type.
+ NewMask = DAG.getBitcast(WideVecVT, Mask);
+ SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
+ for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
+ ShuffleVec[i] = NumElems * SizeRatio;
+ NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
+ DAG.getConstant(0, dl, WideVecVT),
+ ShuffleVec);
+ } else {
+ assert(Mask.getValueType().getVectorElementType() == MVT::i1);
+ unsigned WidenNumElts = NumElems*SizeRatio;
+ unsigned MaskNumElts = VT.getVectorNumElements();
+ EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ WidenNumElts);
+
+ unsigned NumConcat = WidenNumElts / MaskNumElts;
+ SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
+ SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
+ Ops[0] = Mask;
+ NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
+ }
+
+ SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
+ Mld->getBasePtr(), NewMask, WidePassThru,
+ Mld->getMemoryVT(), Mld->getMemOperand(),
+ ISD::NON_EXTLOAD);
+
+ SDValue SlicedVec = DAG.getBitcast(WideVecVT, WideLd);
+ SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i * SizeRatio] = i;
+
+ // Can't shuffle using an illegal type.
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
+ "WideVecVT should be legal");
+ SlicedVec = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+ DAG.getUNDEF(WideVecVT), ShuffleVec);
+ SlicedVec = DAG.getBitcast(VT, SlicedVec);
+
+ return DCI.CombineTo(N, SlicedVec, WideLd.getValue(1), true);
}
/// If exactly one element of the mask is set for a non-truncating masked store,
return SDValue();
EVT VT = Mst->getValue().getValueType();
+ EVT StVT = Mst->getMemoryVT();
SDLoc dl(Mst);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (Mst->isTruncatingStore())
+ if (!Mst->isTruncatingStore()) {
+ if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
+ return ScalarStore;
+
+ // If the mask value has been legalized to a non-boolean vector, try to
+ // simplify ops leading up to it. We only demand the MSB of each lane.
+ SDValue Mask = Mst->getMask();
+ if (Mask.getScalarValueSizeInBits() != 1) {
+ APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits()));
+ if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
+ return SDValue(N, 0);
+ }
+
+ SDValue Value = Mst->getValue();
+ if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
+ TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
+ Mst->getMemoryVT())) {
+ return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
+ Mst->getBasePtr(), Mask,
+ Mst->getMemoryVT(), Mst->getMemOperand(), true);
+ }
+
+ return SDValue();
+ }
+
+ if (ExperimentalVectorWideningLegalization)
return SDValue();
- if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
- return ScalarStore;
+ // Resolve truncating stores.
+ unsigned NumElems = VT.getVectorNumElements();
- // If the mask value has been legalized to a non-boolean vector, try to
- // simplify ops leading up to it. We only demand the MSB of each lane.
+ assert(StVT != VT && "Cannot truncate to the same type");
+ unsigned FromSz = VT.getScalarSizeInBits();
+ unsigned ToSz = StVT.getScalarSizeInBits();
+
+ // The truncating store is legal in some cases. For example
+ // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
+ // are designated for truncate store.
+ // In this case we don't need any further transformations.
+ if (TLI.isTruncStoreLegal(VT, StVT))
+ return SDValue();
+
+ // From/To sizes and ElemCount must be pow of two.
+ assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
+ "Unexpected size for truncating masked store");
+ // We are going to use the original vector elt for storing.
+ // Accumulated smaller vector elements must be a multiple of the store size.
+ assert (((NumElems * FromSz) % ToSz) == 0 &&
+ "Unexpected ratio for truncating masked store");
+
+ unsigned SizeRatio = FromSz / ToSz;
+ assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
+
+ // Create a type on which we perform the shuffle.
+ EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+ StVT.getScalarType(), NumElems*SizeRatio);
+
+ assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
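+ // e.g. truncating a v2i64 value to v2i32 in memory gives WideVecVT == v4i32.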
+
+ SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
+ SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
+
+ // Can't shuffle using an illegal type.
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
+ "WideVecVT should be legal");
+
+ SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
+ DAG.getUNDEF(WideVecVT),
+ ShuffleVec);
+
+ SDValue NewMask;
SDValue Mask = Mst->getMask();
- if (Mask.getScalarValueSizeInBits() != 1) {
- APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits()));
- if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
- return SDValue(N, 0);
- }
+ if (Mask.getValueType() == VT) {
+ // Mask and original value have the same type.
+ NewMask = DAG.getBitcast(WideVecVT, Mask);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
+ for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
+ ShuffleVec[i] = NumElems*SizeRatio;
+ NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
+ DAG.getConstant(0, dl, WideVecVT),
+ ShuffleVec);
+ } else {
+ assert(Mask.getValueType().getVectorElementType() == MVT::i1);
+ unsigned WidenNumElts = NumElems*SizeRatio;
+ unsigned MaskNumElts = VT.getVectorNumElements();
+ EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ WidenNumElts);
- SDValue Value = Mst->getValue();
- if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
- TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
- Mst->getMemoryVT())) {
- return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
- Mst->getBasePtr(), Mask,
- Mst->getMemoryVT(), Mst->getMemOperand(), true);
+ unsigned NumConcat = WidenNumElts / MaskNumElts;
+ SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
+ SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
+ Ops[0] = Mask;
+ NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
}
- return SDValue();
+ return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
+ Mst->getBasePtr(), NewMask, StVT,
+ Mst->getMemOperand(), false);
}
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
MVT::v16i8, St->getMemOperand());
}
+ // Look for a truncating store to a less than 128 bit vector that has been
+ // truncated from an any_extend_inreg from a 128 bit vector with the same
+ // element size. We can use a 64/32/16-bit extractelement and store that.
+ // Disabling this when widening legalization is in effect since the trunc
+ // store would have been unlikely to be created in that case. Only doing this
+ // when truncstore is legal since it would otherwise be decomposed below and
+ // then combined away.
+ if (St->isTruncatingStore() && TLI.isTruncStoreLegal(VT, StVT) &&
+ StoredVal.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG &&
+ StoredVal.getValueType().is128BitVector() &&
+ !ExperimentalVectorWideningLegalization) {
+ EVT OrigVT = StoredVal.getOperand(0).getValueType();
+ if (OrigVT.is128BitVector() &&
+ OrigVT.getVectorElementType() == StVT.getVectorElementType()) {
+ unsigned StoreSize = StVT.getSizeInBits();
+ assert((128 % StoreSize == 0) && "Unexpected store size!");
+ MVT IntVT = MVT::getIntegerVT(StoreSize);
+ MVT CastVT = MVT::getVectorVT(IntVT, 128 / StoreSize);
+ StoredVal = DAG.getBitcast(CastVT, StoredVal.getOperand(0));
+ // Use extract_store for the 64-bit case to support 32-bit targets.
+ if (IntVT == MVT::i64) {
+ SDVTList Tys = DAG.getVTList(MVT::Other);
+ SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
+ return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
+ IntVT, St->getMemOperand());
+ }
+
+ // Otherwise just use an extract and store.
+ StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, IntVT, StoredVal,
+ DAG.getIntPtrConstant(0, dl));
+ return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+ St->getMemOperand());
+ }
+ }
+
// Optimize trunc store (of multiple scalars) to shuffle and store.
// First, pack all of the elements in one place. Next, store to memory
// in fewer chunks.
St->getMemoryVT(), St->getMemOperand(), DAG);
}
- return SDValue();
+ if (ExperimentalVectorWideningLegalization)
+ return SDValue();
+
+ unsigned NumElems = VT.getVectorNumElements();
+ assert(StVT != VT && "Cannot truncate to the same type");
+ unsigned FromSz = VT.getScalarSizeInBits();
+ unsigned ToSz = StVT.getScalarSizeInBits();
+
+ // The truncating store is legal in some cases. For example
+ // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
+ // are designated for truncate store.
+ // In this case we don't need any further transformations.
+ if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
+ return SDValue();
+
+ // From, To sizes and ElemCount must be pow of two
+ if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
+ // We are going to use the original vector elt for storing.
+ // Accumulated smaller vector elements must be a multiple of the store size.
+ if (0 != (NumElems * FromSz) % ToSz) return SDValue();
+
+ unsigned SizeRatio = FromSz / ToSz;
+
+ assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
+
+ // Create a type on which we perform the shuffle
+ EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+ StVT.getScalarType(), NumElems*SizeRatio);
+
+ assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+ SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
+ SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
+
+ // Can't shuffle using an illegal type.
+ if (!TLI.isTypeLegal(WideVecVT))
+ return SDValue();
+
+ SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
+ DAG.getUNDEF(WideVecVT),
+ ShuffleVec);
+ // At this point all of the data is stored at the bottom of the
+ // register. We now need to save it to mem.
+
+ // Find the largest store unit
+ MVT StoreType = MVT::i8;
+ for (MVT Tp : MVT::integer_valuetypes()) {
+ if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
+ StoreType = Tp;
+ }
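+ // e.g. a v8i8 truncating store (ToSz * NumElems == 64) selects i64 where
+ // it is legal.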
+
+ // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to F64.
+ if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
+ (64 <= NumElems * ToSz))
+ StoreType = MVT::f64;
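+ // For instance, a v4i32->v4i8 truncating store writes its 32 payload bits
+ // with a single i32 store, while v2i64->v2i32 on a 32-bit target writes its
+ // 64 payload bits with a single f64 store instead of two i32 stores.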
+
+ // Bitcast the original vector into a vector of store-size units
+ EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
+ StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
+ assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
+ SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
+ SmallVector<SDValue, 8> Chains;
+ SDValue Ptr = St->getBasePtr();
+
+ // Perform one or more big stores into memory.
+ for (unsigned i = 0, e = (ToSz * NumElems) / StoreType.getSizeInBits();
+ i != e; ++i) {
+ SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ StoreType, ShuffWide,
+ DAG.getIntPtrConstant(i, dl));
+ SDValue Ch =
+ DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
+ St->getAlignment(), St->getMemOperand()->getFlags());
+ Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
+ Chains.push_back(Ch);
+ }
+
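+ // Join the partial stores with a TokenFactor so later users of the chain
+ // depend on every piece of the original store.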
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
}
// Turn load->store of MMX types into GPR load/stores. This avoids clobbering
bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
bool F64IsLegal =
!Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
- if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
+ if (((VT.isVector() && !VT.isFloatingPoint() &&
+ !ExperimentalVectorWideningLegalization) ||
+ (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
isa<LoadSDNode>(St->getValue()) &&
!cast<LoadSDNode>(St->getValue())->isVolatile() &&
St->getChain().hasOneUse() && !St->isVolatile()) {
// Only handle vXi16 types that are at least 128-bits unless they will be
// widened.
- if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
+ if (!VT.isVector() || VT.getVectorElementType() != MVT::i16 ||
+ (!ExperimentalVectorWideningLegalization &&
+ VT.getVectorNumElements() < 8))
return SDValue();
// Input type should be vXi32.
DAG.getConstant(EltSizeInBits - 1, DL, VT));
}
+/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
+/// ZERO_EXTEND_VECTOR_INREG. This requires splitting the input (or
+/// concatenating it with UNDEFs) into vectors of the same size as the target
+/// type, which are then extended from their lowest elements.
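+/// For example (illustrative): (v8i16 (sext (v8i8 X))) becomes
+/// (v8i16 (sign_extend_vector_inreg (v16i8 (concat_vectors X, undef)))).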
+static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ if (ExperimentalVectorWideningLegalization)
+ return SDValue();
+
+ unsigned Opcode = N->getOpcode();
+ // TODO - add ANY_EXTEND support.
+ if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
+ return SDValue();
+ if (!DCI.isBeforeLegalizeOps())
+ return SDValue();
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ EVT SVT = VT.getScalarType();
+ EVT InVT = N0.getValueType();
+ EVT InSVT = InVT.getScalarType();
+
+ // FIXME: Generic DAGCombiner previously had a bug that would cause a
+ // sign_extend of setcc to sometimes return the original node, tricking it
+ // into thinking CombineTo had been used, which prevented the target combines
+ // from running.
+ // Early out here to avoid regressions like this:
+ //  (v4i32 (sext (v4i1 (setcc (v4i16)))))
+ //  becomes
+ //  (v4i32 (sext_invec (v8i16 (concat (v4i16 (setcc (v4i16))), undef))))
+ //  and is type legalized to
+ //  (v4i32 (sext_invec (v8i16 (trunc_invec (v4i32 (setcc (v4i32)))))))
+ //  leading to a packssdw+pmovsxwd.
+ // We could write a DAG combine to fix this, but really we shouldn't be
+ // creating a sext_invec that forces v8i16 into the DAG.
+ if (N0.getOpcode() == ISD::SETCC)
+ return SDValue();
+
+ // The result must be a vector with at least two elements, and we must be
+ // extending between legal integer element types.
+ if (!VT.isVector() || VT.getVectorNumElements() < 2)
+ return SDValue();
+ if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
+ return SDValue();
+ if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
+ return SDValue();
+
+ // If the input/output types are both legal then we have at least AVX1 and
+ // we will be able to use SIGN_EXTEND/ZERO_EXTEND directly.
+ if (DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+ DAG.getTargetLoweringInfo().isTypeLegal(InVT))
+ return SDValue();
+
+ SDLoc DL(N);
+
+ auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
+ EVT SrcVT = N.getValueType();
+ EVT DstVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(),
+ Size / SrcVT.getScalarSizeInBits());
+ SmallVector<SDValue, 8> Opnds(Size / SrcVT.getSizeInBits(),
+ DAG.getUNDEF(SrcVT));
+ Opnds[0] = N;
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Opnds);
+ };
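+ // For instance, ExtendVecSize(DL, X, 128) with X of type v4i16 returns
+ // (v8i16 (concat_vectors X, undef)).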
+
+ // If the target size is less than 128 bits, widen the input to a type whose
+ // extension is 128 bits wide, extend that, and extract the original target
+ // vector from the result.
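+ // For instance, (v2i32 (sext (v2i16 X))) is widened to
+ // (v4i16 (concat_vectors X, undef)), sign extended to v4i32, and the low
+ // v2i32 subvector is extracted.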
+ if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
+ unsigned Scale = 128 / VT.getSizeInBits();
+ EVT ExVT =
+ EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
+ SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
+ SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
+ // If the target size is 128 bits (or 256 bits on an AVX target, or 512 bits
+ // with AVX-512 registers), convert to ISD::*_EXTEND_VECTOR_INREG, which
+ // ensures lowering to X86ISD::V*EXT.
+ // Also use this if we don't have SSE41, to allow the legalizer to do its job.
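+ // For instance, (v4i32 (zext (v4i8 X))) becomes
+ // (v4i32 (zero_extend_vector_inreg (v16i8 (concat_vectors X, undef, undef,
+ // undef)))).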
+ if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
+ (VT.is256BitVector() && Subtarget.hasAVX()) ||
+ (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
+ SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
+ Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
+ return DAG.getNode(Opcode, DL, VT, ExOp);
+ }
+
+ auto SplitAndExtendInReg = [&](unsigned SplitSize) {
+ unsigned NumVecs = VT.getSizeInBits() / SplitSize;
+ unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
+ EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
+ EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
+
+ unsigned IROpc = getOpcode_EXTEND_VECTOR_INREG(Opcode);
+ SmallVector<SDValue, 8> Opnds;
+ for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
+ SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
+ DAG.getIntPtrConstant(Offset, DL));
+ SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
+ SrcVec = DAG.getNode(IROpc, DL, SubVT, SrcVec);
+ Opnds.push_back(SrcVec);
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
+ };
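+ // For instance, SplitAndExtendInReg(128) on (v8i32 (sext (v8i16 X)))
+ // extracts the two v4i16 halves of X, widens each to v8i16, sign extends
+ // each in-register to v4i32, and concatenates the results into a v8i32.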
+
+ // On pre-AVX targets, split into 128-bit nodes of
+ // ISD::*_EXTEND_VECTOR_INREG.
+ if (!Subtarget.hasAVX() && !(VT.getSizeInBits() % 128))
+ return SplitAndExtendInReg(128);
+
+ // On pre-AVX512 targets, split into 256-bit nodes of
+ // ISD::*_EXTEND_VECTOR_INREG.
+ if (!Subtarget.useAVX512Regs() && !(VT.getSizeInBits() % 256))
+ return SplitAndExtendInReg(256);
+
+ return SDValue();
+}
+
// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
// result type.
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
}
+ if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
+ return V;
+
if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
return V;
if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
return V;
+ if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
+ return V;
+
if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
return V;
// go through type promotion to a 128-bit vector.
if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
VT.getVectorElementType() == MVT::i1 &&
+ (ExperimentalVectorWideningLegalization ||
+ VT.getVectorNumElements() > 4) &&
(OpVT.getVectorElementType() == MVT::i8 ||
OpVT.getVectorElementType() == MVT::i16)) {
SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS,
}
}
+ // Combine (ext_invec (ext_invec X)) -> (ext_invec X).
+ // This is disabled under widening legalization for now. We can enable it if
+ // we find a case that needs it. Otherwise it can be deleted once we switch
+ // to widening legalization.
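+ // For instance, (v4i32 (sext_invec (v8i16 (sext_invec (v16i8 X))))) folds
+ // to (v4i32 (sext_invec (v16i8 X))) when v4i32 and v16i8 are both legal.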
+ if (!ExperimentalVectorWideningLegalization &&
+ In.getOpcode() == N->getOpcode() &&
+ TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getOperand(0).getValueType()))
+ return DAG.getNode(N->getOpcode(), SDLoc(N), VT, In.getOperand(0));
+
// Attempt to combine as a shuffle.
// TODO: SSE41 support
if (Subtarget.hasAVX() && N->getOpcode() != ISD::SIGN_EXTEND_VECTOR_INREG) {