return ConstsNode;
}
-static SDValue getConstVector(ArrayRef<APInt> Values, SmallBitVector &Undefs,
+static SDValue getConstVector(ArrayRef<APInt> Bits, SmallBitVector &Undefs,
MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
- assert(Values.size() == Undefs.size() && "Unequal constant and undef arrays");
+ assert(Bits.size() == Undefs.size() && "Unequal constant and undef arrays");
SmallVector<SDValue, 32> Ops;
bool Split = false;
}
MVT EltVT = ConstVecVT.getVectorElementType();
- for (unsigned i = 0, e = Values.size(); i != e; ++i) {
+ for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
if (Undefs[i]) {
Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
continue;
}
- const APInt &V = Values[i];
+ const APInt &V = Bits[i];
assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
if (Split) {
Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
+ } else if (EltVT == MVT::f32) {
+ APFloat FV(APFloat::IEEEsingle, V);
+ Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
+ } else if (EltVT == MVT::f64) {
+ APFloat FV(APFloat::IEEEdouble, V);
+ Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
} else {
Ops.push_back(DAG.getConstant(V, dl, EltVT));
}
return dyn_cast<Constant>(CNode->getConstVal());
}
+// Extract constant bits from constant pool vector.
+static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
+ SmallBitVector &UndefElts,
+ SmallVectorImpl<APInt> &EltBits) {
+ assert(UndefElts.empty() && "Expected an empty UndefElts vector");
+ assert(EltBits.empty() && "Expected an empty EltBits vector");
+
+ EVT VT = Op.getValueType();
+ unsigned SizeInBits = VT.getSizeInBits();
+ assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
+ unsigned NumElts = SizeInBits / EltSizeInBits;
+
+ auto *Cst = getTargetConstantFromNode(Op);
+ if (!Cst)
+ return false;
+
+ Type *CstTy = Cst->getType();
+ if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
+ return false;
+
+ // Extract all the undef/constant element data and pack into single bitsets.
+ APInt UndefBits(SizeInBits, 0);
+ APInt MaskBits(SizeInBits, 0);
+
+ unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
+ for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i) {
+ auto *COp = Cst->getAggregateElement(i);
+ if (!COp ||
+ !(isa<UndefValue>(COp) || isa<ConstantInt>(COp) ||
+ isa<ConstantFP>(COp)))
+ return false;
+
+ if (isa<UndefValue>(COp)) {
+ APInt EltUndef = APInt::getLowBitsSet(SizeInBits, CstEltSizeInBits);
+ UndefBits |= EltUndef.shl(i * CstEltSizeInBits);
+ continue;
+ }
+
+ APInt Bits;
+ if (auto *CInt = dyn_cast<ConstantInt>(COp))
+ Bits = CInt->getValue();
+ else if (auto *CFP = dyn_cast<ConstantFP>(COp))
+ Bits = CFP->getValueAPF().bitcastToAPInt();
+
+ Bits = Bits.zextOrTrunc(SizeInBits);
+ MaskBits |= Bits.shl(i * CstEltSizeInBits);
+ }
+
+ UndefElts = SmallBitVector(NumElts, false);
+ EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
+
+ // Now extract the undef/constant bit data into the target elts.
+ for (unsigned i = 0; i != NumElts; ++i) {
+ APInt UndefEltBits = UndefBits.lshr(i * EltSizeInBits);
+ UndefEltBits = UndefEltBits.zextOrTrunc(EltSizeInBits);
+
+ // Only treat the element as UNDEF if all bits are UNDEF, otherwise
+ // treat it as zero.
+ if (UndefEltBits.isAllOnesValue()) {
+ UndefElts[i] = true;
+ continue;
+ }
+
+ APInt Bits = MaskBits.lshr(i * EltSizeInBits);
+ Bits = Bits.zextOrTrunc(EltSizeInBits);
+ EltBits[i] = Bits.getZExtValue();
+ }
+
+ return true;
+}
+
static bool getTargetShuffleMaskIndices(SDValue MaskNode,
unsigned MaskEltSizeInBits,
SmallVectorImpl<uint64_t> &RawMask) {
return false;
}
+// Attempt to constant fold all of the constant source ops.
+// Returns true if the entire shuffle is folded to a constant.
+// TODO: Extend this to merge multiple constant Ops and update the mask.
+static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
+ ArrayRef<int> Mask, SDValue Root,
+ bool HasVariableMask, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ MVT VT = Root.getSimpleValueType();
+
+ unsigned SizeInBits = VT.getSizeInBits();
+ unsigned NumMaskElts = Mask.size();
+ unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
+ unsigned NumOps = Ops.size();
+
+ // Extract constant bits from each source op.
+ bool OneUseConstantOp = false;
+ SmallVector<SmallBitVector, 4> UndefEltsOps(NumOps);
+ SmallVector<SmallVector<APInt, 8>, 4> RawBitsOps(NumOps);
+ for (unsigned i = 0; i != NumOps; ++i) {
+ SDValue SrcOp = Ops[i];
+ OneUseConstantOp |= SrcOp.hasOneUse();
+ if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
+ RawBitsOps[i]))
+ return false;
+ }
+
+ // Only fold if at least one of the constants is only used once or
+ // the combined shuffle has included a variable mask shuffle, this
+ // is to avoid constant pool bloat.
+ if (!OneUseConstantOp && !HasVariableMask)
+ return false;
+
+ // Shuffle the constant bits according to the mask.
+ SmallBitVector UndefElts(NumMaskElts, false);
+ SmallBitVector ZeroElts(NumMaskElts, false);
+ SmallBitVector ConstantElts(NumMaskElts, false);
+ SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
+ APInt::getNullValue(MaskSizeInBits));
+ for (unsigned i = 0; i != NumMaskElts; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef) {
+ UndefElts[i] = true;
+ continue;
+ } else if (M == SM_SentinelZero) {
+ ZeroElts[i] = true;
+ continue;
+ }
+ assert(0 <= M && M < (int)(NumMaskElts * NumOps));
+
+ unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
+ unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
+
+ auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
+ if (SrcUndefElts[SrcMaskIdx]) {
+ UndefElts[i] = true;
+ continue;
+ }
+
+ auto &SrcEltBits = RawBitsOps[SrcOpIdx];
+ APInt &Bits = SrcEltBits[SrcMaskIdx];
+ if (!Bits) {
+ ZeroElts[i] = true;
+ continue;
+ }
+
+ ConstantElts[i] = true;
+ ConstantBitData[i] = Bits;
+ }
+ assert((UndefElts | ZeroElts | ConstantElts).count() == NumMaskElts);
+
+ // Create the constant data.
+ MVT MaskSVT;
+ if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
+ MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
+ else
+ MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
+
+ MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
+
+ SDLoc DL(Root);
+ SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
+ DCI.AddToWorklist(CstOp.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp));
+ return true;
+}
+
/// \brief Fully generic combining of x86 shuffle instructions.
///
/// This should be the last combine run over the x86 shuffle instructions. Once
HasVariableMask, DAG, DCI, Subtarget))
return true;
+ // Attempt to constant fold all of the constant source ops.
+ if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
+ Subtarget))
+ return true;
+
// We can only combine unary and binary shuffle mask cases.
if (Ops.size() > 2)
return false;
define <2 x double> @constant_fold_vpermilvar_pd() {
; X32-LABEL: constant_fold_vpermilvar_pd:
; X32: # BB#0:
-; X32-NEXT: vpermilpd {{.*#+}} xmm0 = mem[1,0]
+; X32-NEXT: vmovaps {{.*#+}} xmm0 = [2.000000e+00,1.000000e+00]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_vpermilvar_pd:
; X64: # BB#0:
-; X64-NEXT: vpermilpd {{.*#+}} xmm0 = mem[1,0]
+; X64-NEXT: vmovaps {{.*#+}} xmm0 = [2.000000e+00,1.000000e+00]
; X64-NEXT: retq
%1 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> <double 1.0, double 2.0>, <2 x i64> <i64 2, i64 0>)
ret <2 x double> %1
define <4 x double> @constant_fold_vpermilvar_pd_256() {
; X32-LABEL: constant_fold_vpermilvar_pd_256:
; X32: # BB#0:
-; X32-NEXT: vpermilpd {{.*#+}} ymm0 = mem[1,0,2,3]
+; X32-NEXT: vmovaps {{.*#+}} ymm0 = [2.000000e+00,1.000000e+00,3.000000e+00,4.000000e+00]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_vpermilvar_pd_256:
; X64: # BB#0:
-; X64-NEXT: vpermilpd {{.*#+}} ymm0 = mem[1,0,2,3]
+; X64-NEXT: vmovaps {{.*#+}} ymm0 = [2.000000e+00,1.000000e+00,3.000000e+00,4.000000e+00]
; X64-NEXT: retq
%1 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> <double 1.0, double 2.0, double 3.0, double 4.0>, <4 x i64> <i64 2, i64 0, i64 0, i64 2>)
ret <4 x double> %1
define <4 x float> @constant_fold_vpermilvar_ps() {
; X32-LABEL: constant_fold_vpermilvar_ps:
; X32: # BB#0:
-; X32-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,0,2,1]
+; X32-NEXT: vmovaps {{.*#+}} xmm0 = [4.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_vpermilvar_ps:
; X64: # BB#0:
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,0,2,1]
+; X64-NEXT: vmovaps {{.*#+}} xmm0 = [4.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00]
; X64-NEXT: retq
%1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, <4 x i32> <i32 3, i32 0, i32 2, i32 1>)
ret <4 x float> %1
define <8 x float> @constant_fold_vpermilvar_ps_256() {
; X32-LABEL: constant_fold_vpermilvar_ps_256:
; X32: # BB#0:
-; X32-NEXT: vmovaps {{.*#+}} ymm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
-; X32-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,5,5]
+; X32-NEXT: vmovaps {{.*#+}} ymm0 = [1.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00,5.000000e+00,6.000000e+00,6.000000e+00,6.000000e+00]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_vpermilvar_ps_256:
; X64: # BB#0:
-; X64-NEXT: vmovaps {{.*#+}} ymm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
-; X64-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,5,5]
+; X64-NEXT: vmovaps {{.*#+}} ymm0 = [1.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00,5.000000e+00,6.000000e+00,6.000000e+00,6.000000e+00]
; X64-NEXT: retq
%1 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x i32> <i32 4, i32 0, i32 2, i32 1, i32 0, i32 1, i32 1, i32 1>)
ret <8 x float> %1
define <8 x i32> @constant_fold_permd() {
; X32-LABEL: constant_fold_permd:
; X32: # BB#0:
-; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [4,6,2,1,7,1,5,0]
-; X32-NEXT: vpermd {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_permd:
; X64: # BB#0:
-; X64-NEXT: vmovdqa {{.*#+}} ymm0 = [4,6,2,1,7,1,5,0]
-; X64-NEXT: vpermd {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1]
; X64-NEXT: retq
%1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>)
ret <8 x i32> %1
define <8 x float> @constant_fold_permps() {
; X32-LABEL: constant_fold_permps:
; X32: # BB#0:
-; X32-NEXT: vmovaps {{.*#+}} ymm0 = [4,6,2,1,7,1,5,0]
-; X32-NEXT: vpermps {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-NEXT: vmovaps {{.*#+}} ymm0 = [5.000000e+00,7.000000e+00,3.000000e+00,2.000000e+00,8.000000e+00,2.000000e+00,6.000000e+00,1.000000e+00]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_permps:
; X64: # BB#0:
-; X64-NEXT: vmovaps {{.*#+}} ymm0 = [4,6,2,1,7,1,5,0]
-; X64-NEXT: vpermps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vmovaps {{.*#+}} ymm0 = [5.000000e+00,7.000000e+00,3.000000e+00,2.000000e+00,8.000000e+00,2.000000e+00,6.000000e+00,1.000000e+00]
; X64-NEXT: retq
%1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>)
ret <8 x float> %1
define <32 x i8> @constant_fold_pshufb_256() {
; X32-LABEL: constant_fold_pshufb_256:
; X32: # BB#0:
-; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,0,255,254,253,252,251,250,249,248,247,246,245,244,243,242,241]
-; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,ymm0[u,u],zero,zero,ymm0[15],zero,zero,zero,zero,zero,ymm0[7,6,17],zero,zero,zero,ymm0[u,u],zero,zero,ymm0[31],zero,zero,zero,zero,zero,ymm0[23,22]
+; X32-NEXT: vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250>
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_pshufb_256:
; X64: # BB#0:
-; X64-NEXT: vmovdqa {{.*#+}} ymm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,0,255,254,253,252,251,250,249,248,247,246,245,244,243,242,241]
-; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,ymm0[u,u],zero,zero,ymm0[15],zero,zero,zero,zero,zero,ymm0[7,6,17],zero,zero,zero,ymm0[u,u],zero,zero,ymm0[31],zero,zero,zero,zero,zero,ymm0[23,22]
+; X64-NEXT: vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250>
; X64-NEXT: retq
%1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 -1, i8 -2, i8 -3, i8 -4, i8 -5, i8 -6, i8 -7, i8 -8, i8 -9, i8 -10, i8 -11, i8 -12, i8 -13, i8 -14, i8 -15>, <32 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6, i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6>)
ret <32 x i8> %1
define <16 x i8> @constant_fold_pshufb() {
; SSE-LABEL: constant_fold_pshufb:
; SSE: # BB#0:
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[u,u],zero,zero,xmm0[15],zero,zero,zero,zero,zero,xmm0[7,6]
+; SSE-NEXT: movaps {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9>
; SSE-NEXT: retq
;
; AVX-LABEL: constant_fold_pshufb:
; AVX: # BB#0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[u,u],zero,zero,xmm0[15],zero,zero,zero,zero,zero,xmm0[7,6]
+; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9>
; AVX-NEXT: retq
%1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6>)
ret <16 x i8> %1
define <2 x double> @constant_fold_vpermil2pd() {
; X32-LABEL: constant_fold_vpermil2pd:
; X32: # BB#0:
-; X32-NEXT: vmovapd {{.*#+}} xmm0 = [-2.000000e+00,-1.000000e+00]
-; X32-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
+; X32-NEXT: vmovaps {{.*#+}} xmm0 = [-2.000000e+00,2.000000e+00]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_vpermil2pd:
; X64: # BB#0:
-; X64-NEXT: vmovapd {{.*#+}} xmm0 = [-2.000000e+00,-1.000000e+00]
-; X64-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
+; X64-NEXT: vmovaps {{.*#+}} xmm0 = [-2.000000e+00,2.000000e+00]
; X64-NEXT: retq
%1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> <double 1.0, double 2.0>, <2 x double> <double -2.0, double -1.0>, <2 x i64> <i64 4, i64 2>, i8 2)
ret <2 x double> %1
define <4 x double> @constant_fold_vpermil2pd_256() {
; X32-LABEL: constant_fold_vpermil2pd_256:
; X32: # BB#0:
-; X32-NEXT: vmovapd {{.*#+}} ymm0 = [-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00]
-; X32-NEXT: vmovapd {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
-; X32-NEXT: vpermil2pd {{.*#+}} ymm0 = ymm0[0],zero,ymm1[3,2]
+; X32-NEXT: vmovaps {{.*#+}} ymm0 = [-4.000000e+00,0.000000e+00,4.000000e+00,3.000000e+00]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_vpermil2pd_256:
; X64: # BB#0:
-; X64-NEXT: vmovapd {{.*#+}} ymm0 = [-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00]
-; X64-NEXT: vmovapd {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
-; X64-NEXT: vpermil2pd {{.*#+}} ymm0 = ymm0[0],zero,ymm1[3,2]
+; X64-NEXT: vmovaps {{.*#+}} ymm0 = [-4.000000e+00,0.000000e+00,4.000000e+00,3.000000e+00]
; X64-NEXT: retq
%1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> <double 1.0, double 2.0, double 3.0, double 4.0>, <4 x double> <double -4.0, double -3.0, double -2.0, double -1.0>, <4 x i64> <i64 4, i64 8, i64 2, i64 0>, i8 2)
ret <4 x double> %1
define <4 x float> @constant_fold_vpermil2ps() {
; X32-LABEL: constant_fold_vpermil2ps:
; X32: # BB#0:
-; X32-NEXT: vmovaps {{.*#+}} xmm0 = [-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00]
-; X32-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
-; X32-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[0,2],zero
+; X32-NEXT: vmovaps {{.*#+}} xmm0 = [-4.000000e+00,1.000000e+00,3.000000e+00,0.000000e+00]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_vpermil2ps:
; X64: # BB#0:
-; X64-NEXT: vmovaps {{.*#+}} xmm0 = [-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00]
-; X64-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
-; X64-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[0,2],zero
+; X64-NEXT: vmovaps {{.*#+}} xmm0 = [-4.000000e+00,1.000000e+00,3.000000e+00,0.000000e+00]
; X64-NEXT: retq
%1 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, <4 x float> <float -4.0, float -3.0, float -2.0, float -1.0>, <4 x i32> <i32 4, i32 0, i32 2, i32 8>, i8 2)
ret <4 x float> %1
define <8 x float> @constant_fold_vpermil2ps_256() {
; X32-LABEL: constant_fold_vpermil2ps_256:
; X32: # BB#0:
-; X32-NEXT: vmovaps {{.*#+}} ymm0 = [-8.000000e+00,-7.000000e+00,-6.000000e+00,-5.000000e+00,-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00]
-; X32-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
-; X32-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm0[0],ymm1[0,2],zero,ymm1[4],zero,ymm1[4,6]
+; X32-NEXT: vmovaps {{.*#+}} ymm0 = [-8.000000e+00,1.000000e+00,3.000000e+00,0.000000e+00,5.000000e+00,0.000000e+00,5.000000e+00,7.000000e+00]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_vpermil2ps_256:
; X64: # BB#0:
-; X64-NEXT: vmovaps {{.*#+}} ymm0 = [-8.000000e+00,-7.000000e+00,-6.000000e+00,-5.000000e+00,-4.000000e+00,-3.000000e+00,-2.000000e+00,-1.000000e+00]
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
-; X64-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm0[0],ymm1[0,2],zero,ymm1[4],zero,ymm1[4,6]
+; X64-NEXT: vmovaps {{.*#+}} ymm0 = [-8.000000e+00,1.000000e+00,3.000000e+00,0.000000e+00,5.000000e+00,0.000000e+00,5.000000e+00,7.000000e+00]
; X64-NEXT: retq
%1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x float> <float -8.0, float -7.0, float -6.0, float -5.0, float -4.0, float -3.0, float -2.0, float -1.0>, <8 x i32> <i32 4, i32 0, i32 2, i32 8, i32 0, i32 8, i32 0, i32 2>, i8 2)
ret <8 x float> %1
define <16 x i8> @constant_fold_vpperm() {
; X32-LABEL: constant_fold_vpperm:
; X32: # BB#0:
-; X32-NEXT: vmovdqa {{.*#+}} xmm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; X32-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,254,253,252,251,250,249,248,247,246,245,244,243,242,241]
-; X32-NEXT: vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X32-NEXT: vmovaps {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_vpperm:
; X64: # BB#0:
-; X64-NEXT: vmovdqa {{.*#+}} xmm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; X64-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,254,253,252,251,250,249,248,247,246,245,244,243,242,241]
-; X64-NEXT: vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X64-NEXT: vmovaps {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; X64-NEXT: retq
%1 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> <i8 0, i8 -1, i8 -2, i8 -3, i8 -4, i8 -5, i8 -6, i8 -7, i8 -8, i8 -9, i8 -10, i8 -11, i8 -12, i8 -13, i8 -14, i8 -15>, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> <i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16>)
ret <16 x i8> %1
; X32-NEXT: xorps %xmm0, %xmm0
; X32-NEXT: movlps %xmm0, (%esp)
; X32-NEXT: movq (%esp), %mm0
-; X32-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
-; X32-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: movsd %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT: movq {{[0-9]+}}(%esp), %mm1
; X32-NEXT: xorl %edi, %edi
; X32-NEXT: maskmovq %mm1, %mm0
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: movlps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
-; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq {{.*}}(%rip), %rax
+; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %mm1
; X64-NEXT: xorl %edi, %edi
; X64-NEXT: maskmovq %mm1, %mm0
define void @test(<4 x i16>* %a, <4 x i16>* %b) {
; AVX-LABEL: test:
; AVX: ## BB#0: ## %body
-; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [65533,124,125,14807]
-; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vmovq %xmm1, (%rdi)
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,65535]
-; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vmovq %xmm0, (%rsi)
+; AVX-NEXT: movq {{.*}}(%rip), %rax
+; AVX-NEXT: movq %rax, (%rdi)
+; AVX-NEXT: movq {{.*}}(%rip), %rax
+; AVX-NEXT: movq %rax, (%rsi)
; AVX-NEXT: retq
body:
%predphi = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i16> <i16 -3, i16 545, i16 4385, i16 14807>, <4 x i16> <i16 123, i16 124, i16 125, i16 127>
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqa {{.*#+}} xmm0 = <0,4,8,128,u,u,u,u,u,u,u,u,u,u,u,u>
-; X86-NEXT: movdqa {{.*#+}} xmm1 = <158,158,158,u>
-; X86-NEXT: pshufb %xmm0, %xmm1
-; X86-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; X86-NEXT: pextrw $0, %xmm1, (%edx)
+; X86-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X86-NEXT: pextrw $0, %xmm0, (%edx)
; X86-NEXT: movb $-98, 2(%edx)
-; X86-NEXT: movdqa {{.*#+}} xmm1 = <1,1,1,u>
-; X86-NEXT: pshufb %xmm0, %xmm1
-; X86-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; X86-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
; X86-NEXT: pextrw $0, %xmm0, (%ecx)
; X86-NEXT: movb $1, 2(%ecx)
; X86-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
;
; X64-LABEL: rot:
; X64: # BB#0: # %entry
-; X64-NEXT: movdqa {{.*#+}} xmm0 = <0,4,8,128,u,u,u,u,u,u,u,u,u,u,u,u>
-; X64-NEXT: movdqa {{.*#+}} xmm1 = <158,158,158,u>
-; X64-NEXT: pshufb %xmm0, %xmm1
-; X64-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; X64-NEXT: pextrw $0, %xmm1, (%rsi)
+; X64-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X64-NEXT: pextrw $0, %xmm0, (%rsi)
; X64-NEXT: movb $-98, 2(%rsi)
-; X64-NEXT: movdqa {{.*#+}} xmm1 = <1,1,1,u>
-; X64-NEXT: pshufb %xmm0, %xmm1
-; X64-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; X64-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
; X64-NEXT: pextrw $0, %xmm0, (%rdx)
; X64-NEXT: movb $1, 2(%rdx)
; X64-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-LABEL: shuf5:
; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movdqa {{.*#+}} xmm0 = [33,33,33,33,33,33,33,33]
-; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; X86-NEXT: movq %xmm0, (%eax)
+; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: movsd %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: shuf5:
; X64: # BB#0:
-; X64-NEXT: movdqa {{.*#+}} xmm0 = [33,33,33,33,33,33,33,33]
-; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; X64-NEXT: movq %xmm0, (%rdi)
+; X64-NEXT: movq {{.*}}(%rip), %rax
+; X64-NEXT: movq %rax, (%rdi)
; X64-NEXT: retq
%v = shufflevector <2 x i8> <i8 4, i8 33>, <2 x i8> undef, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
store <8 x i8> %v, <8 x i8>* %p, align 8