/// ((v0+v2), (v1+v3), undef, undef)
int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) const;
- /// \returns The cost of Intrinsic instructions. Types analysis only.
+ /// \returns The cost of Intrinsic instructions. Analyses the real arguments.
+ /// Three cases are handled: 1. a scalar instruction, 2. a vector
+ /// instruction, and 3. a scalar instruction that is to be vectorized
+ /// with factor VF.
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Type *> Tys, FastMathFlags FMF) const;
+ ArrayRef<Value *> Args, FastMathFlags FMF,
+ unsigned VF = 1) const;
- /// \returns The cost of Intrinsic instructions. Analyses the real arguments.
+ /// \returns The cost of Intrinsic instructions. Types analysis only.
+ /// If ScalarizationCostPassed is UINT_MAX, the cost of scalarizing the
+ /// arguments and the return value will be computed based on types.
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Value *> Args, FastMathFlags FMF) const;
+ ArrayRef<Type *> Tys, FastMathFlags FMF,
+ unsigned ScalarizationCostPassed = UINT_MAX) const;
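A minimal sketch of the two call styles (hypothetical client code, not part of
the patch): a vectorizer passes the scalar return type, the actual arguments,
and the vectorization factor, while a caller that already holds vector types
passes them directly and leaves ScalarizationCostPassed at its UINT_MAX
default so the overhead is derived from the types:

  // Vectorizer-style query: scalar call, to be widened by VF = 4.
  SmallVector<Value *, 4> Args(CI->arg_operands());
  int VecCost = TTI.getIntrinsicInstrCost(ID, CI->getType(), Args, FMF, 4);

  // Type-based query: vector return and operand types already in hand.
  int Cost = TTI.getIntrinsicInstrCost(ID, VecRetTy, VecTys, FMF);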
/// \returns The cost of Call instructions.
int getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) const;
virtual int getReductionCost(unsigned Opcode, Type *Ty,
bool IsPairwiseForm) = 0;
virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Type *> Tys,
- FastMathFlags FMF) = 0;
+ ArrayRef<Type *> Tys, FastMathFlags FMF,
+ unsigned ScalarizationCostPassed) = 0;
virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Value *> Args,
- FastMathFlags FMF) = 0;
+ ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) = 0;
virtual int getCallInstrCost(Function *F, Type *RetTy,
ArrayRef<Type *> Tys) = 0;
virtual unsigned getNumberOfParts(Type *Tp) = 0;
return Impl.getReductionCost(Opcode, Ty, IsPairwiseForm);
}
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef<Type *> Tys,
- FastMathFlags FMF) override {
- return Impl.getIntrinsicInstrCost(ID, RetTy, Tys, FMF);
+ FastMathFlags FMF, unsigned ScalarizationCostPassed) override {
+ return Impl.getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
+ ScalarizationCostPassed);
}
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Value *> Args,
- FastMathFlags FMF) override {
- return Impl.getIntrinsicInstrCost(ID, RetTy, Args, FMF);
+ ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) override {
+ return Impl.getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
}
int getCallInstrCost(Function *F, Type *RetTy,
ArrayRef<Type *> Tys) override {
}
unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Type *> Tys, FastMathFlags FMF) {
+ ArrayRef<Type *> Tys, FastMathFlags FMF,
+ unsigned ScalarizationCostPassed) {
return 1;
}
unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Value *> Args, FastMathFlags FMF) {
+ ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) {
return 1;
}
return Cost;
}
- /// Estimate the overhead of scalarizing an instructions unique operands.
+ /// Estimate the overhead of scalarizing an instruction's unique
+ /// non-constant operands. The types of the arguments are ordinarily
+ /// scalar, in which case the costs are multiplied by VF. Vector
+ /// arguments are allowed only if VF is 1.
unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
unsigned VF) {
unsigned Cost = 0;
SmallPtrSet<const Value*, 4> UniqueOperands;
for (const Value *A : Args) {
- if (UniqueOperands.insert(A).second)
- Cost += getScalarizationOverhead(VectorType::get(A->getType(), VF),
- false, true);
+ if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
+ Type *VecTy = nullptr;
+ if (A->getType()->isVectorTy()) {
+ assert(VF == 1 && "Vector argument passed with VF > 1");
+ VecTy = A->getType();
+ } else
+ VecTy = VectorType::get(A->getType(), VF);
+
+ Cost += getScalarizationOverhead(VecTy, false, true);
+ }
}
+
return Cost;
}
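As a concrete illustration (hypothetical values): with VF = 4 and
Args = {%a, %a, float 2.0}, only the unique non-constant operand %a is
counted, so the loop above reduces to a single query

  Cost += getScalarizationOverhead(VectorType::get(A->getType(), 4),
                                   /*Insert=*/false, /*Extract=*/true);

while the duplicate use and the constant contribute nothing.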
return Cost;
}
- /// Get intrinsic cost based on arguments
+ /// Get intrinsic cost based on arguments.
unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Value *> Args, FastMathFlags FMF) {
+ ArrayRef<Value *> Args, FastMathFlags FMF,
+ unsigned VF = 1) {
+ unsigned RetVF = (RetTy->isVectorTy() ? RetTy->getVectorNumElements() : 1);
+ assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
+
switch (IID) {
default: {
+ // Assume that we need to scalarize this intrinsic.
SmallVector<Type *, 4> Types;
- for (Value *Op : Args)
- Types.push_back(Op->getType());
- return static_cast<T *>(this)->getIntrinsicInstrCost(IID, RetTy, Types,
- FMF);
+ for (Value *Op : Args) {
+ Type *OpTy = Op->getType();
+ assert((VF == 1 || !OpTy->isVectorTy()) && "Vector operand with VF > 1");
+ Types.push_back(VF == 1 ? OpTy : VectorType::get(OpTy, VF));
+ }
+
+ if (VF > 1 && !RetTy->isVoidTy())
+ RetTy = VectorType::get(RetTy, VF);
+
+ // Compute the scalarization overhead based on Args for a vector
+ // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
+ // the cost model will pass a vector RetTy with VF = 1.
+ unsigned ScalarizationCost = UINT_MAX;
+ if (RetVF > 1 || VF > 1) {
+ ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
+ ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
+ }
+
+ return static_cast<T *>(this)->
+ getIntrinsicInstrCost(IID, RetTy, Types, FMF, ScalarizationCost);
}
case Intrinsic::masked_scatter: {
+ assert(VF == 1 && "Can't vectorize types here.");
Value *Mask = Args[3];
bool VarMask = !isa<Constant>(Mask);
unsigned Alignment = cast<ConstantInt>(Args[2])->getZExtValue();
Alignment);
}
case Intrinsic::masked_gather: {
+ assert(VF == 1 && "Can't vectorize types here.");
Value *Mask = Args[2];
bool VarMask = !isa<Constant>(Mask);
unsigned Alignment = cast<ConstantInt>(Args[1])->getZExtValue();
}
}
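For example (an illustrative walk through the default case, with assumed
inputs): costing a scalar call to llvm.sqrt.f32 at VF = 4 widens the operand
types and RetTy to <4 x float>, precomputes the insert/extract overhead from
the real arguments, and forwards it to the type-based overload:

  // Effective expansion for IID = Intrinsic::sqrt, RetTy = float, VF = 4.
  unsigned ScalarizationCost =
      getScalarizationOverhead(VectorType::get(RetTy, 4), true, false) +
      getOperandsScalarizationOverhead(Args, 4);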
- /// Get intrinsic cost based on argument types
+ /// Get intrinsic cost based on argument types.
+ /// If ScalarizationCostPassed is UINT_MAX, the cost of scalarizing the
+ /// arguments and the return value will be computed based on types.
unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Type *> Tys, FastMathFlags FMF) {
+ ArrayRef<Type *> Tys, FastMathFlags FMF,
+ unsigned ScalarizationCostPassed = UINT_MAX) {
SmallVector<unsigned, 2> ISDs;
unsigned SingleCallCost = 10; // Library call cost. Make it expensive.
switch (IID) {
default: {
// Assume that we need to scalarize this intrinsic.
- unsigned ScalarizationCost = 0;
+ unsigned ScalarizationCost = ScalarizationCostPassed;
unsigned ScalarCalls = 1;
Type *ScalarRetTy = RetTy;
if (RetTy->isVectorTy()) {
- ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
+ if (ScalarizationCostPassed == UINT_MAX)
+ ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
ScalarRetTy = RetTy->getScalarType();
}
for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
Type *Ty = Tys[i];
if (Ty->isVectorTy()) {
- ScalarizationCost += getScalarizationOverhead(Ty, false, true);
+ if (ScalarizationCostPassed == UINT_MAX)
+ ScalarizationCost += getScalarizationOverhead(Ty, false, true);
ScalarCalls = std::max(ScalarCalls, Ty->getVectorNumElements());
Ty = Ty->getScalarType();
}
// this will emit a costly libcall, adding call overhead and spills. Make it
// very expensive.
if (RetTy->isVectorTy()) {
- unsigned ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
+ unsigned ScalarizationCost = ((ScalarizationCostPassed != UINT_MAX) ?
+ ScalarizationCostPassed : getScalarizationOverhead(RetTy, true, false));
unsigned ScalarCalls = RetTy->getVectorNumElements();
SmallVector<Type *, 4> ScalarTys;
for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
IID, RetTy->getScalarType(), ScalarTys, FMF);
for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
if (Tys[i]->isVectorTy()) {
- ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
+ if (ScalarizationCostPassed == UINT_MAX)
+ ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
ScalarCalls = std::max(ScalarCalls, Tys[i]->getVectorNumElements());
}
}
}
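A short sketch of the contract (hypothetical numbers): a caller that has
already computed the scalarization overhead from the real operands, as the
pairing code later in this patch does, passes it through and the type-based
recomputation above is skipped:

  unsigned Cost = getIntrinsicInstrCost(IID, VecRetTy, VecTys, FMF,
                                        /*ScalarizationCostPassed=*/12);

With the UINT_MAX default, behaviour is unchanged and the overhead is
derived from RetTy and Tys as before.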
case Instruction::Call:
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- SmallVector<Value *, 4> Args;
- for (unsigned J = 0, JE = II->getNumArgOperands(); J != JE; ++J)
- Args.push_back(II->getArgOperand(J));
+ SmallVector<Value *, 4> Args(II->arg_operands());
FastMathFlags FMF;
if (auto *FPMO = dyn_cast<FPMathOperator>(II))
}
int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Type *> Tys,
- FastMathFlags FMF) const {
- int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Tys, FMF);
+ ArrayRef<Type *> Tys, FastMathFlags FMF,
+ unsigned ScalarizationCostPassed) const {
+ int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
+ ScalarizationCostPassed);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Value *> Args,
- FastMathFlags FMF) const {
- int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args, FMF);
+ ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) const {
+ int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
}
int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Type *> Tys, FastMathFlags FMF) {
+ ArrayRef<Type *> Tys, FastMathFlags FMF,
+ unsigned ScalarizationCostPassed) {
// Costs should match the codegen from:
// BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
// BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
return LT.first * Entry->Cost;
- return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF);
+ return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF,
+ ScalarizationCostPassed);
}
int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Value *> Args, FastMathFlags FMF) {
- return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF);
+ ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) {
+ return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
}
int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
const SCEV *Ptr);
int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Type *> Tys, FastMathFlags FMF);
+ ArrayRef<Type *> Tys, FastMathFlags FMF,
+ unsigned ScalarizationCostPassed = UINT_MAX);
int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Value *> Args, FastMathFlags FMF);
+ ArrayRef<Value *> Args, FastMathFlags FMF,
+ unsigned VF = 1);
int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm);
FastMathFlags FMFCI;
if (auto *FPMOCI = dyn_cast<FPMathOperator>(CI))
FMFCI = FPMOCI->getFastMathFlags();
- SmallVector<Type*, 4> Tys;
- for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
- Tys.push_back(CI->getArgOperand(i)->getType());
- unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, Tys, FMFCI);
-
- Tys.clear();
+ SmallVector<Value *, 4> IArgs(CI->arg_operands());
+ unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, IArgs, FMFCI);
CallInst *CJ = cast<CallInst>(J);
FastMathFlags FMFCJ;
if (auto *FPMOCJ = dyn_cast<FPMathOperator>(CJ))
FMFCJ = FPMOCJ->getFastMathFlags();
- for (unsigned i = 0, ie = CJ->getNumArgOperands(); i != ie; ++i)
- Tys.push_back(CJ->getArgOperand(i)->getType());
- unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, Tys, FMFCJ);
- Tys.clear();
+ SmallVector<Value *, 4> JArgs(CJ->arg_operands());
+ unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, JArgs, FMFCJ);
assert(CI->getNumArgOperands() == CJ->getNumArgOperands() &&
"Intrinsic argument counts differ");
+ SmallVector<Type*, 4> Tys;
+ SmallVector<Value *, 4> VecArgs;
for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
- IID == Intrinsic::cttz) && i == 1)
+ IID == Intrinsic::cttz) && i == 1) {
Tys.push_back(CI->getArgOperand(i)->getType());
- else
+ VecArgs.push_back(CI->getArgOperand(i));
+ } else {
Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(),
CJ->getArgOperand(i)->getType()));
+ // Add both operands, and then count their scalarization overhead
+ // with VF 1.
+ VecArgs.push_back(CI->getArgOperand(i));
+ VecArgs.push_back(CJ->getArgOperand(i));
+ }
}
+ // Compute the scalarization cost here with the original operands (to
+ // check for uniqueness etc.), and then call getIntrinsicInstrCost()
+ // with the constructed vector types.
+ Type *RetTy = getVecTypeForPair(IT1, JT1);
+ unsigned ScalarizationCost = 0;
+ if (!RetTy->isVoidTy())
+ ScalarizationCost += TTI->getScalarizationOverhead(RetTy, true, false);
+ ScalarizationCost += TTI->getOperandsScalarizationOverhead(VecArgs, 1);
+
FastMathFlags FMFV = FMFCI;
FMFV &= FMFCJ;
- Type *RetTy = getVecTypeForPair(IT1, JT1);
- unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys, FMFV);
+ unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys, FMFV,
+ ScalarizationCost);
if (VCost > ICost + JCost)
return false;
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
assert(ID && "Expected intrinsic call!");
- Type *RetTy = ToVectorTy(CI->getType(), VF);
- SmallVector<Type *, 4> Tys;
- for (Value *ArgOperand : CI->arg_operands())
- Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
-
FastMathFlags FMF;
if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
FMF = FPMO->getFastMathFlags();
- return TTI.getIntrinsicInstrCost(ID, RetTy, Tys, FMF);
+ SmallVector<Value *, 4> Operands(CI->arg_operands());
+ return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
}
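The practical effect (an illustrative note with an assumed snippet): because
getOperandsScalarizationOverhead skips constant and duplicate operands, a
call whose second operand is a constant is now costed without a per-lane
extract for that operand:

  %r = call float @llvm.powi.f32(float %x, i32 3)
  ; at VF > 1, only %x adds operand scalarization overhead

Passing the operands instead of precomputed vector types makes this
information visible to the cost computation.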
static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
- SmallVector<Type*, 4> ScalarTys, VecTys;
- for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
+ SmallVector<Type*, 4> ScalarTys;
+ for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op)
ScalarTys.push_back(CI->getArgOperand(op)->getType());
- VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
- VecTy->getNumElements()));
- }
FastMathFlags FMF;
if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
int ScalarCallCost = VecTy->getNumElements() *
TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
- int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);
+ SmallVector<Value *, 4> Args(CI->arg_operands());
+ int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
+ VecTy->getNumElements());
DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
<< " (" << VecCallCost << "-" << ScalarCallCost << ")"
; AVX2: cost of 1 {{.*}} %F32 = call float @llvm.fma.f32
; AVX512: cost of 1 {{.*}} %F32 = call float @llvm.fma.f32
%F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
- ; SSE2: cost of 52 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
- ; SSE42: cost of 52 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
+ ; SSE2: cost of 43 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
+ ; SSE42: cost of 43 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
; AVX: cost of 1 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
; AVX2: cost of 1 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
; AVX512: cost of 1 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
%V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
- ; SSE2: cost of 104 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
- ; SSE42: cost of 104 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
+ ; SSE2: cost of 86 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
+ ; SSE42: cost of 86 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
; AVX: cost of 1 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
; AVX2: cost of 1 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
; AVX512: cost of 1 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
%V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
- ; SSE2: cost of 208 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
- ; SSE42: cost of 208 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
+ ; SSE2: cost of 172 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
+ ; SSE42: cost of 172 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
; AVX: cost of 4 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
; AVX2: cost of 4 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
; AVX512: cost of 1 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
; AVX2: cost of 1 {{.*}} %F64 = call double @llvm.fma.f64
; AVX512: cost of 1 {{.*}} %F64 = call double @llvm.fma.f64
%F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
- ; SSE2: cost of 24 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
- ; SSE42: cost of 24 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
+ ; SSE2: cost of 21 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
+ ; SSE42: cost of 21 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
; AVX: cost of 1 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
; AVX2: cost of 1 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
; AVX512: cost of 1 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
%V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
- ; SSE2: cost of 48 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
- ; SSE42: cost of 48 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
+ ; SSE2: cost of 42 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
+ ; SSE42: cost of 42 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
; AVX: cost of 1 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
; AVX2: cost of 1 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
; AVX512: cost of 1 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
%V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
- ; SSE2: cost of 96 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
- ; SSE42: cost of 96 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
+ ; SSE2: cost of 84 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
+ ; SSE42: cost of 84 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
; AVX: cost of 4 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
; AVX2: cost of 4 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
; AVX512: cost of 1 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
; VF_2-LABEL: Checking a loop in "i64_factor_8"
; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 10 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 10 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 2
; VF_4: Found an estimated cost of 40 for VF 4 For instruction: %tmp2 = load half, half* %tmp0, align 2
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, half* %tmp1, align 2
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2
-; VF_4-NEXT: Found an estimated cost of 40 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_4-NEXT: Found an estimated cost of 32 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
; VF_8-LABEL: Checking a loop in "half_factor_2"
; VF_8: Found an estimated cost of 80 for VF 8 For instruction: %tmp2 = load half, half* %tmp0, align 2
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, half* %tmp1, align 2
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2
-; VF_8-NEXT: Found an estimated cost of 80 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_8-NEXT: Found an estimated cost of 64 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 0