From 85dd82a95b6acf091b827e66de24a523bfc96c02 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson
Date: Tue, 14 Mar 2017 06:35:36 +0000
Subject: [PATCH] [TargetTransformInfo] getIntrinsicInstrCost() scalarization
 estimation improved

getIntrinsicInstrCost() used to compute the scalarization cost based on
types only. This patch improves it so that the actual arguments are checked
when they are available, so that only unique, non-constant operands are
counted.

Test updates:
  Analysis/CostModel/X86/arith-fp.ll
  Transforms/LoopVectorize/AArch64/interleaved_cost.ll
  Transforms/LoopVectorize/ARM/interleaved_cost.ll

The improvement in getOperandsScalarizationOverhead() to differentiate on
constants made it necessary to update the interleaved_cost.ll tests even
though they do not relate to intrinsics.

Review: Hal Finkel
https://reviews.llvm.org/D29540

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@297705 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/TargetTransformInfo.h   | 31 ++++----
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  5 +-
 include/llvm/CodeGen/BasicTTIImpl.h           | 76 ++++++++++++++----
 lib/Analysis/CostModel.cpp                    |  4 +-
 lib/Analysis/TargetTransformInfo.cpp          | 12 +--
 lib/Target/X86/X86TargetTransformInfo.cpp     |  9 ++-
 lib/Target/X86/X86TargetTransformInfo.h       |  6 +-
 lib/Transforms/Vectorize/BBVectorize.cpp      | 40 ++++++----
 lib/Transforms/Vectorize/LoopVectorize.cpp    |  8 +-
 lib/Transforms/Vectorize/SLPVectorizer.cpp    | 11 ++-
 test/Analysis/CostModel/X86/arith-fp.ll       | 24 +++---
 .../LoopVectorize/AArch64/interleaved_cost.ll |  4 +-
 .../LoopVectorize/ARM/interleaved_cost.ll     |  4 +-
 13 files changed, 145 insertions(+), 89 deletions(-)

diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index f947c46268c..2f4f3362c26 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -628,13 +628,19 @@ public:
   /// ((v0+v2), (v1+v3), undef, undef)
   int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) const;
 
-  /// \returns The cost of Intrinsic instructions. Types analysis only.
+  /// \returns The cost of Intrinsic instructions. Analyses the real arguments.
+  /// Three cases are handled: 1. scalar instruction 2. vector instruction
+  /// 3. scalar instruction which is to be vectorized with VF.
   int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                            ArrayRef<Type *> Tys, FastMathFlags FMF) const;
+                            ArrayRef<Value *> Args, FastMathFlags FMF,
+                            unsigned VF = 1) const;
 
-  /// \returns The cost of Intrinsic instructions. Analyses the real arguments.
+  /// \returns The cost of Intrinsic instructions. Types analysis only.
+  /// If ScalarizationCostPassed is UINT_MAX, the cost of scalarizing the
+  /// arguments and the return value will be computed based on types.
   int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                            ArrayRef<Value *> Args, FastMathFlags FMF) const;
+                      ArrayRef<Type *> Tys, FastMathFlags FMF,
+                      unsigned ScalarizationCostPassed = UINT_MAX) const;
 
   /// \returns The cost of Call instructions.
   int getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) const;
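The API change above splits the two overloads by caller intent: passing the
actual IR operands (plus an optional VF) lets the implementation skip
duplicated and constant operands, while the types-only overload can now be
handed a precomputed scalarization cost instead of deriving one from the
types. For illustration, a hypothetical pair of call sites (a sketch only,
not code from this patch; TTI, CI, ID, FMF, RetTy and Tys stand for a
TargetTransformInfo reference, a CallInst, its intrinsic ID, its fast-math
flags, and caller-built types):

    // Argument-based query: cost of executing the scalar call CI with VF
    // lanes, judged on its actual operands.
    SmallVector<Value *, 4> Ops(CI->arg_operands());
    int WideCost = TTI.getIntrinsicInstrCost(ID, CI->getType(), Ops, FMF,
                                             /*VF=*/4);

    // Types-based query: scalarization overhead is estimated from RetTy/Tys,
    // since no precomputed cost is passed (ScalarizationCostPassed defaults
    // to UINT_MAX).
    int TypeCost = TTI.getIntrinsicInstrCost(ID, RetTy, Tys, FMF);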
@@ -828,11 +834,10 @@ public:
   virtual int getReductionCost(unsigned Opcode, Type *Ty,
                                bool IsPairwiseForm) = 0;
   virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                                    ArrayRef<Type *> Tys,
-                                    FastMathFlags FMF) = 0;
+                                    ArrayRef<Type *> Tys, FastMathFlags FMF,
+                                    unsigned ScalarizationCostPassed) = 0;
   virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                                    ArrayRef<Value *> Args,
-                                    FastMathFlags FMF) = 0;
+         ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) = 0;
   virtual int getCallInstrCost(Function *F, Type *RetTy,
                                ArrayRef<Type *> Tys) = 0;
   virtual unsigned getNumberOfParts(Type *Tp) = 0;
@@ -1086,13 +1091,13 @@ public:
     return Impl.getReductionCost(Opcode, Ty, IsPairwiseForm);
   }
   int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef<Type *> Tys,
-                            FastMathFlags FMF) override {
-    return Impl.getIntrinsicInstrCost(ID, RetTy, Tys, FMF);
+               FastMathFlags FMF, unsigned ScalarizationCostPassed) override {
+    return Impl.getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
+                                      ScalarizationCostPassed);
   }
   int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                            ArrayRef<Value *> Args,
-                            FastMathFlags FMF) override {
-    return Impl.getIntrinsicInstrCost(ID, RetTy, Args, FMF);
+       ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) override {
+    return Impl.getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
   }
   int getCallInstrCost(Function *F, Type *RetTy,
                        ArrayRef<Type *> Tys) override {
diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h
index df503449cf9..2455b582496 100644
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -369,11 +369,12 @@ public:
   }
 
   unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                                 ArrayRef<Type *> Tys, FastMathFlags FMF) {
+                                 ArrayRef<Type *> Tys, FastMathFlags FMF,
+                                 unsigned ScalarizationCostPassed) {
     return 1;
   }
   unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                                 ArrayRef<Value *> Args, FastMathFlags FMF) {
+      ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) {
     return 1;
   }
 
diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h
index ba774b5d160..039b4115c92 100644
--- a/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/include/llvm/CodeGen/BasicTTIImpl.h
@@ -306,16 +306,28 @@ public:
     return Cost;
   }
 
-  /// Estimate the overhead of scalarizing an instructions unique operands.
+  /// Estimate the overhead of scalarizing an instruction's unique
+  /// non-constant operands. The types of the arguments are ordinarily
+  /// scalar, in which case the costs are multiplied with VF. Vector
+  /// arguments are allowed if 1 is passed for VF.
  unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                            unsigned VF) {
    unsigned Cost = 0;
    SmallPtrSet<const Value *, 4> UniqueOperands;
    for (const Value *A : Args) {
-      if (UniqueOperands.insert(A).second)
-        Cost += getScalarizationOverhead(VectorType::get(A->getType(), VF),
-                                         false, true);
+      if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
+        Type *VecTy = nullptr;
+        if (A->getType()->isVectorTy()) {
+          assert (VF == 1 && "Vector argument passed with VF > 1");
+          VecTy = A->getType();
+        }
+        else
+          VecTy = VectorType::get(A->getType(), VF);
+
+        Cost += getScalarizationOverhead(VecTy, false, true);
+      }
    }
+
    return Cost;
  }
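The effect of the uniqueness and constant checks can be seen on a small
assumed example: for a scalar call such as
%r = call float @llvm.pow.f32(float %x, float %x), costed with VF = 4, only
the first occurrence of %x is charged the extraction overhead of a
<4 x float> operand; the repeated use adds nothing, and a constant operand
would be skipped entirely, since a constant needs no extraction to
scalarize. A sketch, in the context of a BasicTTIImplBase subclass (X stands
for the IR value %x):

    const Value *Ops[] = {X, X};  // both operands are the same non-constant
    unsigned Overhead = getOperandsScalarizationOverhead(Ops, /*VF=*/4);
    // Counted once, i.e. equivalent to:
    //   getScalarizationOverhead(VectorType::get(X->getType(), 4),
    //                            /*Insert=*/false, /*Extract=*/true)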
@@ -705,18 +717,40 @@ public:
     return Cost;
   }
 
-  /// Get intrinsic cost based on arguments
+  /// Get intrinsic cost based on arguments.
   unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                                 ArrayRef<Value *> Args, FastMathFlags FMF) {
+                                 ArrayRef<Value *> Args, FastMathFlags FMF,
+                                 unsigned VF = 1) {
+    unsigned RetVF = (RetTy->isVectorTy() ? RetTy->getVectorNumElements() : 1);
+    assert ((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
+
     switch (IID) {
     default: {
+      // Assume that we need to scalarize this intrinsic.
       SmallVector<Type *, 4> Types;
-      for (Value *Op : Args)
-        Types.push_back(Op->getType());
-      return static_cast<T *>(this)->getIntrinsicInstrCost(IID, RetTy, Types,
-                                                           FMF);
+      for (Value *Op : Args) {
+        Type *OpTy = Op->getType();
+        assert (VF == 1 || !OpTy->isVectorTy());
+        Types.push_back(VF == 1 ? OpTy : VectorType::get(OpTy, VF));
+      }
+
+      if (VF > 1 && !RetTy->isVoidTy())
+        RetTy = VectorType::get(RetTy, VF);
+
+      // Compute the scalarization overhead based on Args for a vector
+      // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
+      // CostModel will pass a vector RetTy and VF is 1.
+      unsigned ScalarizationCost = UINT_MAX;
+      if (RetVF > 1 || VF > 1) {
+        ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
+        ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
+      }
+
+      return static_cast<T *>(this)->
+        getIntrinsicInstrCost(IID, RetTy, Types, FMF, ScalarizationCost);
     }
     case Intrinsic::masked_scatter: {
+      assert (VF == 1 && "Can't vectorize types here.");
       Value *Mask = Args[3];
       bool VarMask = !isa<Constant>(Mask);
       unsigned Alignment = cast<ConstantInt>(Args[2])->getZExtValue();
@@ -727,6 +761,7 @@ public:
                                                        Alignment);
     }
     case Intrinsic::masked_gather: {
+      assert (VF == 1 && "Can't vectorize types here.");
       Value *Mask = Args[2];
       bool VarMask = !isa<Constant>(Mask);
       unsigned Alignment = cast<ConstantInt>(Args[1])->getZExtValue();
@@ -738,19 +773,23 @@ public:
     }
   }
 
-  /// Get intrinsic cost based on argument types
+  /// Get intrinsic cost based on argument types.
+  /// If ScalarizationCostPassed is UINT_MAX, the cost of scalarizing the
+  /// arguments and the return value will be computed based on types.
   unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                                 ArrayRef<Type *> Tys, FastMathFlags FMF) {
+           ArrayRef<Type *> Tys, FastMathFlags FMF,
+           unsigned ScalarizationCostPassed = UINT_MAX) {
     SmallVector<unsigned, 2> ISDs;
     unsigned SingleCallCost = 10; // Library call cost. Make it expensive.
     switch (IID) {
     default: {
       // Assume that we need to scalarize this intrinsic.
-      unsigned ScalarizationCost = 0;
+      unsigned ScalarizationCost = ScalarizationCostPassed;
       unsigned ScalarCalls = 1;
       Type *ScalarRetTy = RetTy;
       if (RetTy->isVectorTy()) {
-        ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
+        if (ScalarizationCostPassed == UINT_MAX)
+          ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
         ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
         ScalarRetTy = RetTy->getScalarType();
       }
@@ -758,7 +797,8 @@ public:
       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
         Type *Ty = Tys[i];
         if (Ty->isVectorTy()) {
-          ScalarizationCost += getScalarizationOverhead(Ty, false, true);
+          if (ScalarizationCostPassed == UINT_MAX)
+            ScalarizationCost += getScalarizationOverhead(Ty, false, true);
           ScalarCalls = std::max(ScalarCalls, Ty->getVectorNumElements());
           Ty = Ty->getScalarType();
         }
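To make the default case concrete, a hypothetical walk-through (the call and
all values are assumed for illustration): costing the scalar call
%r = call float @llvm.sin.f32(float %x) at VF = 4 through the argument-based
overload

    SmallVector<Value *, 1> Args(CI->arg_operands()); // { %x }
    unsigned Cost = TTI.getIntrinsicInstrCost(Intrinsic::sin, CI->getType(),
                                              Args, FMF, /*VF=*/4);

widens Types to { <4 x float> } and RetTy to <4 x float>, precomputes
ScalarizationCost as getScalarizationOverhead(<4 x float>, true, false) plus
getOperandsScalarizationOverhead({%x}, 4), and forwards that cost to the
types-based overload so it is not re-derived from the vector types.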
+          ScalarizationCostPassed : getScalarizationOverhead(RetTy, true, false));
       unsigned ScalarCalls = RetTy->getVectorNumElements();
       SmallVector<Type *, 4> ScalarTys;
       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
@@ -919,7 +960,8 @@ public:
           IID, RetTy->getScalarType(), ScalarTys, FMF);
       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
         if (Tys[i]->isVectorTy()) {
-          ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
+          if (ScalarizationCostPassed == UINT_MAX)
+            ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
           ScalarCalls = std::max(ScalarCalls, Tys[i]->getVectorNumElements());
         }
       }
diff --git a/lib/Analysis/CostModel.cpp b/lib/Analysis/CostModel.cpp
index 6b77397956c..757a1e50284 100644
--- a/lib/Analysis/CostModel.cpp
+++ b/lib/Analysis/CostModel.cpp
@@ -542,9 +542,7 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
   }
   case Instruction::Call:
     if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
-      SmallVector<Value *, 4> Args;
-      for (unsigned J = 0, JE = II->getNumArgOperands(); J != JE; ++J)
-        Args.push_back(II->getArgOperand(J));
+      SmallVector<Value *, 4> Args(II->arg_operands());
 
       FastMathFlags FMF;
       if (auto *FPMO = dyn_cast<FPMathOperator>(II))
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index 3e1c1457b6d..4249b0cbe85 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -378,17 +378,17 @@ int TargetTransformInfo::getInterleavedMemoryOpCost(
 }
 
 int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                                               ArrayRef<Type *> Tys,
-                                               FastMathFlags FMF) const {
-  int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Tys, FMF);
+               ArrayRef<Type *> Tys, FastMathFlags FMF,
+               unsigned ScalarizationCostPassed) const {
+  int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
+                                            ScalarizationCostPassed);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
 
 int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                                               ArrayRef<Value *> Args,
-                                               FastMathFlags FMF) const {
-  int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args, FMF);
+            ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) const {
+  int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
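UINT_MAX thus acts as a sentinel for "no precomputed cost". A sketch of the
two resulting call forms against the types-based overload (VecRetTy, VecTys
and PrecomputedCost are assumed caller values, not names from the patch):

    // No cost passed: overhead is derived from the vector types themselves.
    int C1 = TTI.getIntrinsicInstrCost(ID, VecRetTy, VecTys, FMF);

    // Cost passed: e.g. previously computed from the actual operands, so
    // duplicate and constant operands are already accounted for.
    int C2 = TTI.getIntrinsicInstrCost(ID, VecRetTy, VecTys, FMF,
                                       PrecomputedCost);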
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 7629e0c95c6..7a92ddff253 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1370,7 +1370,8 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
 }
 
 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                                      ArrayRef<Type *> Tys, FastMathFlags FMF) {
+                                      ArrayRef<Type *> Tys, FastMathFlags FMF,
+                                      unsigned ScalarizationCostPassed) {
   // Costs should match the codegen from:
   // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
   // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
@@ -1551,12 +1552,12 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
   if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
     return LT.first * Entry->Cost;
 
-  return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF);
+  return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
 }
 
 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                                      ArrayRef<Value *> Args, FastMathFlags FMF) {
-  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF);
+                    ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) {
+  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
 }
 
 int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
index 63a1493002f..0622fcf2815 100644
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -74,9 +74,11 @@ public:
                   const SCEV *Ptr);
 
   int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                            ArrayRef<Type *> Tys, FastMathFlags FMF);
+                            ArrayRef<Type *> Tys, FastMathFlags FMF,
+                            unsigned ScalarizationCostPassed = UINT_MAX);
   int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                            ArrayRef<Value *> Args, FastMathFlags FMF);
+                            ArrayRef<Value *> Args, FastMathFlags FMF,
+                            unsigned VF = 1);
 
   int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm);
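A target that overrides these hooks only needs to thread the new parameters
through to BaseT, as the X86 change above does. A minimal sketch for a
hypothetical target (MyTTIImpl is illustrative and not part of the patch):

    int MyTTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                                         ArrayRef<Value *> Args,
                                         FastMathFlags FMF, unsigned VF) {
      // No target-specific cost table entry: defer to the generic
      // implementation, keeping VF so the base class can widen the types
      // and cost the actual operands.
      return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
    }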
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp
index c01740b27d5..705e1533275 100644
--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -1127,39 +1127,51 @@
         FastMathFlags FMFCI;
         if (auto *FPMOCI = dyn_cast<FPMathOperator>(CI))
           FMFCI = FPMOCI->getFastMathFlags();
+        SmallVector<Value *, 4> IArgs(CI->arg_operands());
+        unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, IArgs, FMFCI);
 
-        SmallVector<Type *, 4> Tys;
-        for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
-          Tys.push_back(CI->getArgOperand(i)->getType());
-        unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, Tys, FMFCI);
-
-        Tys.clear();
         CallInst *CJ = cast<CallInst>(J);
 
         FastMathFlags FMFCJ;
         if (auto *FPMOCJ = dyn_cast<FPMathOperator>(CJ))
           FMFCJ = FPMOCJ->getFastMathFlags();
 
-        for (unsigned i = 0, ie = CJ->getNumArgOperands(); i != ie; ++i)
-          Tys.push_back(CJ->getArgOperand(i)->getType());
-        unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, Tys, FMFCJ);
+        SmallVector<Value *, 4> JArgs(CJ->arg_operands());
+        unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, JArgs, FMFCJ);
 
-        Tys.clear();
         assert(CI->getNumArgOperands() == CJ->getNumArgOperands() &&
                "Intrinsic argument counts differ");
+        SmallVector<Type *, 4> Tys;
+        SmallVector<Value *, 4> VecArgs;
         for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
           if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
-               IID == Intrinsic::cttz) && i == 1)
-            Tys.push_back(CI->getArgOperand(i)->getType());
-          else
+               IID == Intrinsic::cttz) && i == 1) {
+            Tys.push_back(CI->getArgOperand(i)->getType());
+            VecArgs.push_back(CI->getArgOperand(i));
+          }
+          else {
             Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(),
                                             CJ->getArgOperand(i)->getType()));
+            // Add both operands, and then count their scalarization overhead
+            // with VF 1.
+            VecArgs.push_back(CI->getArgOperand(i));
+            VecArgs.push_back(CJ->getArgOperand(i));
+          }
         }
 
+        // Compute the scalarization cost here with the original operands (to
+        // check for uniqueness etc), and then call getIntrinsicInstrCost()
+        // with the constructed vector types.
+        Type *RetTy = getVecTypeForPair(IT1, JT1);
+        unsigned ScalarizationCost = 0;
+        if (!RetTy->isVoidTy())
+          ScalarizationCost += TTI->getScalarizationOverhead(RetTy, true, false);
+        ScalarizationCost += TTI->getOperandsScalarizationOverhead(VecArgs, 1);
+
         FastMathFlags FMFV = FMFCI;
         FMFV &= FMFCJ;
-        Type *RetTy = getVecTypeForPair(IT1, JT1);
-        unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys, FMFV);
+        unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys, FMFV,
+                                                    ScalarizationCost);
 
         if (VCost > ICost + JCost)
           return false;
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 43dca02f65f..c13bcea737e 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3784,16 +3784,12 @@ static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
   assert(ID && "Expected intrinsic call!");
 
-  Type *RetTy = ToVectorTy(CI->getType(), VF);
-  SmallVector<Type *, 4> Tys;
-  for (Value *ArgOperand : CI->arg_operands())
-    Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
-
   FastMathFlags FMF;
   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
     FMF = FPMO->getFastMathFlags();
 
-  return TTI.getIntrinsicInstrCost(ID, RetTy, Tys, FMF);
+  SmallVector<Value *, 4> Operands(CI->arg_operands());
+  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
 }
 
 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
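With this change the loop vectorizer no longer widens the types itself; it
hands the call's actual operands and the candidate VF to TTI. One
consequence (an assumed example, not a test from this patch) is that
constant operands are costed more accurately:

    // Loop body contains (IR):  %r = call float @llvm.powi.f32(float %x, i32 3)
    SmallVector<Value *, 4> Operands(CI->arg_operands());
    unsigned Cost = TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands,
                                              FMF, /*VF=*/4);
    // getOperandsScalarizationOverhead() sees that the exponent is a
    // constant and charges extraction overhead only for %x.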
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 40adf2e79be..b9df89e3eec 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1877,12 +1877,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
 
       // Calculate the cost of the scalar and vector calls.
-      SmallVector<Type *, 4> ScalarTys, VecTys;
-      for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op) {
+      SmallVector<Type *, 4> ScalarTys;
+      for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op)
         ScalarTys.push_back(CI->getArgOperand(op)->getType());
-        VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
-                                         VecTy->getNumElements()));
-      }
 
       FastMathFlags FMF;
       if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
@@ -1891,7 +1888,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
       int ScalarCallCost = VecTy->getNumElements() *
           TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
 
-      int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);
+      SmallVector<Value *, 4> Args(CI->arg_operands());
+      int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
+                                                   VecTy->getNumElements());
 
       DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
             << " (" << VecCallCost << "-" << ScalarCallCost << ")"
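The SLP vectorizer now compares a per-lane scalar cost, multiplied by the
lane count, against a vector cost computed from the call's real operands.
With assumed numbers purely for illustration:

    int ScalarCost = 10;                       // assumed per-call scalar cost
    int ScalarCallCost = 4 * ScalarCost;       // 4 lanes -> 40
    int VecCallCost = 43;                      // assumed argument-aware result
    int Delta = VecCallCost - ScalarCallCost;  // +3, reported by the DEBUG
                                               // line as "SLP: Call cost 3 (43-40)"

A positive delta counts against vectorizing the bundle.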
diff --git a/test/Analysis/CostModel/X86/arith-fp.ll b/test/Analysis/CostModel/X86/arith-fp.ll
index 689442f67a1..e5043010c11 100644
--- a/test/Analysis/CostModel/X86/arith-fp.ll
+++ b/test/Analysis/CostModel/X86/arith-fp.ll
@@ -456,20 +456,20 @@ define i32 @fma(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %F32 = call float @llvm.fma.f32
   ; AVX512: cost of 1 {{.*}} %F32 = call float @llvm.fma.f32
   %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
-  ; SSE2: cost of 52 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
-  ; SSE42: cost of 52 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
+  ; SSE2: cost of 43 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
+  ; SSE42: cost of 43 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
   ; AVX: cost of 1 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
   ; AVX2: cost of 1 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
   ; AVX512: cost of 1 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
   %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
-  ; SSE2: cost of 104 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
-  ; SSE42: cost of 104 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
+  ; SSE2: cost of 86 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
+  ; SSE42: cost of 86 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
   ; AVX: cost of 1 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
   ; AVX2: cost of 1 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
   ; AVX512: cost of 1 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
   %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
-  ; SSE2: cost of 208 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
-  ; SSE42: cost of 208 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
+  ; SSE2: cost of 172 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
+  ; SSE42: cost of 172 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
   ; AVX: cost of 4 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
   ; AVX2: cost of 4 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
   ; AVX512: cost of 1 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
@@ -481,20 +481,20 @@ define i32 @fma(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %F64 = call double @llvm.fma.f64
   ; AVX512: cost of 1 {{.*}} %F64 = call double @llvm.fma.f64
   %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
-  ; SSE2: cost of 24 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
-  ; SSE42: cost of 24 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
+  ; SSE2: cost of 21 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
+  ; SSE42: cost of 21 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
   ; AVX: cost of 1 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
   ; AVX2: cost of 1 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
   ; AVX512: cost of 1 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
   %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
-  ; SSE2: cost of 48 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
-  ; SSE42: cost of 48 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
+  ; SSE2: cost of 42 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
+  ; SSE42: cost of 42 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
   ; AVX: cost of 1 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
   ; AVX2: cost of 1 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
   ; AVX512: cost of 1 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
   %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
-  ; SSE2: cost of 96 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
-  ; SSE42: cost of 96 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
+  ; SSE2: cost of 84 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
+  ; SSE42: cost of 84 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
   ; AVX: cost of 4 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
   ; AVX2: cost of 4 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
   ; AVX512: cost of 1 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
diff --git a/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
index 5add5a24d76..54ee8fc6e73 100644
--- a/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
+++ b/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
@@ -170,8 +170,8 @@ entry:
 ; VF_2-LABEL: Checking a loop in "i64_factor_8"
 ; VF_2:       Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
 ; VF_2-NEXT:  Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
-; VF_2-NEXT:  Found an estimated cost of 10 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
-; VF_2-NEXT:  Found an estimated cost of 10 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_2-NEXT:  Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_2-NEXT:  Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 2
diff --git a/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll b/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
index c85dc63c31a..29adec049f6 100644
--- a/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
+++ b/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
@@ -124,12 +124,12 @@ entry:
 ; VF_4:       Found an estimated cost of 40 for VF 4 For instruction: %tmp2 = load half, half* %tmp0, align 2
 ; VF_4-NEXT:  Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, half* %tmp1, align 2
 ; VF_4-NEXT:  Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2
-; VF_4-NEXT:  Found an estimated cost of 40 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_4-NEXT:  Found an estimated cost of 32 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
 ; VF_8-LABEL: Checking a loop in "half_factor_2"
 ; VF_8:       Found an estimated cost of 80 for VF 8 For instruction: %tmp2 = load half, half* %tmp0, align 2
 ; VF_8-NEXT:  Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, half* %tmp1, align 2
 ; VF_8-NEXT:  Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2
-; VF_8-NEXT:  Found an estimated cost of 80 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_8-NEXT:  Found an estimated cost of 64 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
  %tmp0 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 0
-- 
2.50.1