From: Jonas Paulsson Date: Wed, 12 Apr 2017 12:41:37 +0000 (+0000) Subject: [LoopVectorizer, TTI] New method supportsEfficientVectorElementLoadStore() X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=43d439da886fb7df1b90c1d8136c58de47e24b92;p=llvm [LoopVectorizer, TTI] New method supportsEfficientVectorElementLoadStore() Since SystemZ supports vector element load/store instructions, there is no need for extracts/inserts if a vector load/store gets scalarized. This patch lets Target specify that it supports such instructions by means of a new TTI hook that defaults to false. The use for this is in the LoopVectorizer getScalarizationOverhead() method, which will with this patch produce a smaller sum for a vector load/store on SystemZ. New test: test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll Review: Adam Nemet https://reviews.llvm.org/D30680 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@300056 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index 3d92208f08a..67196687d55 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -437,6 +437,11 @@ public: unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args, unsigned VF) const; + /// If target has efficient vector element load/store instructions, it can + /// return true here so that insertion/extraction costs are not added to + /// the scalarization cost of a load/store. + bool supportsEfficientVectorElementLoadStore() const; + /// \brief Don't restrict interleaved unrolling to small loops. 
bool enableAggressiveInterleaving(bool LoopHasReductions) const; @@ -790,6 +795,7 @@ public: getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) = 0; virtual unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args, unsigned VF) = 0; + virtual bool supportsEfficientVectorElementLoadStore() = 0; virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0; virtual bool enableInterleavedAccessVectorization() = 0; virtual bool isFPVectorizationPotentiallyUnsafe() = 0; @@ -996,6 +1002,10 @@ public: return Impl.getOperandsScalarizationOverhead(Args, VF); } + bool supportsEfficientVectorElementLoadStore() override { + return Impl.supportsEfficientVectorElementLoadStore(); + } + bool enableAggressiveInterleaving(bool LoopHasReductions) override { return Impl.enableAggressiveInterleaving(LoopHasReductions); } diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h index 790acbc5145..9ab6b7445ab 100644 --- a/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -262,6 +262,8 @@ public: unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args, unsigned VF) { return 0; } + bool supportsEfficientVectorElementLoadStore() { return false; } + bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; } bool enableInterleavedAccessVectorization() { return false; } diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index c8b8740bd85..d73b1a12803 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -197,6 +197,10 @@ getOperandsScalarizationOverhead(ArrayRef<const Value *> Args, return TTIImpl->getOperandsScalarizationOverhead(Args, VF); } +bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const { + return TTIImpl->supportsEfficientVectorElementLoadStore(); +} + bool TargetTransformInfo::enableAggressiveInterleaving(bool LoopHasReductions) const { return 
TTIImpl->enableAggressiveInterleaving(LoopHasReductions); } diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h index d2639cb271d..3766ed45b8c 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -55,6 +55,7 @@ public: unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); + bool supportsEfficientVectorElementLoadStore() { return true; } bool enableInterleavedAccessVectorization() { return true; } int getArithmeticInstrCost( diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index f891cd9d62c..26bcbcb5f18 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3663,13 +3663,17 @@ static unsigned getScalarizationOverhead(Instruction *I, unsigned VF, unsigned Cost = 0; Type *RetTy = ToVectorTy(I->getType(), VF); - if (!RetTy->isVoidTy()) + if (!RetTy->isVoidTy() && + (!isa<LoadInst>(I) || + !TTI.supportsEfficientVectorElementLoadStore())) Cost += TTI.getScalarizationOverhead(RetTy, true, false); if (CallInst *CI = dyn_cast<CallInst>(I)) { SmallVector<const Value *, 4> Operands(CI->arg_operands()); Cost += TTI.getOperandsScalarizationOverhead(Operands, VF); - } else { + } + else if (!isa<StoreInst>(I) || + !TTI.supportsEfficientVectorElementLoadStore()) { SmallVector<const Value *, 4> Operands(I->operand_values()); Cost += TTI.getOperandsScalarizationOverhead(Operands, VF); } diff --git a/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll b/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll new file mode 100644 index 00000000000..e7096c29b99 --- /dev/null +++ b/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll @@ -0,0 +1,33 @@ +; REQUIRES: asserts +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \ +; RUN: -force-vector-width=4 -debug-only=loop-vectorize \ +; RUN: -disable-output 
-enable-interleaved-mem-accesses=false < %s 2>&1 | \ +; RUN: FileCheck %s +; +; Check that a scalarized load/store does not get a cost for inserts/ +; extracts, since z13 supports element load/store. + +define void @fun(i32* %data, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds i32, i32* %data, i64 %i + %tmp1 = load i32, i32* %tmp0, align 4 + %tmp2 = add i32 %tmp1, 1 + store i32 %tmp2, i32* %tmp0, align 4 + %i.next = add nuw nsw i64 %i, 2 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void + +; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: %tmp1 = load i32, i32* %tmp0, align 4 +; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %tmp2, i32* %tmp0, align 4 + +; CHECK: LV: Scalarizing: %tmp1 = load i32, i32* %tmp0, align 4 +; CHECK: LV: Scalarizing: store i32 %tmp2, i32* %tmp0, align 4 +} +