From 0316f025dab46d776dee70d3b695e1a07db5371c Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Thu, 18 Aug 2016 22:38:06 +0000 Subject: [PATCH] Merging r279125 and r278343: ------------------------------------------------------------------------ r279125 | mssimpso | 2016-08-18 12:50:32 -0700 (Thu, 18 Aug 2016) | 14 lines [SLP] Initialize VectorizedValue when gathering We abort building vectorizable trees in some cases (e.g., if the maximum recursion depth is reached, if the region size is too large, etc.). If this happens for a reduction, we can be left with a root entry that needs to be gathered. For these cases, we need make sure we actually set VectorizedValue to the resulting vector. This patch ensures we properly set VectorizedValue, and it also ensures the insertelement sequence generated for the gathers is inserted at the correct location. Reference: https://llvm.org/bugs/show_bug.cgi?id=28330 Differential Revison: https://reviews.llvm.org/D23410 ------------------------------------------------------------------------ ------------------------------------------------------------------------ r278343 | mssimpso | 2016-08-11 08:28:45 -0700 (Thu, 11 Aug 2016) | 1 line [SLP] Make RecursionMaxDepth a command line option (NFC) ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_39@279174 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/SLPVectorizer.cpp | 79 +++++++++++++-- .../SLPVectorizer/AArch64/gather-root.ll | 95 +++++++++++++++++++ 2 files changed, 164 insertions(+), 10 deletions(-) create mode 100644 test/Transforms/SLPVectorizer/AArch64/gather-root.ll diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 8a3c4d14fec..cd921069515 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -82,8 +82,9 @@ static cl::opt MinVectorRegSizeOption( "slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits")); -// FIXME: Set this via cl::opt to allow overriding. -static const unsigned RecursionMaxDepth = 12; +static cl::opt RecursionMaxDepth( + "slp-recursion-max-depth", cl::init(12), cl::Hidden, + cl::desc("Limit the recursion depth when building a vectorizable tree")); // Limit the number of alias checks. The limit is chosen so that // it has no negative effect on the llvm benchmarks. @@ -2124,11 +2125,61 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef VL, } void BoUpSLP::setInsertPointAfterBundle(ArrayRef VL) { - Instruction *VL0 = cast(VL[0]); - BasicBlock::iterator NextInst(VL0); - ++NextInst; - Builder.SetInsertPoint(VL0->getParent(), NextInst); - Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); + + // Get the basic block this bundle is in. All instructions in the bundle + // should be in this block. + auto *Front = cast(VL.front()); + auto *BB = Front->getParent(); + assert(all_of(make_range(VL.begin(), VL.end()), [&](Value *V) -> bool { + return cast(V)->getParent() == BB; + })); + + // The last instruction in the bundle in program order. + Instruction *LastInst = nullptr; + + // Find the last instruction. The common case should be that BB has been + // scheduled, and the last instruction is VL.back(). So we start with + // VL.back() and iterate over schedule data until we reach the end of the + // bundle. The end of the bundle is marked by null ScheduleData. + if (BlocksSchedules.count(BB)) { + auto *Bundle = BlocksSchedules[BB]->getScheduleData(VL.back()); + if (Bundle && Bundle->isPartOfBundle()) + for (; Bundle; Bundle = Bundle->NextInBundle) + LastInst = Bundle->Inst; + } + + // LastInst can still be null at this point if there's either not an entry + // for BB in BlocksSchedules or there's no ScheduleData available for + // VL.back(). This can be the case if buildTree_rec aborts for various + // reasons (e.g., the maximum recursion depth is reached, the maximum region + // size is reached, etc.). ScheduleData is initialized in the scheduling + // "dry-run". + // + // If this happens, we can still find the last instruction by brute force. We + // iterate forwards from Front (inclusive) until we either see all + // instructions in the bundle or reach the end of the block. If Front is the + // last instruction in program order, LastInst will be set to Front, and we + // will visit all the remaining instructions in the block. + // + // One of the reasons we exit early from buildTree_rec is to place an upper + // bound on compile-time. Thus, taking an additional compile-time hit here is + // not ideal. However, this should be exceedingly rare since it requires that + // we both exit early from buildTree_rec and that the bundle be out-of-order + // (causing us to iterate all the way to the end of the block). + if (!LastInst) { + SmallPtrSet Bundle(VL.begin(), VL.end()); + for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) { + if (Bundle.erase(&I)) + LastInst = &I; + if (Bundle.empty()) + break; + } + } + + // Set the insertion point after the last instruction in the bundle. Set the + // debug location to Front. + Builder.SetInsertPoint(BB, next(BasicBlock::iterator(LastInst))); + Builder.SetCurrentDebugLocation(Front->getDebugLoc()); } Value *BoUpSLP::Gather(ArrayRef VL, VectorType *Ty) { @@ -2206,7 +2257,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (E->NeedToGather) { setInsertPointAfterBundle(E->Scalars); - return Gather(E->Scalars, VecTy); + auto *V = Gather(E->Scalars, VecTy); + E->VectorizedValue = V; + return V; } unsigned Opcode = getSameOpcode(E->Scalars); @@ -2253,7 +2306,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { E->VectorizedValue = V; return V; } - return Gather(E->Scalars, VecTy); + setInsertPointAfterBundle(E->Scalars); + auto *V = Gather(E->Scalars, VecTy); + E->VectorizedValue = V; + return V; } case Instruction::ExtractValue: { if (canReuseExtract(E->Scalars, Instruction::ExtractValue)) { @@ -2265,7 +2321,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { E->VectorizedValue = V; return propagateMetadata(V, E->Scalars); } - return Gather(E->Scalars, VecTy); + setInsertPointAfterBundle(E->Scalars); + auto *V = Gather(E->Scalars, VecTy); + E->VectorizedValue = V; + return V; } case Instruction::ZExt: case Instruction::SExt: diff --git a/test/Transforms/SLPVectorizer/AArch64/gather-root.ll b/test/Transforms/SLPVectorizer/AArch64/gather-root.ll new file mode 100644 index 00000000000..f9ef7382d31 --- /dev/null +++ b/test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -0,0 +1,95 @@ +; RUN: opt < %s -slp-vectorizer -S | FileCheck %s --check-prefix=DEFAULT +; RUN: opt < %s -slp-recursion-max-depth=0 -slp-vectorizer -S | FileCheck %s --check-prefix=GATHER + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +@a = common global [80 x i8] zeroinitializer, align 16 + +; DEFAULT-LABEL: @PR28330( +; DEFAULT: %tmp17 = phi i32 [ %tmp34, %for.body ], [ 0, %entry ] +; DEFAULT: %tmp18 = phi i32 [ %tmp35, %for.body ], [ %n, %entry ] +; DEFAULT: %[[S0:.+]] = select <8 x i1> %1, <8 x i32> , <8 x i32> +; DEFAULT: %[[R0:.+]] = shufflevector <8 x i32> %[[S0]], <8 x i32> undef, <8 x i32> +; DEFAULT: %[[R1:.+]] = add <8 x i32> %[[S0]], %[[R0]] +; DEFAULT: %[[R2:.+]] = shufflevector <8 x i32> %[[R1]], <8 x i32> undef, <8 x i32> +; DEFAULT: %[[R3:.+]] = add <8 x i32> %[[R1]], %[[R2]] +; DEFAULT: %[[R4:.+]] = shufflevector <8 x i32> %[[R3]], <8 x i32> undef, <8 x i32> +; DEFAULT: %[[R5:.+]] = add <8 x i32> %[[R3]], %[[R4]] +; DEFAULT: %[[R6:.+]] = extractelement <8 x i32> %[[R5]], i32 0 +; DEFAULT: %tmp34 = add i32 %[[R6]], %tmp17 +; +; GATHER-LABEL: @PR28330( +; GATHER: %tmp17 = phi i32 [ %tmp34, %for.body ], [ 0, %entry ] +; GATHER: %tmp18 = phi i32 [ %tmp35, %for.body ], [ %n, %entry ] +; GATHER: %tmp19 = select i1 %tmp1, i32 -720, i32 -80 +; GATHER: %tmp21 = select i1 %tmp3, i32 -720, i32 -80 +; GATHER: %tmp23 = select i1 %tmp5, i32 -720, i32 -80 +; GATHER: %tmp25 = select i1 %tmp7, i32 -720, i32 -80 +; GATHER: %tmp27 = select i1 %tmp9, i32 -720, i32 -80 +; GATHER: %tmp29 = select i1 %tmp11, i32 -720, i32 -80 +; GATHER: %tmp31 = select i1 %tmp13, i32 -720, i32 -80 +; GATHER: %tmp33 = select i1 %tmp15, i32 -720, i32 -80 +; GATHER: %[[I0:.+]] = insertelement <8 x i32> undef, i32 %tmp19, i32 0 +; GATHER: %[[I1:.+]] = insertelement <8 x i32> %[[I0]], i32 %tmp21, i32 1 +; GATHER: %[[I2:.+]] = insertelement <8 x i32> %[[I1]], i32 %tmp23, i32 2 +; GATHER: %[[I3:.+]] = insertelement <8 x i32> %[[I2]], i32 %tmp25, i32 3 +; GATHER: %[[I4:.+]] = insertelement <8 x i32> %[[I3]], i32 %tmp27, i32 4 +; GATHER: %[[I5:.+]] = insertelement <8 x i32> %[[I4]], i32 %tmp29, i32 5 +; GATHER: %[[I6:.+]] = insertelement <8 x i32> %[[I5]], i32 %tmp31, i32 6 +; GATHER: %[[I7:.+]] = insertelement <8 x i32> %[[I6]], i32 %tmp33, i32 7 +; GATHER: %[[R0:.+]] = shufflevector <8 x i32> %[[I7]], <8 x i32> undef, <8 x i32> +; GATHER: %[[R1:.+]] = add <8 x i32> %[[I7]], %[[R0]] +; GATHER: %[[R2:.+]] = shufflevector <8 x i32> %[[R1]], <8 x i32> undef, <8 x i32> +; GATHER: %[[R3:.+]] = add <8 x i32> %[[R1]], %[[R2]] +; GATHER: %[[R4:.+]] = shufflevector <8 x i32> %[[R3]], <8 x i32> undef, <8 x i32> +; GATHER: %[[R5:.+]] = add <8 x i32> %[[R3]], %[[R4]] +; GATHER: %[[R6:.+]] = extractelement <8 x i32> %[[R5]], i32 0 +; GATHER: %tmp34 = add i32 %[[R6]], %tmp17 + +define void @PR28330(i32 %n) { +entry: + %tmp0 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1 + %tmp1 = icmp eq i8 %tmp0, 0 + %tmp2 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2 + %tmp3 = icmp eq i8 %tmp2, 0 + %tmp4 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1 + %tmp5 = icmp eq i8 %tmp4, 0 + %tmp6 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 + %tmp7 = icmp eq i8 %tmp6, 0 + %tmp8 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 + %tmp9 = icmp eq i8 %tmp8, 0 + %tmp10 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2 + %tmp11 = icmp eq i8 %tmp10, 0 + %tmp12 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1 + %tmp13 = icmp eq i8 %tmp12, 0 + %tmp14 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8 + %tmp15 = icmp eq i8 %tmp14, 0 + br label %for.body + +for.body: + %tmp17 = phi i32 [ %tmp34, %for.body ], [ 0, %entry ] + %tmp18 = phi i32 [ %tmp35, %for.body ], [ %n, %entry ] + %tmp19 = select i1 %tmp1, i32 -720, i32 -80 + %tmp20 = add i32 %tmp17, %tmp19 + %tmp21 = select i1 %tmp3, i32 -720, i32 -80 + %tmp22 = add i32 %tmp20, %tmp21 + %tmp23 = select i1 %tmp5, i32 -720, i32 -80 + %tmp24 = add i32 %tmp22, %tmp23 + %tmp25 = select i1 %tmp7, i32 -720, i32 -80 + %tmp26 = add i32 %tmp24, %tmp25 + %tmp27 = select i1 %tmp9, i32 -720, i32 -80 + %tmp28 = add i32 %tmp26, %tmp27 + %tmp29 = select i1 %tmp11, i32 -720, i32 -80 + %tmp30 = add i32 %tmp28, %tmp29 + %tmp31 = select i1 %tmp13, i32 -720, i32 -80 + %tmp32 = add i32 %tmp30, %tmp31 + %tmp33 = select i1 %tmp15, i32 -720, i32 -80 + %tmp34 = add i32 %tmp32, %tmp33 + %tmp35 = add nsw i32 %tmp18, -1 + %tmp36 = icmp eq i32 %tmp35, 0 + br i1 %tmp36, label %for.end, label %for.body + +for.end: + ret void +} -- 2.40.0