From: Tyler Nowicki Date: Thu, 27 Aug 2015 18:56:49 +0000 (+0000) Subject: Improve vectorization diagnostic messages and extend vectorize(enable) pragma. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=5e59aab216c2ce41abbf2ba850fb5f19d2384559;p=llvm Improve vectorization diagnostic messages and extend vectorize(enable) pragma. This patch changes the analysis diagnostics produced when loops with floating-point recurrences or memory operations are identified. The new messages say "cannot prove it is safe to reorder * operations; allow reordering by specifying #pragma clang loop vectorize(enable)". Depending on the type of diagnostic the message will include additional options such as ffast-math or __restrict__. This patch also allows the vectorize(enable) pragma to override the low pointer memory check threshold. When the hint is given a higher threshold is used. See the clang patch for the options produced for each diagnostic. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@246187 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 6fc1aaab6e1..eaa3ecd48e4 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -214,6 +214,11 @@ static cl::opt<unsigned> MaxNestedScalarReductionIC( cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop.")); +static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( + "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, + cl::desc("The maximum allowed number of runtime memory checks with a " + "vectorize(enable) pragma.")); + namespace { // Forward declarations. @@ -929,6 +934,15 @@ public: return DiagnosticInfo::AlwaysPrint; } + bool allowReordering() const { + // When enabling loop hints are provided we allow the vectorizer to change + // the order of operations that is given by the scalar loop. 
This is not + // enabled by default because it can be unsafe or inefficient. For example, + // reordering floating-point operations will change the way round-off + // error accumulates in the loop. + return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1; + } + private: /// Find hints specified in the loop metadata and update local values. void getHintsFromMetadata() { @@ -1427,29 +1441,25 @@ public: bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) { const char *Name = Hints.vectorizeAnalysisPassName(); bool Failed = false; - if (UnsafeAlgebraInst && - Hints.getForce() == LoopVectorizeHints::FK_Undefined && - Hints.getWidth() == 0) { + if (UnsafeAlgebraInst && !Hints.allowReordering()) { emitOptimizationRemarkAnalysisFPCommute( F->getContext(), Name, *F, UnsafeAlgebraInst->getDebugLoc(), - VectorizationReport() << "vectorization requires changes in the " - "order of operations, however IEEE 754 " - "floating-point operations are not " - "commutative"); + VectorizationReport() << "cannot prove it is safe to reorder " + "floating-point operations"); Failed = true; } - if (NumRuntimePointerChecks > - VectorizerParams::RuntimeMemoryCheckThreshold) { + // Test if runtime memcheck thresholds are exceeded. + bool PragmaThresholdReached = + NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; + bool ThresholdReached = + NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; + if ((ThresholdReached && !Hints.allowReordering()) || + PragmaThresholdReached) { emitOptimizationRemarkAnalysisAliasing( F->getContext(), Name, *F, L->getStartLoc(), VectorizationReport() - << "cannot prove pointers refer to independent arrays in memory. 
" - "The loop requires " - << NumRuntimePointerChecks - << " runtime independence checks to vectorize the loop, but that " - "would exceed the limit of " - << VectorizerParams::RuntimeMemoryCheckThreshold << " checks"); + << "cannot prove it is safe to reorder memory operations"); DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); Failed = true; } diff --git a/test/Transforms/LoopVectorize/X86/no_fpmath.ll b/test/Transforms/LoopVectorize/X86/no_fpmath.ll index bc1173ae9f7..b1e20e52478 100644 --- a/test/Transforms/LoopVectorize/X86/no_fpmath.ll +++ b/test/Transforms/LoopVectorize/X86/no_fpmath.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s -; CHECK: remark: no_fpmath.c:6:11: loop not vectorized: vectorization requires changes in the order of operations, however IEEE 754 floating-point operations are not commutative +; CHECK: remark: no_fpmath.c:6:11: loop not vectorized: cannot prove it is safe to reorder floating-point operations ; CHECK: remark: no_fpmath.c:6:14: loop not vectorized: ; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 2, interleaved count: 2) diff --git a/test/Transforms/LoopVectorize/runtime-limit.ll b/test/Transforms/LoopVectorize/runtime-limit.ll index ff8d15576c0..e583e6b7137 100644 --- a/test/Transforms/LoopVectorize/runtime-limit.ll +++ b/test/Transforms/LoopVectorize/runtime-limit.ll @@ -1,17 +1,27 @@ -; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine 
-pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=OVERRIDE +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -pragma-vectorize-memory-check-threshold=6 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" ; First loop produced diagnostic pass remark. -;CHECK: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 1) +;CHECK: remark: {{.*}}:0:0: vectorized loop (vectorization width: {{[0-9]}}, interleaved count: 1) ; Second loop produces diagnostic analysis remark. -;CHECK: remark: {{.*}}:0:0: loop not vectorized: cannot prove pointers refer to independent arrays in memory. The loop requires 11 runtime independence checks to vectorize the loop, but that would exceed the limit of 8 checks +;CHECK: remark: {{.*}}:0:0: loop not vectorized: cannot prove it is safe to reorder memory operations + +; First loop produced diagnostic pass remark. +;OVERRIDE: remark: {{.*}}:0:0: vectorized loop (vectorization width: {{[0-9]}}, interleaved count: 1) +; Second loop produces diagnostic pass remark. +;OVERRIDE: remark: {{.*}}:0:0: vectorized loop (vectorization width: {{[0-9]}}, interleaved count: 1) ; We are vectorizing with 6 runtime checks. ;CHECK-LABEL: func1x6( -;CHECK: <4 x i32> +;CHECK: <{{[0-9]}} x i32> ;CHECK: ret +;OVERRIDE-LABEL: func1x6( +;OVERRIDE: <4 x i32> +;OVERRIDE: ret define i32 @func1x6(i32* nocapture %out, i32* nocapture %A, i32* nocapture %B, i32* nocapture %C, i32* nocapture %D, i32* nocapture %E, i32* nocapture %F) { entry: br label %for.body @@ -44,8 +54,12 @@ for.end: ; preds = %for.body ; We are not vectorizing with 12 runtime checks. 
;CHECK-LABEL: func2x6( -;CHECK-NOT: <4 x i32> +;CHECK-NOT: <{{[0-9]}} x i32> ;CHECK: ret +; We vectorize with 12 checks if a vectorization hint is provided. +;OVERRIDE-LABEL: func2x6( +;OVERRIDE: <4 x i32> +;OVERRIDE: ret define i32 @func2x6(i32* nocapture %out, i32* nocapture %out2, i32* nocapture %A, i32* nocapture %B, i32* nocapture %C, i32* nocapture %D, i32* nocapture %E, i32* nocapture %F) { entry: br label %for.body