From: Teresa Johnson
Date: Sat, 1 Jul 2017 03:24:06 +0000 (+0000)
Subject: Enable vectorizer-maximize-bandwidth by default.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=005cfad2e884ca81dcc35b49fb6e151d5827f8d9;p=llvm

Enable vectorizer-maximize-bandwidth by default.

Summary:
vectorizer-maximize-bandwidth is generally useful in terms of
performance. I've tested the impact of changing this to the default on
SPEC CPU 2006 benchmarks on Sandy Bridge machines. The results show a
net-positive impact overall:

spec/2006/fp/C++/444.namd        26.84   -0.31%
spec/2006/fp/C++/447.dealII      46.19   +0.89%
spec/2006/fp/C++/450.soplex      42.92   -0.44%
spec/2006/fp/C++/453.povray      38.57   -2.25%
spec/2006/fp/C/433.milc          24.54   -0.76%
spec/2006/fp/C/470.lbm           41.08   +0.26%
spec/2006/fp/C/482.sphinx3       47.58   -0.99%
spec/2006/int/C++/471.omnetpp    22.06   +1.87%
spec/2006/int/C++/473.astar      22.65   -0.12%
spec/2006/int/C++/483.xalancbmk  33.69   +4.97%
spec/2006/int/C/400.perlbench    33.43   +1.70%
spec/2006/int/C/401.bzip2        23.02   -0.19%
spec/2006/int/C/403.gcc          32.57   -0.43%
spec/2006/int/C/429.mcf          40.35   +0.27%
spec/2006/int/C/445.gobmk        26.96   +0.06%
spec/2006/int/C/456.hmmer        24.4    +0.19%
spec/2006/int/C/458.sjeng        27.91   -0.08%
spec/2006/int/C/462.libquantum   57.47   -0.20%
spec/2006/int/C/464.h264ref      46.52   +1.35%
geometric mean                           +0.29%

The regression on 453.povray appears real, but it is due to secondary
effects, as all hot functions are bit-identical with and without the
flag.

I started this patch to consult upstream opinions on it. It would be
greatly appreciated if the community could help test the performance
impact of this change on other architectures, so that we can decide
whether it should be target-dependent.
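To make the effect concrete, here is an illustrative mixed-width loop
(a sketch written for this summary, not code from the patch; it only
mirrors the shape of the AArch64 @add_c test updated below, whose exact
body is not shown in the hunk). With the flag on, the vectorization
factor is chosen so that the smallest type in the loop (i8) fills a
vector register, e.g. VF=16 on 128-bit NEON, instead of being capped by
the wider i16 type at VF=8:

  /* Loads of unsigned char (i8) are widened and stored as
     unsigned short (i16); the i8 accesses now drive the VF. */
  void widen_add(const unsigned char *p, unsigned short *q, int n) {
    for (int i = 0; i < n; i++)
      q[i] = (unsigned short)(p[i] + 2); /* constant chosen arbitrarily */
  }

For local experiments, the behavior can still be toggled by passing
-vectorizer-maximize-bandwidth=false (or =true) to opt alongside
-loop-vectorize, since the option remains a cl::opt; it is merely
hidden.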
Reviewers: hfinkel, mkuper, davidxl, chandlerc

Reviewed By: chandlerc

Subscribers: rengolin, sanjoy, javed.absar, bjope, dorit, magabari, RKSimon, llvm-commits, mzolotukhin

Differential Revision: https://reviews.llvm.org/D33341

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@306933 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 193cc4d1378..11484e8d96b 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -123,7 +123,7 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold(
                    "are incurred."));
 
 static cl::opt<bool> MaximizeBandwidth(
-    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
+    "vectorizer-maximize-bandwidth", cl::init(true), cl::Hidden,
     cl::desc("Maximize bandwidth when selecting vectorization factor which "
              "will be determined by the smallest type in loop."));
 
diff --git a/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
index 1149afe7b9f..4cdf76b9b44 100644
--- a/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
+++ b/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -88,9 +88,9 @@ for.body:                       ; preds = %entry, %for.body
 }
 
 ; CHECK-LABEL: @add_c(
-; CHECK: load <8 x i8>, <8 x i8>*
-; CHECK: add <8 x i16>
-; CHECK: store <8 x i16>
+; CHECK: load <16 x i8>, <16 x i8>*
+; CHECK: add <16 x i16>
+; CHECK: store <16 x i16>
 ; Function Attrs: nounwind
 define void @add_c(i8* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
 entry:
@@ -116,9 +116,9 @@ for.body:                       ; preds = %entry, %for.body
 }
 
 ; CHECK-LABEL: @add_d(
-; CHECK: load <4 x i16>
-; CHECK: add nsw <4 x i32>
-; CHECK: store <4 x i32>
+; CHECK: load <8 x i16>
+; CHECK: add nsw <8 x i32>
+; CHECK: store <8 x i32>
 define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 {
 entry:
   %cmp7 = icmp sgt i32 %len, 0
@@ -187,16 +187,16 @@ for.body:                     ; preds = %for.body, %for.body
 }
 
 ; CHECK-LABEL: @add_f
-; CHECK: load <8 x i16>
-; CHECK: trunc <8 x i16>
-; CHECK: shl <8 x i8>
-; CHECK: add <8 x i8>
-; CHECK: or <8 x i8>
-; CHECK: mul <8 x i8>
-; CHECK: and <8 x i8>
-; CHECK: xor <8 x i8>
-; CHECK: mul <8 x i8>
-; CHECK: store <8 x i8>
+; CHECK: load <16 x i16>
+; CHECK: trunc <16 x i16>
+; CHECK: shl <16 x i8>
+; CHECK: add <16 x i8>
+; CHECK: or <16 x i8>
+; CHECK: mul <16 x i8>
+; CHECK: and <16 x i8>
+; CHECK: xor <16 x i8>
+; CHECK: mul <16 x i8>
+; CHECK: store <16 x i8>
 define void @add_f(i16* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
 entry:
   %cmp.32 = icmp sgt i32 %len, 0
diff --git a/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll b/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
index 9d9aea00e9a..2c3d0591954 100644
--- a/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
+++ b/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
@@ -123,16 +123,16 @@ for.body:
 ; }
 ;
 ; CHECK: vector.body:
-; CHECK: phi <8 x i16>
-; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <8 x i8>
-; CHECK: zext <8 x i8> [[Ld1]] to <8 x i16>
-; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <8 x i8>
-; CHECK: zext <8 x i8> [[Ld2]] to <8 x i16>
-; CHECK: add <8 x i16>
-; CHECK: add <8 x i16>
+; CHECK: phi <16 x i16>
+; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <16 x i8>
+; CHECK: zext <16 x i8> [[Ld1]] to <16 x i16>
+; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <16 x i8>
+; CHECK: zext <16 x i8> [[Ld2]] to <16 x i16>
+; CHECK: add <16 x i16>
+; CHECK: add <16 x i16>
 ;
 ; CHECK: middle.block:
-; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>
+; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>
 ; CHECK: zext i16 [[Rdx]] to i32
 ;
 define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
diff --git a/test/Transforms/LoopVectorize/ARM/gcc-examples.ll b/test/Transforms/LoopVectorize/ARM/gcc-examples.ll
index 783156d7399..609b304f93f 100644
--- a/test/Transforms/LoopVectorize/ARM/gcc-examples.ll
+++ b/test/Transforms/LoopVectorize/ARM/gcc-examples.ll
@@ -35,9 +35,9 @@ define void @example1() nounwind uwtable ssp {
 }
 
 ;CHECK-LABEL: @example10b(
-;CHECK: load <4 x i16>
-;CHECK: sext <4 x i16>
-;CHECK: store <4 x i32>
+;CHECK: load <8 x i16>
+;CHECK: sext <8 x i16>
+;CHECK: store <8 x i32>
 ;CHECK: ret void
 define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
   br label %1
diff --git a/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll b/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll
index c066afcfa63..f002dd8adec 100644
--- a/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll
+++ b/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll
@@ -9,7 +9,9 @@ target triple = "x86_64-apple-macosx"
 
 ; If we need to scalarize the fptoui and then use inserts to build up the
 ; vector again, then there is certainly no value in going 256-bit wide.
-; CHECK-NOT: vpinsrd
+; But as we default to maximize bandwidth, we should convert it to 256-bit
+; anyway.
+; CHECK: vpinsrd
 
 define void @convert() {
 entry:
diff --git a/test/Transforms/LoopVectorize/X86/gcc-examples.ll b/test/Transforms/LoopVectorize/X86/gcc-examples.ll
index c581f4bf2a6..2e3dfa0a15f 100644
--- a/test/Transforms/LoopVectorize/X86/gcc-examples.ll
+++ b/test/Transforms/LoopVectorize/X86/gcc-examples.ll
@@ -44,17 +44,16 @@ define void @example1() nounwind uwtable ssp {
   ret void
 }
 
-; Select VF=4 because sext <8 x i1> to <8 x i32> is expensive.
 ;CHECK-LABEL: @example10b(
-;CHECK: load <4 x i16>
-;CHECK: sext <4 x i16>
-;CHECK: store <4 x i32>
+;CHECK: load <8 x i16>
+;CHECK: sext <8 x i16>
+;CHECK: store <8 x i32>
 ;CHECK: ret void
 ;UNROLL-LABEL: @example10b(
-;UNROLL: load <4 x i16>
-;UNROLL: load <4 x i16>
-;UNROLL: store <4 x i32>
-;UNROLL: store <4 x i32>
+;UNROLL: load <8 x i16>
+;UNROLL: load <8 x i16>
+;UNROLL: store <8 x i32>
+;UNROLL: store <8 x i32>
 ;UNROLL: ret void
 define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
   br label %1
diff --git a/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/test/Transforms/LoopVectorize/X86/masked_load_store.ll
index 0377ae1c24d..e15c707587f 100644
--- a/test/Transforms/LoopVectorize/X86/masked_load_store.ll
+++ b/test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -260,20 +260,28 @@ for.end:                      ; preds = %for.cond
 ; }
 ;}
 
-;AVX-LABEL: @foo3
-;AVX: icmp slt <4 x i32> %wide.load,
-;AVX: call <4 x double> @llvm.masked.load.v4f64.p0v4f64
-;AVX: sitofp <4 x i32> %wide.load to <4 x double>
-;AVX: fadd <4 x double>
-;AVX: call void @llvm.masked.store.v4f64.p0v4f64
-;AVX: ret void
+;AVX1-LABEL: @foo3
+;AVX1: icmp slt <4 x i32> %wide.load,
+;AVX1: call <4 x double> @llvm.masked.load.v4f64.p0v4f64
+;AVX1: sitofp <4 x i32> %wide.load to <4 x double>
+;AVX1: fadd <4 x double>
+;AVX1: call void @llvm.masked.store.v4f64.p0v4f64
+;AVX1: ret void
+
+;AVX2-LABEL: @foo3
+;AVX2: icmp slt <8 x i32> %wide.load,
+;AVX2: call <8 x double> @llvm.masked.load.v8f64.p0v8f64
+;AVX2: sitofp <8 x i32> %wide.load to <8 x double>
+;AVX2: fadd <8 x double>
+;AVX2: call void @llvm.masked.store.v8f64.p0v8f64
+;AVX2: ret void
 
 ;AVX512-LABEL: @foo3
-;AVX512: icmp slt <8 x i32> %wide.load,
-;AVX512: call <8 x double> @llvm.masked.load.v8f64.p0v8f64
-;AVX512: sitofp <8 x i32> %wide.load to <8 x double>
-;AVX512: fadd <8 x double>
-;AVX512: call void @llvm.masked.store.v8f64.p0v8f64
+;AVX512: icmp slt <16 x i32> %wide.load,
+;AVX512: call <16 x double> @llvm.masked.load.v16f64.p0v16f64
+;AVX512: sitofp <16 x i32> %wide.load to <16 x double>
+;AVX512: fadd <16 x double>
+;AVX512: call void @llvm.masked.store.v16f64.p0v16f64
 ;AVX512: ret void
 
 
@@ -502,19 +510,19 @@ for.end:                      ; preds = %for.cond
 ; }
 ;}
 ;AVX2-LABEL: @foo6
-;AVX2: icmp sgt <4 x i32> %reverse, zeroinitializer
-;AVX2: shufflevector <4 x i1>{{.*}}<4 x i32>
-;AVX2: call <4 x double> @llvm.masked.load.v4f64.p0v4f64
-;AVX2: fadd <4 x double>
-;AVX2: call void @llvm.masked.store.v4f64.p0v4f64
+;AVX2: icmp sgt <8 x i32> %reverse, zeroinitializer
+;AVX2: shufflevector <8 x i1>{{.*}}<8 x i32>
+;AVX2: call <8 x double> @llvm.masked.load.v8f64.p0v8f64
+;AVX2: fadd <8 x double>
+;AVX2: call void @llvm.masked.store.v8f64.p0v8f64
 ;AVX2: ret void
 
 ;AVX512-LABEL: @foo6
-;AVX512: icmp sgt <8 x i32> %reverse, zeroinitializer
-;AVX512: shufflevector <8 x i1>{{.*}}<8 x i32>
-;AVX512: call <8 x double> @llvm.masked.load.v8f64.p0v8f64
-;AVX512: fadd <8 x double>
-;AVX512: call void @llvm.masked.store.v8f64.p0v8f64
+;AVX512: icmp sgt <16 x i32> %reverse, zeroinitializer
+;AVX512: shufflevector <16 x i1>{{.*}}<16 x i32>
+;AVX512: call <16 x double> @llvm.masked.load.v16f64.p0v16f64
+;AVX512: fadd <16 x double>
+;AVX512: call void @llvm.masked.store.v16f64.p0v16f64
 ;AVX512: ret void
 
 
@@ -582,8 +590,8 @@ for.end:                      ; preds = %for.cond
 ; }
 
 ;AVX512-LABEL: @foo7
-;AVX512: call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>*
-;AVX512: call void @llvm.masked.store.v8f64.p0v8f64
+;AVX512: call <64 x double*> @llvm.masked.load.v64p0f64.p0v64p0f64(<64 x double*>*
+;AVX512: call void @llvm.masked.store.v64f64.p0v64f64
 ;AVX512: ret void
 
 define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigger, i32 %size) #0 {
@@ -654,8 +662,8 @@ for.end:                      ; preds = %for.cond
 ;}
 
 ;AVX512-LABEL: @foo8
-;AVX512: call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* %
-;AVX512: call void @llvm.masked.store.v8f64.p0v8f64
+;AVX512: call <64 x i32 ()*> @llvm.masked.load.v64p0f_i32f.p0v64p0f_i32f(<64 x i32 ()*>* %
+;AVX512: call void @llvm.masked.store.v64f64.p0v64f64
 ;AVX512: ret void
 
 define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigger, i32 %size) #0 {
diff --git a/test/Transforms/LoopVectorize/X86/no_fpmath.ll b/test/Transforms/LoopVectorize/X86/no_fpmath.ll
index 2efe928f0f4..04ecaeb4e4e 100644
--- a/test/Transforms/LoopVectorize/X86/no_fpmath.ll
+++ b/test/Transforms/LoopVectorize/X86/no_fpmath.ll
@@ -2,7 +2,7 @@
 
 ; CHECK: remark: no_fpmath.c:6:11: loop not vectorized: cannot prove it is safe to reorder floating-point operations
 ; CHECK: remark: no_fpmath.c:6:14: loop not vectorized
-; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 2, interleaved count: 2)
+; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 4, interleaved count: 2)
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.10.0"
diff --git a/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll b/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll
index 86b40dc613b..569c50d4fd6 100644
--- a/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll
+++ b/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll
@@ -3,7 +3,7 @@
 
 ; CHECK: remark: no_fpmath.c:6:11: loop not vectorized: cannot prove it is safe to reorder floating-point operations (hotness: 300)
 ; CHECK: remark: no_fpmath.c:6:14: loop not vectorized
-; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 2, interleaved count: 2) (hotness: 300)
+; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 4, interleaved count: 2) (hotness: 300)
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.10.0"
diff --git a/test/Transforms/LoopVectorize/X86/reduction-crash.ll b/test/Transforms/LoopVectorize/X86/reduction-crash.ll
index 6393002d507..c9761c3b31d 100644
--- a/test/Transforms/LoopVectorize/X86/reduction-crash.ll
+++ b/test/Transforms/LoopVectorize/X86/reduction-crash.ll
@@ -7,7 +7,7 @@ target triple = "i386-apple-darwin"
 define void @test1(float* nocapture %arg, i32 %arg1) nounwind {
 ; CHECK-LABEL: @test1(
 ; CHECK: preheader
-; CHECK: insertelement <2 x double> zeroinitializer, double %tmp, i32 0
+; CHECK: insertelement <4 x double> zeroinitializer, double %tmp, i32 0
 ; CHECK: vector.memcheck
 
 bb:
diff --git a/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll b/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll
index 1d51b9c4bea..65ff9b72e2d 100644
--- a/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll
+++ b/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll
@@ -6,7 +6,7 @@
 ; DEBUG-OUTPUT-NOT: .loc
 ; DEBUG-OUTPUT-NOT: {{.*}}.debug_info
 
-; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 4, interleaved count: 1)
+; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 16, interleaved count: 1)
 ; UNROLLED: remark: vectorization-remarks.c:17:8: interleaved loop (interleaved count: 4)
 ; NONE: remark: vectorization-remarks.c:17:8: loop not vectorized: vectorization and interleaving are explicitly disabled, or vectorize width and interleave count are both set to 1
 
diff --git a/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll b/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll
index c14a2cb91b6..22cbc4a931e 100644
--- a/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll
+++ b/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll
@@ -6,7 +6,7 @@
 ; DEBUG-OUTPUT-NOT: .loc
 ; DEBUG-OUTPUT-NOT: {{.*}}.debug_info
 
-; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 4, interleaved count: 1)
+; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 16, interleaved count: 1)
 ; UNROLLED: remark: vectorization-remarks.c:17:8: interleaved loop (interleaved count: 4)
 ; NONE: remark: vectorization-remarks.c:17:8: loop not vectorized: vectorization and interleaving are explicitly disabled, or vectorize width and interleave count are both set to 1