"are incurred."));
static cl::opt<bool> MaximizeBandwidth(
- "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
+ "vectorizer-maximize-bandwidth", cl::init(true), cl::Hidden,
cl::desc("Maximize bandwidth when selecting vectorization factor which "
"will be determined by the smallest type in loop."));
}
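For context, the rule this default flips can be summarized with a minimal C++ sketch (hypothetical helper names, not the actual LoopVectorize code): the candidate maximum VF is derived from the smallest scalar type in the loop instead of the widest one.

```cpp
// Minimal sketch of the VF rule controlled by -vectorizer-maximize-bandwidth
// (hypothetical helper, not the real LoopVectorize implementation).
unsigned pickMaxVF(unsigned WidestRegisterBits, unsigned SmallestTypeBits,
                   unsigned WidestTypeBits, bool MaximizeBandwidthFlag) {
  // Old default: one register's worth of the widest type in the loop.
  unsigned MaxVF = WidestRegisterBits / WidestTypeBits;
  if (MaximizeBandwidthFlag)
    // New default: fill a register with the smallest type; wider types in
    // the same loop then span several registers per vector iteration.
    MaxVF = WidestRegisterBits / SmallestTypeBits;
  return MaxVF;
}
```

On a 128-bit vector target this moves @add_c below from VF = 128/16 = 8 (widest type i16) to VF = 128/8 = 16 (smallest type i8), which is exactly the <8 x i8> to <16 x i8> widening in the updated CHECK lines.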
; CHECK-LABEL: @add_c(
-; CHECK: load <8 x i8>, <8 x i8>*
-; CHECK: add <8 x i16>
-; CHECK: store <8 x i16>
+; CHECK: load <16 x i8>, <16 x i8>*
+; CHECK: add <16 x i16>
+; CHECK: store <16 x i16>
; Function Attrs: nounwind
define void @add_c(i8* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
entry:
}
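The original C source of this kernel is not part of the hunk; a hypothetical C++ equivalent consistent with the IR signature and the load-widen-add pattern the CHECK lines match (the added constant is illustrative) would be:

```cpp
// Hypothetical equivalent of @add_c: the i8 load is widened to i16 before
// the add, so i8 is the smallest type in the loop and drives VF = 16.
void add_c(const unsigned char *__restrict p, unsigned short *__restrict q,
           int len) {
  for (int i = 0; i < len; ++i)
    q[i] = static_cast<unsigned short>(p[i]) + 2; // widen, add, store i16
}
```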
; CHECK-LABEL: @add_d(
-; CHECK: load <4 x i16>
-; CHECK: add nsw <4 x i32>
-; CHECK: store <4 x i32>
+; CHECK: load <8 x i16>
+; CHECK: add nsw <8 x i32>
+; CHECK: store <8 x i32>
define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 {
entry:
%cmp7 = icmp sgt i32 %len, 0
}
; CHECK-LABEL: @add_f
-; CHECK: load <8 x i16>
-; CHECK: trunc <8 x i16>
-; CHECK: shl <8 x i8>
-; CHECK: add <8 x i8>
-; CHECK: or <8 x i8>
-; CHECK: mul <8 x i8>
-; CHECK: and <8 x i8>
-; CHECK: xor <8 x i8>
-; CHECK: mul <8 x i8>
-; CHECK: store <8 x i8>
+; CHECK: load <16 x i16>
+; CHECK: trunc <16 x i16>
+; CHECK: shl <16 x i8>
+; CHECK: add <16 x i8>
+; CHECK: or <16 x i8>
+; CHECK: mul <16 x i8>
+; CHECK: and <16 x i8>
+; CHECK: xor <16 x i8>
+; CHECK: mul <16 x i8>
+; CHECK: store <16 x i8>
define void @add_f(i16* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
entry:
%cmp.32 = icmp sgt i32 %len, 0
; }
;
; CHECK: vector.body:
-; CHECK: phi <8 x i16>
-; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <8 x i8>
-; CHECK: zext <8 x i8> [[Ld1]] to <8 x i16>
-; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <8 x i8>
-; CHECK: zext <8 x i8> [[Ld2]] to <8 x i16>
-; CHECK: add <8 x i16>
-; CHECK: add <8 x i16>
+; CHECK: phi <16 x i16>
+; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <16 x i8>
+; CHECK: zext <16 x i8> [[Ld1]] to <16 x i16>
+; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <16 x i8>
+; CHECK: zext <16 x i8> [[Ld2]] to <16 x i16>
+; CHECK: add <16 x i16>
+; CHECK: add <16 x i16>
;
; CHECK: middle.block:
-; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>
+; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>
; CHECK: zext i16 [[Rdx]] to i32
;
define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
}
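The @reduction_i16_2 pattern above is a sum of two zero-extended i8 streams with a final horizontal reduction in middle.block; a hypothetical C++ source with that shape (not shown in the hunk) is:

```cpp
// Hypothetical shape of @reduction_i16_2: both i8 inputs are widened to i16
// and summed; the vector loop keeps a <16 x i16> accumulator (the phi the
// CHECK lines match) and middle.block folds it with vector.reduce.add.
short reduction_i16_2(const unsigned char *a, const unsigned char *b, int n) {
  short s = 0;
  for (int i = 0; i < n; ++i)
    s = static_cast<short>(s + a[i] + b[i]);
  return s;
}
```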
;CHECK-LABEL: @example10b(
-;CHECK: load <4 x i16>
-;CHECK: sext <4 x i16>
-;CHECK: store <4 x i32>
+;CHECK: load <8 x i16>
+;CHECK: sext <8 x i16>
+;CHECK: store <8 x i32>
;CHECK: ret void
define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
br label %1
; If we need to scalarize the fptoui and then use inserts to build up the
; vector again, then there is certainly no value in going 256-bit wide.
-; CHECK-NOT: vpinsrd
+; But since maximize-bandwidth is now on by default, we widen to 256 bits
+; anyway.
+; CHECK: vpinsrd
define void @convert() {
entry:
ret void
}
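To make the trade-off concrete, the kind of loop @convert exercises is plausibly of this shape (hypothetical reconstruction; only the fptoui-plus-inserts property described in the comment is assumed):

```cpp
// Hypothetical shape of @convert: a double -> unsigned conversion loop. On
// targets without a packed fptoui, each lane is converted in scalar and
// re-inserted into the vector (the vpinsrd the test now expects).
void convert_sketch(const double *__restrict in, unsigned *__restrict out,
                    int n) {
  for (int i = 0; i < n; ++i)
    out[i] = static_cast<unsigned>(in[i]); // fptoui, scalarized per lane
}
```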
-; Select VF=4 because sext <8 x i1> to <8 x i32> is expensive.
;CHECK-LABEL: @example10b(
-;CHECK: load <4 x i16>
-;CHECK: sext <4 x i16>
-;CHECK: store <4 x i32>
+;CHECK: load <8 x i16>
+;CHECK: sext <8 x i16>
+;CHECK: store <8 x i32>
;CHECK: ret void
;UNROLL-LABEL: @example10b(
-;UNROLL: load <4 x i16>
-;UNROLL: load <4 x i16>
-;UNROLL: store <4 x i32>
-;UNROLL: store <4 x i32>
+;UNROLL: load <8 x i16>
+;UNROLL: load <8 x i16>
+;UNROLL: store <8 x i32>
+;UNROLL: store <8 x i32>
;UNROLL: ret void
define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
br label %1
; }
;}
-;AVX-LABEL: @foo3
-;AVX: icmp slt <4 x i32> %wide.load, <i32 100, i32 100,
-;AVX: call <4 x double> @llvm.masked.load.v4f64.p0v4f64
-;AVX: sitofp <4 x i32> %wide.load to <4 x double>
-;AVX: fadd <4 x double>
-;AVX: call void @llvm.masked.store.v4f64.p0v4f64
-;AVX: ret void
+;AVX1-LABEL: @foo3
+;AVX1: icmp slt <4 x i32> %wide.load, <i32 100, i32 100,
+;AVX1: call <4 x double> @llvm.masked.load.v4f64.p0v4f64
+;AVX1: sitofp <4 x i32> %wide.load to <4 x double>
+;AVX1: fadd <4 x double>
+;AVX1: call void @llvm.masked.store.v4f64.p0v4f64
+;AVX1: ret void
+
+;AVX2-LABEL: @foo3
+;AVX2: icmp slt <8 x i32> %wide.load, <i32 100, i32 100,
+;AVX2: call <8 x double> @llvm.masked.load.v8f64.p0v8f64
+;AVX2: sitofp <8 x i32> %wide.load to <8 x double>
+;AVX2: fadd <8 x double>
+;AVX2: call void @llvm.masked.store.v8f64.p0v8f64
+;AVX2: ret void
;AVX512-LABEL: @foo3
-;AVX512: icmp slt <8 x i32> %wide.load, <i32 100, i32 100,
-;AVX512: call <8 x double> @llvm.masked.load.v8f64.p0v8f64
-;AVX512: sitofp <8 x i32> %wide.load to <8 x double>
-;AVX512: fadd <8 x double>
-;AVX512: call void @llvm.masked.store.v8f64.p0v8f64
+;AVX512: icmp slt <16 x i32> %wide.load, <i32 100, i32 100,
+;AVX512: call <16 x double> @llvm.masked.load.v16f64.p0v16f64
+;AVX512: sitofp <16 x i32> %wide.load to <16 x double>
+;AVX512: fadd <16 x double>
+;AVX512: call void @llvm.masked.store.v16f64.p0v16f64
;AVX512: ret void
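A hypothetical source for @foo3, reconstructed from the CHECK lines alone (compare the trigger against 100, conditionally load, convert, add, store):

```cpp
// Hypothetical shape of @foo3: the guarded body is if-converted into
// masked load/store; the i32 trigger is sitofp'ed into the fadd operand.
void foo3(double *__restrict A, const double *__restrict B,
          const int *__restrict trigger, int n) {
  for (int i = 0; i < n; ++i)
    if (trigger[i] < 100)
      A[i] = B[i] + static_cast<double>(trigger[i]);
}
```

The widths then follow from the smallest type, i32: 128/32 = 4 with AVX1's 128-bit integer vectors, 256/32 = 8 for AVX2, and 512/32 = 16 for AVX512.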
; }
;}
;AVX2-LABEL: @foo6
-;AVX2: icmp sgt <4 x i32> %reverse, zeroinitializer
-;AVX2: shufflevector <4 x i1>{{.*}}<4 x i32> <i32 3, i32 2, i32 1, i32 0>
-;AVX2: call <4 x double> @llvm.masked.load.v4f64.p0v4f64
-;AVX2: fadd <4 x double>
-;AVX2: call void @llvm.masked.store.v4f64.p0v4f64
+;AVX2: icmp sgt <8 x i32> %reverse, zeroinitializer
+;AVX2: shufflevector <8 x i1>{{.*}}<8 x i32> <i32 7, i32 6, i32 5, i32 4
+;AVX2: call <8 x double> @llvm.masked.load.v8f64.p0v8f64
+;AVX2: fadd <8 x double>
+;AVX2: call void @llvm.masked.store.v8f64.p0v8f64
;AVX2: ret void
;AVX512-LABEL: @foo6
-;AVX512: icmp sgt <8 x i32> %reverse, zeroinitializer
-;AVX512: shufflevector <8 x i1>{{.*}}<8 x i32> <i32 7, i32 6, i32 5, i32 4
-;AVX512: call <8 x double> @llvm.masked.load.v8f64.p0v8f64
-;AVX512: fadd <8 x double>
-;AVX512: call void @llvm.masked.store.v8f64.p0v8f64
+;AVX512: icmp sgt <16 x i32> %reverse, zeroinitializer
+;AVX512: shufflevector <16 x i1>{{.*}}<16 x i32> <i32 15, i32 14, i32 13, i32 12
+;AVX512: call <16 x double> @llvm.masked.load.v16f64.p0v16f64
+;AVX512: fadd <16 x double>
+;AVX512: call void @llvm.masked.store.v16f64.p0v16f64
;AVX512: ret void
; }
;AVX512-LABEL: @foo7
-;AVX512: call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>*
-;AVX512: call void @llvm.masked.store.v8f64.p0v8f64
+;AVX512: call <64 x double*> @llvm.masked.load.v64p0f64.p0v64p0f64(<64 x double*>*
+;AVX512: call void @llvm.masked.store.v64f64.p0v64f64
;AVX512: ret void
define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigger, i32 %size) #0 {
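@foo7 masked-loads a vector of double* values and masked-stores doubles; a hypothetical source consistent with the define and CHECK lines (the exact guard condition is assumed):

```cpp
// Hypothetical shape of @foo7: pointers are masked-loaded under the i8
// trigger, dereferenced, and the results masked-stored under the same mask.
void foo7(double *__restrict out, double *const *__restrict in,
          const signed char *__restrict trigger, int size) {
  for (int i = 0; i < size; ++i)
    if (trigger[i] > 0)
      out[i] = *in[i];
}
```

With i8 as the smallest type in the loop, maximize-bandwidth on a 512-bit target gives VF = 512/8 = 64, hence the <64 x double*> masked load.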
;}
;AVX512-LABEL: @foo8
-;AVX512: call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* %
-;AVX512: call void @llvm.masked.store.v8f64.p0v8f64
+;AVX512: call <64 x i32 ()*> @llvm.masked.load.v64p0f_i32f.p0v64p0f_i32f(<64 x i32 ()*>* %
+;AVX512: call void @llvm.masked.store.v64f64.p0v64f64
;AVX512: ret void
define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigger, i32 %size) #0 {
; CHECK: remark: no_fpmath.c:6:11: loop not vectorized: cannot prove it is safe to reorder floating-point operations
; CHECK: remark: no_fpmath.c:6:14: loop not vectorized
-; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 2, interleaved count: 2)
+; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 4, interleaved count: 2)
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.10.0"
; CHECK: remark: no_fpmath.c:6:11: loop not vectorized: cannot prove it is safe to reorder floating-point operations (hotness: 300)
; CHECK: remark: no_fpmath.c:6:14: loop not vectorized
-; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 2, interleaved count: 2) (hotness: 300)
+; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 4, interleaved count: 2) (hotness: 300)
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.10.0"
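The diagnostic on no_fpmath.c:6 is about reassociation; a hypothetical reduction of the kind it flags (a reconstruction, not the actual test source) is:

```cpp
// Vectorizing this reorders the additions, which is only legal under
// fast-math/reassociation; without it the vectorizer reports "cannot prove
// it is safe to reorder floating-point operations".
float sum(const float *a, int n) {
  float s = 0.0f;
  for (int i = 0; i < n; ++i)
    s += a[i]; // strict FP order: ((s + a[0]) + a[1]) + ...
  return s;
}
```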
define void @test1(float* nocapture %arg, i32 %arg1) nounwind {
; CHECK-LABEL: @test1(
; CHECK: preheader
-; CHECK: insertelement <2 x double> zeroinitializer, double %tmp, i32 0
+; CHECK: insertelement <4 x double> zeroinitializer, double %tmp, i32 0
; CHECK: vector.memcheck
bb:
; DEBUG-OUTPUT-NOT: .loc
; DEBUG-OUTPUT-NOT: {{.*}}.debug_info
-; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 4, interleaved count: 1)
+; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 16, interleaved count: 1)
; UNROLLED: remark: vectorization-remarks.c:17:8: interleaved loop (interleaved count: 4)
; NONE: remark: vectorization-remarks.c:17:8: loop not vectorized: vectorization and interleaving are explicitly disabled, or vectorize width and interleave count are both set to 1
; DEBUG-OUTPUT-NOT: .loc
; DEBUG-OUTPUT-NOT: {{.*}}.debug_info
-; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 4, interleaved count: 1)
+; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 16, interleaved count: 1)
; UNROLLED: remark: vectorization-remarks.c:17:8: interleaved loop (interleaved count: 4)
; NONE: remark: vectorization-remarks.c:17:8: loop not vectorized: vectorization and interleaving are explicitly disabled, or vectorize width and interleave count are both set to 1