From 15bfd0f9a039cdda59dc67f2c6b8c069235f80ea Mon Sep 17 00:00:00 2001
From: Keno Fischer
Date: Wed, 5 Apr 2017 20:51:38 +0000
Subject: [PATCH] [X86 TTI] Implement LSV hook

Summary:
LSV wants to know the maximum size that can be loaded to a vector register.
On X86, this always matches the maximum register width. Implement this
accordingly and add a test to make sure that LSV can vectorize up to the
maximum permissible width on X86.

Reviewers: delena, arsenm

Reviewed By: arsenm

Subscribers: wdng, llvm-commits

Differential Revision: https://reviews.llvm.org/D31504

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@299589 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86TargetTransformInfo.cpp     |  6 ++-
 lib/Target/X86/X86TargetTransformInfo.h       |  3 +-
 .../LoadStoreVectorizer/X86/load-width.ll     | 38 +++++++++++++++++++
 3 files changed, 45 insertions(+), 2 deletions(-)
 create mode 100644 test/Transforms/LoadStoreVectorizer/X86/load-width.ll

diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index da653a6a9f9..ea8aa5cb61e 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -78,7 +78,7 @@ unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
   return 8;
 }
 
-unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) {
+unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
   if (Vector) {
     if (ST->hasAVX512())
       return 512;
@@ -95,6 +95,10 @@ unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) {
   return 32;
 }
 
+unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
+  return getRegisterBitWidth(true);
+}
+
 unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
   // If the loop will not be vectorized, don't interleave the loop.
   // Let regular unroll to unroll the loop, which saves the overflow
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
index 0622fcf2815..2aa94fdc3c2 100644
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -51,7 +51,8 @@ public:
   /// @{
 
   unsigned getNumberOfRegisters(bool Vector);
-  unsigned getRegisterBitWidth(bool Vector);
+  unsigned getRegisterBitWidth(bool Vector) const;
+  unsigned getLoadStoreVecRegBitWidth(unsigned AS) const;
   unsigned getMaxInterleaveFactor(unsigned VF);
   int getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
diff --git a/test/Transforms/LoadStoreVectorizer/X86/load-width.ll b/test/Transforms/LoadStoreVectorizer/X86/load-width.ll
new file mode 100644
index 00000000000..a61b25119a1
--- /dev/null
+++ b/test/Transforms/LoadStoreVectorizer/X86/load-width.ll
@@ -0,0 +1,38 @@
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu haswell -S -o - %s | FileCheck --check-prefix=CHECK-HSW %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu knl -S -o - %s | FileCheck --check-prefix=CHECK-KNL %s
+
+define <8 x double> @loadwidth_insert_extract(double* %ptr) {
+    %a = bitcast double* %ptr to <2 x double> *
+    %b = getelementptr <2 x double>, <2 x double>* %a, i32 1
+    %c = getelementptr <2 x double>, <2 x double>* %a, i32 2
+    %d = getelementptr <2 x double>, <2 x double>* %a, i32 3
+; CHECK-HSW: load <4 x double>
+; CHECK-HSW: load <4 x double>
+; CHECK-HSW-NOT: load
+; CHECK-KNL: load <8 x double>
+; CHECK-KNL-NOT: load
+    %la = load <2 x double>, <2 x double> *%a
+    %lb = load <2 x double>, <2 x double> *%b
+    %lc = load <2 x double>, <2 x double> *%c
+    %ld = load <2 x double>, <2 x double> *%d
+    ; Scalarize everything - Explicitly not a shufflevector to test this code
+    ; path in the LSV
+    %v1 = extractelement <2 x double> %la, i32 0
+    %v2 = extractelement <2 x double> %la, i32 1
+    %v3 = extractelement <2 x double> %lb, i32 0
+    %v4 = extractelement <2 x double> %lb, i32 1
+    %v5 = extractelement <2 x double> %lc, i32 0
+    %v6 = extractelement <2 x double> %lc, i32 1
+    %v7 = extractelement <2 x double> %ld, i32 0
+    %v8 = extractelement <2 x double> %ld, i32 1
+    ; Make a vector again
+    %i1 = insertelement <8 x double> undef, double %v1, i32 0
+    %i2 = insertelement <8 x double> %i1, double %v2, i32 1
+    %i3 = insertelement <8 x double> %i2, double %v3, i32 2
+    %i4 = insertelement <8 x double> %i3, double %v4, i32 3
+    %i5 = insertelement <8 x double> %i4, double %v5, i32 4
+    %i6 = insertelement <8 x double> %i5, double %v6, i32 5
+    %i7 = insertelement <8 x double> %i6, double %v7, i32 6
+    %i8 = insertelement <8 x double> %i7, double %v8, i32 7
+    ret <8 x double> %i8
+}
-- 
2.40.0