From 255071b56faf2e6477747a9945f8c018937e6052 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 15 Dec 2016 14:24:07 +0000 Subject: [PATCH] [CostModel][X86] Updated reverse shuffle costs git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@289819 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86TargetTransformInfo.cpp | 100 +++++++++++++++++- .../Analysis/CostModel/X86/shuffle-reverse.ll | 88 +++++++++------ 2 files changed, 151 insertions(+), 37 deletions(-) diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index f9d1217dbbe..6b5b5a1528e 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -604,12 +604,102 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, if (Kind == TTI::SK_Reverse) { std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); - int Cost = 1; - if (LT.second.getSizeInBits() > 128) - Cost = 3; // Extract + insert + copy. - // Multiple by the number of parts. - return Cost * LT.first; + static const CostTblEntry AVX512VBMIShuffleTbl[] = { + { ISD::VECTOR_SHUFFLE, MVT::v64i8, 1 }, // vpermb + { ISD::VECTOR_SHUFFLE, MVT::v32i8, 1 } // vpermb + }; + + if (ST->hasVBMI()) + if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + + static const CostTblEntry AVX512BWShuffleTbl[] = { + { ISD::VECTOR_SHUFFLE, MVT::v32i16, 1 }, // vpermw + { ISD::VECTOR_SHUFFLE, MVT::v64i8, 6 } // vextracti64x4 + 2*vperm2i128 + // + 2*pshufb + vinserti64x4 + }; + + if (ST->hasBWI()) + if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + + static const CostTblEntry AVX512ShuffleTbl[] = { + { ISD::VECTOR_SHUFFLE, MVT::v8f64, 1 }, // vpermpd + { ISD::VECTOR_SHUFFLE, MVT::v16f32, 1 }, // vpermps + { ISD::VECTOR_SHUFFLE, MVT::v8i64, 1 }, // vpermq + { ISD::VECTOR_SHUFFLE, MVT::v16i32, 1 }, // vpermd + }; + + if (ST->hasAVX512()) + if (const auto *Entry = + CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + + static const CostTblEntry AVX2ShuffleTbl[] = { + { ISD::VECTOR_SHUFFLE, MVT::v4f64, 1 }, // vpermpd + { ISD::VECTOR_SHUFFLE, MVT::v8f32, 1 }, // vpermps + { ISD::VECTOR_SHUFFLE, MVT::v4i64, 1 }, // vpermq + { ISD::VECTOR_SHUFFLE, MVT::v8i32, 1 }, // vpermd + { ISD::VECTOR_SHUFFLE, MVT::v16i16, 2 }, // vperm2i128 + pshufb + { ISD::VECTOR_SHUFFLE, MVT::v32i8, 2 } // vperm2i128 + pshufb + }; + + if (ST->hasAVX2()) + if (const auto *Entry = + CostTableLookup(AVX2ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + + static const CostTblEntry AVX1ShuffleTbl[] = { + { ISD::VECTOR_SHUFFLE, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd + { ISD::VECTOR_SHUFFLE, MVT::v8f32, 2 }, // vperm2f128 + vpermilps + { ISD::VECTOR_SHUFFLE, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd + { ISD::VECTOR_SHUFFLE, MVT::v8i32, 2 }, // vperm2f128 + vpermilps + { ISD::VECTOR_SHUFFLE, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb + // + vinsertf128 + { ISD::VECTOR_SHUFFLE, MVT::v32i8, 4 } // vextractf128 + 2*pshufb + // + vinsertf128 + }; + + if (ST->hasAVX()) + if (const auto *Entry = + CostTableLookup(AVX1ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + + static const CostTblEntry SSSE3ShuffleTbl[] = { + { ISD::VECTOR_SHUFFLE, MVT::v8i16, 1 }, // pshufb + { ISD::VECTOR_SHUFFLE, MVT::v16i8, 1 } // pshufb + }; + + if (ST->hasSSSE3()) + if (const auto *Entry = + CostTableLookup(SSSE3ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + + static const CostTblEntry SSE2ShuffleTbl[] = { + { ISD::VECTOR_SHUFFLE, MVT::v2f64, 1 }, // shufpd + { ISD::VECTOR_SHUFFLE, MVT::v2i64, 1 }, // pshufd + { ISD::VECTOR_SHUFFLE, MVT::v4i32, 1 }, // pshufd + { ISD::VECTOR_SHUFFLE, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd + { ISD::VECTOR_SHUFFLE, MVT::v16i8, 9 } // 2*pshuflw + 2*pshufhw + // + 2*pshufd + 2*unpck + packus + }; + + if (ST->hasSSE2()) + if (const auto *Entry = + CostTableLookup(SSE2ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + + static const CostTblEntry SSE1ShuffleTbl[] = { + { ISD::VECTOR_SHUFFLE, MVT::v4f32, 1 }, // shufps + }; + + if (ST->hasSSE1()) + if (const auto *Entry = + CostTableLookup(SSE1ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; } if (Kind == TTI::SK_Alternate) { diff --git a/test/Analysis/CostModel/X86/shuffle-reverse.ll b/test/Analysis/CostModel/X86/shuffle-reverse.ll index 64ccdcedd5f..1b2ecb6bf0d 100644 --- a/test/Analysis/CostModel/X86/shuffle-reverse.ll +++ b/test/Analysis/CostModel/X86/shuffle-reverse.ll @@ -18,13 +18,15 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> ; SSE: cost of 2 {{.*}} %V256 = shufflevector - ; AVX: cost of 3 {{.*}} %V256 = shufflevector - ; AVX512: cost of 3 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> ; SSE: cost of 4 {{.*}} %V512 = shufflevector - ; AVX: cost of 6 {{.*}} %V512 = shufflevector - ; AVX512: cost of 3 {{.*}} %V512 = shufflevector + ; AVX1: cost of 4 {{.*}} %V512 = shufflevector + ; AVX2: cost of 2 {{.*}} %V512 = shufflevector + ; AVX512: cost of 1 {{.*}} %V512 = shufflevector %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> ret void @@ -38,13 +40,15 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> ; SSE: cost of 2 {{.*}} %V256 = shufflevector - ; AVX: cost of 3 {{.*}} %V256 = shufflevector - ; AVX512: cost of 3 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> ; SSE: cost of 4 {{.*}} %V512 = shufflevector - ; AVX: cost of 6 {{.*}} %V512 = shufflevector - ; AVX512: cost of 3 {{.*}} %V512 = shufflevector + ; AVX1: cost of 4 {{.*}} %V512 = shufflevector + ; AVX2: cost of 2 {{.*}} %V512 = shufflevector + ; AVX512: cost of 1 {{.*}} %V512 = shufflevector %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> ret void @@ -63,13 +67,15 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> ; SSE: cost of 2 {{.*}} %V256 = shufflevector - ; AVX: cost of 3 {{.*}} %V256 = shufflevector - ; AVX512: cost of 3 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> ; SSE: cost of 4 {{.*}} %V512 = shufflevector - ; AVX: cost of 6 {{.*}} %V512 = shufflevector - ; AVX512: cost of 3 {{.*}} %V512 = shufflevector + ; AVX1: cost of 4 {{.*}} %V512 = shufflevector + ; AVX2: cost of 2 {{.*}} %V512 = shufflevector + ; AVX512: cost of 1 {{.*}} %V512 = shufflevector %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> ret void @@ -88,13 +94,15 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> ; SSE: cost of 2 {{.*}} %V256 = shufflevector - ; AVX: cost of 3 {{.*}} %V256 = shufflevector - ; AVX512: cost of 3 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> ; SSE: cost of 4 {{.*}} %V512 = shufflevector - ; AVX: cost of 6 {{.*}} %V512 = shufflevector - ; AVX512: cost of 3 {{.*}} %V512 = shufflevector + ; AVX1: cost of 4 {{.*}} %V512 = shufflevector + ; AVX2: cost of 2 {{.*}} %V512 = shufflevector + ; AVX512: cost of 1 {{.*}} %V512 = shufflevector %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> ret void @@ -102,20 +110,28 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, ; CHECK-LABEL: 'test_vXi16' define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512) { - ; SSE: cost of 1 {{.*}} %V128 = shufflevector + ; SSE2: cost of 3 {{.*}} %V128 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector + ; SSE42: cost of 1 {{.*}} %V128 = shufflevector ; AVX: cost of 1 {{.*}} %V128 = shufflevector ; AVX512: cost of 1 {{.*}} %V128 = shufflevector %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> - ; SSE: cost of 2 {{.*}} %V256 = shufflevector - ; AVX: cost of 3 {{.*}} %V256 = shufflevector - ; AVX512: cost of 3 {{.*}} %V256 = shufflevector + ; SSE2: cost of 6 {{.*}} %V256 = shufflevector + ; SSSE3: cost of 2 {{.*}} %V256 = shufflevector + ; SSE42: cost of 2 {{.*}} %V256 = shufflevector + ; AVX1: cost of 4 {{.*}} %V256 = shufflevector + ; AVX2: cost of 2 {{.*}} %V256 = shufflevector + ; AVX512: cost of 2 {{.*}} %V256 = shufflevector %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> - ; SSE: cost of 4 {{.*}} %V512 = shufflevector - ; AVX: cost of 6 {{.*}} %V512 = shufflevector - ; AVX512F: cost of 6 {{.*}} %V512 = shufflevector - ; AVX512BW: cost of 3 {{.*}} %V512 = shufflevector + ; SSE2: cost of 12 {{.*}} %V512 = shufflevector + ; SSSE3: cost of 4 {{.*}} %V512 = shufflevector + ; SSE42: cost of 4 {{.*}} %V512 = shufflevector + ; AVX1: cost of 8 {{.*}} %V512 = shufflevector + ; AVX2: cost of 4 {{.*}} %V512 = shufflevector + ; AVX512F: cost of 4 {{.*}} %V512 = shufflevector + ; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> ret void @@ -123,20 +139,28 @@ define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src51 ; CHECK-LABEL: 'test_vXi8' define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) { - ; SSE: cost of 1 {{.*}} %V128 = shufflevector + ; SSE2: cost of 9 {{.*}} %V128 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector + ; SSE42: cost of 1 {{.*}} %V128 = shufflevector ; AVX: cost of 1 {{.*}} %V128 = shufflevector ; AVX512: cost of 1 {{.*}} %V128 = shufflevector %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> - ; SSE: cost of 2 {{.*}} %V256 = shufflevector - ; AVX: cost of 3 {{.*}} %V256 = shufflevector - ; AVX512: cost of 3 {{.*}} %V256 = shufflevector + ; SSE2: cost of 18 {{.*}} %V256 = shufflevector + ; SSSE3: cost of 2 {{.*}} %V256 = shufflevector + ; SSE42: cost of 2 {{.*}} %V256 = shufflevector + ; AVX1: cost of 4 {{.*}} %V256 = shufflevector + ; AVX2: cost of 2 {{.*}} %V256 = shufflevector + ; AVX512: cost of 2 {{.*}} %V256 = shufflevector %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> - ; SSE: cost of 4 {{.*}} %V512 = shufflevector - ; AVX: cost of 6 {{.*}} %V512 = shufflevector - ; AVX512F: cost of 6 {{.*}} %V512 = shufflevector - ; AVX512BW: cost of 3 {{.*}} %V512 = shufflevector + ; SSE2: cost of 36 {{.*}} %V512 = shufflevector + ; SSSE3: cost of 4 {{.*}} %V512 = shufflevector + ; SSE42: cost of 4 {{.*}} %V512 = shufflevector + ; AVX1: cost of 8 {{.*}} %V512 = shufflevector + ; AVX2: cost of 4 {{.*}} %V512 = shufflevector + ; AVX512F: cost of 4 {{.*}} %V512 = shufflevector + ; AVX512BW: cost of 6 {{.*}} %V512 = shufflevector %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ret void -- 2.50.0