From: Simon Pilgrim Date: Thu, 10 Aug 2017 17:27:20 +0000 (+0000) Subject: [CostModel][X86] Improve single src shuffle costs X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=b0b94e11177540edaebe15faa682a3188fa46a01;p=llvm [CostModel][X86] Improve single src shuffle costs Add missing SK_PermuteSingleSrc costs for AVX2 targets and earlier. Also add some of the simpler SK_PermuteTwoSrc costs to support splitting of SK_PermuteSingleSrc shuffles. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@310632 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 82425189a0a..fcacef84a66 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -838,6 +838,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw { TTI::SK_Alternate, MVT::v32i8, 1 }, // vpblendvb + { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd + { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2 * vpshufb @@ -872,7 +874,16 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, { TTI::SK_Alternate, MVT::v8i32, 1 }, // vblendps { TTI::SK_Alternate, MVT::v8f32, 1 }, // vblendps { TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor - { TTI::SK_Alternate, MVT::v32i8, 3 } // vpand + vpandn + vpor + { TTI::SK_Alternate, MVT::v32i8, 3 }, // vpand + vpandn + vpor + + { TTI::SK_PermuteSingleSrc, MVT::v4f64, 3 }, // 2*vperm2f128 + vshufpd + { TTI::SK_PermuteSingleSrc, MVT::v4i64, 3 }, // 2*vperm2f128 + vshufpd + { TTI::SK_PermuteSingleSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps + { TTI::SK_PermuteSingleSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps + { 
TTI::SK_PermuteSingleSrc, MVT::v16i16, 8 }, // vextractf128 + 4*pshufb + // + 2*por + vinsertf128 + { TTI::SK_PermuteSingleSrc, MVT::v32i8, 8 }, // vextractf128 + 4*pshufb + // + 2*por + vinsertf128 }; if (ST->hasAVX()) @@ -899,11 +910,14 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, { TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb - { TTI::SK_Alternate, MVT::v8i16, 3 }, // pshufb + pshufb + por - { TTI::SK_Alternate, MVT::v16i8, 3 }, // pshufb + pshufb + por + { TTI::SK_Alternate, MVT::v8i16, 3 }, // 2*pshufb + por + { TTI::SK_Alternate, MVT::v16i8, 3 }, // 2*pshufb + por { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb - { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 } // pshufb + { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb + + { TTI::SK_PermuteTwoSrc, MVT::v8i16, 3 }, // 2*pshufb + por + { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 }, // 2*pshufb + por }; if (ST->hasSSSE3()) @@ -914,13 +928,13 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, { TTI::SK_Broadcast, MVT::v2f64, 1 }, // shufpd { TTI::SK_Broadcast, MVT::v2i64, 1 }, // pshufd { TTI::SK_Broadcast, MVT::v4i32, 1 }, // pshufd - { TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd + { TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd { TTI::SK_Broadcast, MVT::v16i8, 3 }, // unpck + pshuflw + pshufd { TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd { TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd { TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd - { TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd + { TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd { TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw // + 2*pshufd + 2*unpck + packus @@ -930,8 +944,17 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, { TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por { TTI::SK_Alternate, MVT::v16i8, 3 }, // pand + pandn + por - { 
TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd - { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 } // pshufd + { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // shufpd + { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd + { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // pshufd + { TTI::SK_PermuteSingleSrc, MVT::v8i16, 5 }, // 2*pshuflw + 2*pshufhw + // + pshufd/unpck + { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw + // + 2*pshufd + 2*unpck + 2*packus + + { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd + { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd + { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd} }; if (ST->hasSSE2()) @@ -939,9 +962,11 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, return LT.first * Entry->Cost; static const CostTblEntry SSE1ShuffleTbl[] = { - { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps - { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps - { TTI::SK_Alternate, MVT::v4f32, 2 } // 2*shufps + { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps + { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps + { TTI::SK_Alternate, MVT::v4f32, 2 }, // 2*shufps + { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps + { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps }; if (ST->hasSSE1()) diff --git a/test/Analysis/CostModel/X86/shuffle-single-src.ll b/test/Analysis/CostModel/X86/shuffle-single-src.ll index a4a0673bc3c..d63632d085e 100644 --- a/test/Analysis/CostModel/X86/shuffle-single-src.ll +++ b/test/Analysis/CostModel/X86/shuffle-single-src.ll @@ -13,33 +13,33 @@ ; CHECK-LABEL: 'test_vXf64' define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512, <16 x double> %src1024) { - ; SSE2: cost of 2 {{.*}} %V128 = shufflevector - ; SSSE3: cost of 2 {{.*}} %V128 = shufflevector - ; SSE42: cost of 2 {{.*}} %V128 = shufflevector - ; AVX1: cost of 2 {{.*}} %V128 = shufflevector - ; AVX2: cost of 2 {{.*}} %V128 = shufflevector + ; SSE2: cost of 1 {{.*}} 
%V128 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector + ; SSE42: cost of 1 {{.*}} %V128 = shufflevector + ; AVX1: cost of 1 {{.*}} %V128 = shufflevector + ; AVX2: cost of 1 {{.*}} %V128 = shufflevector ; AVX512: cost of 1 {{.*}} %V128 = shufflevector %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> - ; SSE2: cost of 4 {{.*}} %V256 = shufflevector - ; SSSE3: cost of 4 {{.*}} %V256 = shufflevector - ; SSE42: cost of 4 {{.*}} %V256 = shufflevector - ; AVX1: cost of 6 {{.*}} %V256 = shufflevector - ; AVX2: cost of 6 {{.*}} %V256 = shufflevector + ; SSE2: cost of 2 {{.*}} %V256 = shufflevector + ; SSSE3: cost of 2 {{.*}} %V256 = shufflevector + ; SSE42: cost of 2 {{.*}} %V256 = shufflevector + ; AVX1: cost of 3 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector ; AVX512: cost of 1 {{.*}} %V256 = shufflevector %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> - ; SSE2: cost of 24 {{.*}} %V512 = shufflevector - ; SSSE3: cost of 24 {{.*}} %V512 = shufflevector - ; SSE42: cost of 24 {{.*}} %V512 = shufflevector + ; SSE2: cost of 12 {{.*}} %V512 = shufflevector + ; SSSE3: cost of 12 {{.*}} %V512 = shufflevector + ; SSE42: cost of 12 {{.*}} %V512 = shufflevector ; AVX1: cost of 12 {{.*}} %V512 = shufflevector ; AVX2: cost of 12 {{.*}} %V512 = shufflevector ; AVX512: cost of 1 {{.*}} %V512 = shufflevector %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> - ; SSE2: cost of 112 {{.*}} %V1024 = shufflevector - ; SSSE3: cost of 112 {{.*}} %V1024 = shufflevector - ; SSE42: cost of 112 {{.*}} %V1024 = shufflevector + ; SSE2: cost of 56 {{.*}} %V1024 = shufflevector + ; SSSE3: cost of 56 {{.*}} %V1024 = shufflevector + ; SSE42: cost of 56 {{.*}} %V1024 = shufflevector ; AVX1: cost of 72 {{.*}} %V1024 = shufflevector ; AVX2: cost of 72 {{.*}} %V1024 = shufflevector ; AVX512: cost of 2 {{.*}} %V1024 = shufflevector @@ -59,17 +59,17 @@ define void @test_vXi64(<2 x i64> 
%src128, <4 x i64> %src256, <8 x i64> %src512) ; AVX512: cost of 1 {{.*}} %V128 = shufflevector %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> - ; SSE2: cost of 8 {{.*}} %V256 = shufflevector - ; SSSE3: cost of 8 {{.*}} %V256 = shufflevector - ; SSE42: cost of 8 {{.*}} %V256 = shufflevector - ; AVX1: cost of 8 {{.*}} %V256 = shufflevector + ; SSE2: cost of 2 {{.*}} %V256 = shufflevector + ; SSSE3: cost of 2 {{.*}} %V256 = shufflevector + ; SSE42: cost of 2 {{.*}} %V256 = shufflevector + ; AVX1: cost of 3 {{.*}} %V256 = shufflevector ; AVX2: cost of 1 {{.*}} %V256 = shufflevector ; AVX512: cost of 1 {{.*}} %V256 = shufflevector %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> - ; SSE2: cost of 48 {{.*}} %V512 = shufflevector - ; SSSE3: cost of 48 {{.*}} %V512 = shufflevector - ; SSE42: cost of 48 {{.*}} %V512 = shufflevector + ; SSE2: cost of 12 {{.*}} %V512 = shufflevector + ; SSSE3: cost of 12 {{.*}} %V512 = shufflevector + ; SSE42: cost of 12 {{.*}} %V512 = shufflevector ; AVX1: cost of 16 {{.*}} %V512 = shufflevector ; AVX2: cost of 16 {{.*}} %V512 = shufflevector ; AVX512: cost of 1 {{.*}} %V512 = shufflevector @@ -81,25 +81,25 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) ; CHECK-LABEL: 'test_vXf32' define void @test_vXf32(<4 x float> %src128, <8 x float> %src256, <16 x float> %src512) { - ; SSE2: cost of 6 {{.*}} %V128 = shufflevector - ; SSSE3: cost of 6 {{.*}} %V128 = shufflevector - ; SSE42: cost of 6 {{.*}} %V128 = shufflevector - ; AVX1: cost of 6 {{.*}} %V128 = shufflevector - ; AVX2: cost of 6 {{.*}} %V128 = shufflevector + ; SSE2: cost of 1 {{.*}} %V128 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector + ; SSE42: cost of 1 {{.*}} %V128 = shufflevector + ; AVX1: cost of 1 {{.*}} %V128 = shufflevector + ; AVX2: cost of 1 {{.*}} %V128 = shufflevector ; AVX512: cost of 1 {{.*}} %V128 = shufflevector %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 
x i32> - ; SSE2: cost of 12 {{.*}} %V256 = shufflevector - ; SSSE3: cost of 12 {{.*}} %V256 = shufflevector - ; SSE42: cost of 12 {{.*}} %V256 = shufflevector - ; AVX1: cost of 14 {{.*}} %V256 = shufflevector - ; AVX2: cost of 14 {{.*}} %V256 = shufflevector + ; SSE2: cost of 4 {{.*}} %V256 = shufflevector + ; SSSE3: cost of 4 {{.*}} %V256 = shufflevector + ; SSE42: cost of 4 {{.*}} %V256 = shufflevector + ; AVX1: cost of 4 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector ; AVX512: cost of 1 {{.*}} %V256 = shufflevector %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> - ; SSE2: cost of 72 {{.*}} %V512 = shufflevector - ; SSSE3: cost of 72 {{.*}} %V512 = shufflevector - ; SSE42: cost of 72 {{.*}} %V512 = shufflevector + ; SSE2: cost of 24 {{.*}} %V512 = shufflevector + ; SSSE3: cost of 24 {{.*}} %V512 = shufflevector + ; SSE42: cost of 24 {{.*}} %V512 = shufflevector ; AVX1: cost of 28 {{.*}} %V512 = shufflevector ; AVX2: cost of 28 {{.*}} %V512 = shufflevector ; AVX512: cost of 1 {{.*}} %V512 = shufflevector @@ -119,25 +119,25 @@ define void @test_vXi32(<4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512 ; AVX512: cost of 1 {{.*}} %V128 = shufflevector %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> - ; SSE2: cost of 16 {{.*}} %V256 = shufflevector - ; SSSE3: cost of 16 {{.*}} %V256 = shufflevector - ; SSE42: cost of 16 {{.*}} %V256 = shufflevector - ; AVX1: cost of 16 {{.*}} %V256 = shufflevector + ; SSE2: cost of 4 {{.*}} %V256 = shufflevector + ; SSSE3: cost of 4 {{.*}} %V256 = shufflevector + ; SSE42: cost of 4 {{.*}} %V256 = shufflevector + ; AVX1: cost of 4 {{.*}} %V256 = shufflevector ; AVX2: cost of 1 {{.*}} %V256 = shufflevector ; AVX512: cost of 1 {{.*}} %V256 = shufflevector %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> - ; SSE2: cost of 96 {{.*}} %V512 = shufflevector - ; SSSE3: cost of 96 {{.*}} %V512 = shufflevector - ; SSE42: cost of 96 {{.*}} %V512 = 
shufflevector + ; SSE2: cost of 24 {{.*}} %V512 = shufflevector + ; SSSE3: cost of 24 {{.*}} %V512 = shufflevector + ; SSE42: cost of 24 {{.*}} %V512 = shufflevector ; AVX1: cost of 32 {{.*}} %V512 = shufflevector ; AVX2: cost of 32 {{.*}} %V512 = shufflevector ; AVX512: cost of 1 {{.*}} %V512 = shufflevector %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> - ; SSE2: cost of 448 {{.*}} %V1024 = shufflevector - ; SSSE3: cost of 448 {{.*}} %V1024 = shufflevector - ; SSE42: cost of 448 {{.*}} %V1024 = shufflevector + ; SSE2: cost of 112 {{.*}} %V1024 = shufflevector + ; SSSE3: cost of 112 {{.*}} %V1024 = shufflevector + ; SSE42: cost of 112 {{.*}} %V1024 = shufflevector ; AVX1: cost of 192 {{.*}} %V1024 = shufflevector ; AVX2: cost of 192 {{.*}} %V1024 = shufflevector ; AVX512: cost of 2 {{.*}} %V1024 = shufflevector @@ -148,7 +148,7 @@ define void @test_vXi32(<4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512 ; CHECK-LABEL: 'test_vXi16' define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512, <64 x i16> %src1024) { - ; SSE2: cost of 16 {{.*}} %V128 = shufflevector + ; SSE2: cost of 5 {{.*}} %V128 = shufflevector ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector ; SSE42: cost of 1 {{.*}} %V128 = shufflevector ; AVX1: cost of 1 {{.*}} %V128 = shufflevector @@ -158,17 +158,17 @@ define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src51 %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> ; SSE2: cost of 32 {{.*}} %V256 = shufflevector - ; SSSE3: cost of 32 {{.*}} %V256 = shufflevector - ; SSE42: cost of 32 {{.*}} %V256 = shufflevector - ; AVX1: cost of 32 {{.*}} %V256 = shufflevector + ; SSSE3: cost of 6 {{.*}} %V256 = shufflevector + ; SSE42: cost of 6 {{.*}} %V256 = shufflevector + ; AVX1: cost of 8 {{.*}} %V256 = shufflevector ; AVX2: cost of 4 {{.*}} %V256 = shufflevector ; AVX512F: cost of 4 {{.*}} %V256 = shufflevector ; AVX512BW cost of 1 {{.*}} %V256 = shufflevector %V256 = 
shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> ; SSE2: cost of 192 {{.*}} %V512 = shufflevector - ; SSSE3: cost of 192 {{.*}} %V512 = shufflevector - ; SSE42: cost of 192 {{.*}} %V512 = shufflevector + ; SSSE3: cost of 36 {{.*}} %V512 = shufflevector + ; SSE42: cost of 36 {{.*}} %V512 = shufflevector ; AVX1: cost of 64 {{.*}} %V512 = shufflevector ; AVX2: cost of 64 {{.*}} %V512 = shufflevector ; AVX512F: cost of 64 {{.*}} %V512 = shufflevector @@ -176,8 +176,8 @@ define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src51 %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> ; SSE2: cost of 896 {{.*}} %V1024 = shufflevector - ; SSSE3: cost of 896 {{.*}} %V1024 = shufflevector - ; SSE42: cost of 896 {{.*}} %V1024 = shufflevector + ; SSSE3: cost of 168 {{.*}} %V1024 = shufflevector + ; SSE42: cost of 168 {{.*}} %V1024 = shufflevector ; AVX1: cost of 384 {{.*}} %V1024 = shufflevector ; AVX2: cost of 384 {{.*}} %V1024 = shufflevector ; AVX512F: cost of 384 {{.*}} %V1024 = shufflevector @@ -188,7 +188,7 @@ define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src51 ; CHECK-LABEL: 'test_vXi8' define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) { - ; SSE2: cost of 32 {{.*}} %V128 = shufflevector + ; SSE2: cost of 10 {{.*}} %V128 = shufflevector ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector ; SSE42: cost of 1 {{.*}} %V128 = shufflevector ; AVX1: cost of 1 {{.*}} %V128 = shufflevector @@ -197,17 +197,17 @@ define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> ; SSE2: cost of 64 {{.*}} %V256 = shufflevector - ; SSSE3: cost of 64 {{.*}} %V256 = shufflevector - ; SSE42: cost of 64 {{.*}} %V256 = shufflevector - ; AVX1: cost of 64 {{.*}} %V256 = shufflevector + ; SSSE3: cost of 6 {{.*}} %V256 = shufflevector + ; SSE42: cost of 6 {{.*}} %V256 = shufflevector + ; AVX1: cost of 8 
{{.*}} %V256 = shufflevector ; AVX2: cost of 4 {{.*}} %V256 = shufflevector ; AVX512F: cost of 4 {{.*}} %V256 = shufflevector ; AVX512BW: cost of 3 {{.*}} %V256 = shufflevector %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> ; SSE2: cost of 384 {{.*}} %V512 = shufflevector - ; SSSE3: cost of 384 {{.*}} %V512 = shufflevector - ; SSE42: cost of 384 {{.*}} %V512 = shufflevector + ; SSSE3: cost of 36 {{.*}} %V512 = shufflevector + ; SSE42: cost of 36 {{.*}} %V512 = shufflevector ; AVX1: cost of 128 {{.*}} %V512 = shufflevector ; AVX2: cost of 128 {{.*}} %V512 = shufflevector ; AVX512F: cost of 128 {{.*}} %V512 = shufflevector