[CostModel][X86] Match 256-bit vector shift 'splat' costs for AVX2 and above

author Simon Pilgrim <llvm-dev@redking.me.uk>

Sat, 7 Jan 2017 21:47:10 +0000 (21:47 +0000)

committer Simon Pilgrim <llvm-dev@redking.me.uk>

Sat, 7 Jan 2017 21:47:10 +0000 (21:47 +0000)
author Simon Pilgrim <llvm-dev@redking.me.uk>
Sat, 7 Jan 2017 21:47:10 +0000 (21:47 +0000)
committer Simon Pilgrim <llvm-dev@redking.me.uk>
Sat, 7 Jan 2017 21:47:10 +0000 (21:47 +0000)
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp

index 29cd8edc46f726a4b32362db20cdf23bce89d3f3..829b47b7aa274fed3e2ec7c4fe8a3d8d35915ccd 100644 (file)
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -263,7 +263,7 @@ int X86TTIImpl::getArithmeticInstrCost(
      if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
        return LT.first * Entry->Cost;
  
-  static const CostTblEntry AVX2CostTable[] = {
+  static const CostTblEntry AVX2ShiftCostTable[] = {
      // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
      // customize them to detect the cases where shift amount is a scalar one.
      { ISD::SHL,     MVT::v4i32,    1 },
@@ -287,11 +287,11 @@ int X86TTIImpl::getArithmeticInstrCost(
        // is lowered into a vector multiply (vpmullw).
        return LT.first;
  
-    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
+    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
        return LT.first * Entry->Cost;
    }
  
-  static const CostTblEntry XOPCostTable[] = {
+  static const CostTblEntry XOPShiftCostTable[] = {
      // 128bit shifts take 1cy, but right shifts require negation beforehand.
      { ISD::SHL,     MVT::v16i8,    1 },
      { ISD::SRL,     MVT::v16i8,    2 },
@@ -322,48 +322,7 @@ int X86TTIImpl::getArithmeticInstrCost(
  
    // Look for XOP lowering tricks.
    if (ST->hasXOP())
-    if (const auto *Entry = CostTableLookup(XOPCostTable, ISD, LT.second))
-      return LT.first * Entry->Cost;
-
-  static const CostTblEntry AVX2CustomCostTable[] = {
-    { ISD::SHL,  MVT::v32i8,     11 }, // vpblendvb sequence.
-    { ISD::SHL,  MVT::v16i16,    10 }, // extend/vpsrlvd/pack sequence.
-
-    { ISD::SRL,  MVT::v32i8,     11 }, // vpblendvb sequence.
-    { ISD::SRL,  MVT::v16i16,    10 }, // extend/vpsrlvd/pack sequence.
-
-    { ISD::SRA,  MVT::v32i8,     24 }, // vpblendvb sequence.
-    { ISD::SRA,  MVT::v16i16,    10 }, // extend/vpsravd/pack sequence.
-    { ISD::SRA,  MVT::v2i64,      4 }, // srl/xor/sub sequence.
-    { ISD::SRA,  MVT::v4i64,      4 }, // srl/xor/sub sequence.
-
-    { ISD::SUB,  MVT::v32i8,      1 }, // psubb
-    { ISD::ADD,  MVT::v32i8,      1 }, // paddb
-    { ISD::SUB,  MVT::v16i16,     1 }, // psubw
-    { ISD::ADD,  MVT::v16i16,     1 }, // paddw
-    { ISD::SUB,  MVT::v8i32,      1 }, // psubd
-    { ISD::ADD,  MVT::v8i32,      1 }, // paddd
-    { ISD::SUB,  MVT::v4i64,      1 }, // psubq
-    { ISD::ADD,  MVT::v4i64,      1 }, // paddq
-
-    { ISD::MUL,  MVT::v32i8,     17 }, // extend/pmullw/trunc sequence.
-    { ISD::MUL,  MVT::v16i8,      7 }, // extend/pmullw/trunc sequence.
-    { ISD::MUL,  MVT::v16i16,     1 }, // pmullw
-    { ISD::MUL,  MVT::v8i32,      1 }, // pmulld
-    { ISD::MUL,  MVT::v4i64,      8 }, // 3*pmuludq/3*shift/2*add
-
-    { ISD::FDIV, MVT::f32,        7 }, // Haswell from http://www.agner.org/
-    { ISD::FDIV, MVT::v4f32,      7 }, // Haswell from http://www.agner.org/
-    { ISD::FDIV, MVT::v8f32,     14 }, // Haswell from http://www.agner.org/
-    { ISD::FDIV, MVT::f64,       14 }, // Haswell from http://www.agner.org/
-    { ISD::FDIV, MVT::v2f64,     14 }, // Haswell from http://www.agner.org/
-    { ISD::FDIV, MVT::v4f64,     28 }, // Haswell from http://www.agner.org/
-  };
-
-  // Look for AVX2 lowering tricks for custom cases.
-  if (ST->hasAVX2())
-    if (const auto *Entry = CostTableLookup(AVX2CustomCostTable, ISD,
-                                            LT.second))
+    if (const auto *Entry = CostTableLookup(XOPShiftCostTable, ISD, LT.second))
        return LT.first * Entry->Cost;
  
    static const CostTblEntry
@@ -415,6 +374,46 @@ int X86TTIImpl::getArithmeticInstrCost(
        ISD = ISD::MUL;
    }
  
+  static const CostTblEntry AVX2CostTable[] = {
+    { ISD::SHL,  MVT::v32i8,     11 }, // vpblendvb sequence.
+    { ISD::SHL,  MVT::v16i16,    10 }, // extend/vpsrlvd/pack sequence.
+
+    { ISD::SRL,  MVT::v32i8,     11 }, // vpblendvb sequence.
+    { ISD::SRL,  MVT::v16i16,    10 }, // extend/vpsrlvd/pack sequence.
+
+    { ISD::SRA,  MVT::v32i8,     24 }, // vpblendvb sequence.
+    { ISD::SRA,  MVT::v16i16,    10 }, // extend/vpsravd/pack sequence.
+    { ISD::SRA,  MVT::v2i64,      4 }, // srl/xor/sub sequence.
+    { ISD::SRA,  MVT::v4i64,      4 }, // srl/xor/sub sequence.
+
+    { ISD::SUB,  MVT::v32i8,      1 }, // psubb
+    { ISD::ADD,  MVT::v32i8,      1 }, // paddb
+    { ISD::SUB,  MVT::v16i16,     1 }, // psubw
+    { ISD::ADD,  MVT::v16i16,     1 }, // paddw
+    { ISD::SUB,  MVT::v8i32,      1 }, // psubd
+    { ISD::ADD,  MVT::v8i32,      1 }, // paddd
+    { ISD::SUB,  MVT::v4i64,      1 }, // psubq
+    { ISD::ADD,  MVT::v4i64,      1 }, // paddq
+
+    { ISD::MUL,  MVT::v32i8,     17 }, // extend/pmullw/trunc sequence.
+    { ISD::MUL,  MVT::v16i8,      7 }, // extend/pmullw/trunc sequence.
+    { ISD::MUL,  MVT::v16i16,     1 }, // pmullw
+    { ISD::MUL,  MVT::v8i32,      1 }, // pmulld
+    { ISD::MUL,  MVT::v4i64,      8 }, // 3*pmuludq/3*shift/2*add
+
+    { ISD::FDIV, MVT::f32,        7 }, // Haswell from http://www.agner.org/
+    { ISD::FDIV, MVT::v4f32,      7 }, // Haswell from http://www.agner.org/
+    { ISD::FDIV, MVT::v8f32,     14 }, // Haswell from http://www.agner.org/
+    { ISD::FDIV, MVT::f64,       14 }, // Haswell from http://www.agner.org/
+    { ISD::FDIV, MVT::v2f64,     14 }, // Haswell from http://www.agner.org/
+    { ISD::FDIV, MVT::v4f64,     28 }, // Haswell from http://www.agner.org/
+  };
+
+  // Look for AVX2 lowering tricks for custom cases.
+  if (ST->hasAVX2())
+    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
+      return LT.first * Entry->Cost;
+
    static const CostTblEntry AVX1CostTable[] = {
      // We don't have to scalarize unsupported ops. We can issue two half-sized
      // operations and we only need to extract the upper YMM half.
diff --git a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll

index a2a3d040cb702edd909cb8630e714aa4fe535f48..ab1eb730109ba047db1f55b945a94d691813ba8b 100644 (file)
--- a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
+++ b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
@@ -578,8 +578,8 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) {
  ; SSE2: Found an estimated cost of 2 for instruction:   %shift
  ; SSE41: Found an estimated cost of 2 for instruction:   %shift
  ; AVX: Found an estimated cost of 2 for instruction:   %shift
-; AVX2: Found an estimated cost of 10 for instruction:   %shift
-; AVX512: Found an estimated cost of 10 for instruction:   %shift
+; AVX2: Found an estimated cost of 2 for instruction:   %shift
+; AVX512: Found an estimated cost of 2 for instruction:   %shift
  ; XOP: Found an estimated cost of 4 for instruction:   %shift
    %shift = ashr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
    ret <16 x i16> %shift
@@ -590,8 +590,8 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) {
  ; SSE2: Found an estimated cost of 4 for instruction:   %shift
  ; SSE41: Found an estimated cost of 4 for instruction:   %shift
  ; AVX: Found an estimated cost of 4 for instruction:   %shift
-; AVX2: Found an estimated cost of 20 for instruction:   %shift
-; AVX512F: Found an estimated cost of 20 for instruction:   %shift
+; AVX2: Found an estimated cost of 4 for instruction:   %shift
+; AVX512F: Found an estimated cost of 4 for instruction:   %shift
  ; AVX512BW: Found an estimated cost of 1 for instruction:   %shift
  ; XOP: Found an estimated cost of 8 for instruction:   %shift
    %shift = ashr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -615,8 +615,8 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
  ; SSE2: Found an estimated cost of 8 for instruction:   %shift
  ; SSE41: Found an estimated cost of 8 for instruction:   %shift
  ; AVX: Found an estimated cost of 8 for instruction:   %shift
-; AVX2: Found an estimated cost of 24 for instruction:   %shift
-; AVX512: Found an estimated cost of 24 for instruction:   %shift
+; AVX2: Found an estimated cost of 8 for instruction:   %shift
+; AVX512: Found an estimated cost of 8 for instruction:   %shift
  ; XOP: Found an estimated cost of 4 for instruction:   %shift
    %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
    ret <32 x i8> %shift
@@ -627,8 +627,8 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
  ; SSE2: Found an estimated cost of 16 for instruction:   %shift
  ; SSE41: Found an estimated cost of 16 for instruction:   %shift
  ; AVX: Found an estimated cost of 16 for instruction:   %shift
-; AVX2: Found an estimated cost of 48 for instruction:   %shift
-; AVX512F: Found an estimated cost of 48 for instruction:   %shift
+; AVX2: Found an estimated cost of 16 for instruction:   %shift
+; AVX512F: Found an estimated cost of 16 for instruction:   %shift
  ; AVX512BW: Found an estimated cost of 2 for instruction:   %shift
  ; XOP: Found an estimated cost of 8 for instruction:   %shift
    %shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
diff --git a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll

index c4eaef0adc575b2c972ddffe2f70c45ba5b59c15..fea727147ff600879945e376848677835b300dfe 100644 (file)
--- a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
+++ b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
@@ -589,8 +589,8 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) {
  ; SSE2: Found an estimated cost of 2 for instruction:   %shift
  ; SSE41: Found an estimated cost of 2 for instruction:   %shift
  ; AVX: Found an estimated cost of 2 for instruction:   %shift
-; AVX2: Found an estimated cost of 10 for instruction:   %shift
-; AVX512: Found an estimated cost of 10 for instruction:   %shift
+; AVX2: Found an estimated cost of 2 for instruction:   %shift
+; AVX512: Found an estimated cost of 2 for instruction:   %shift
  ; XOP: Found an estimated cost of 4 for instruction:   %shift
    %shift = lshr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
    ret <16 x i16> %shift
@@ -601,8 +601,8 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) {
  ; SSE2: Found an estimated cost of 4 for instruction:   %shift
  ; SSE41: Found an estimated cost of 4 for instruction:   %shift
  ; AVX: Found an estimated cost of 4 for instruction:   %shift
-; AVX2: Found an estimated cost of 20 for instruction:   %shift
-; AVX512F: Found an estimated cost of 20 for instruction:   %shift
+; AVX2: Found an estimated cost of 4 for instruction:   %shift
+; AVX512F: Found an estimated cost of 4 for instruction:   %shift
  ; AVX512BW: Found an estimated cost of 1 for instruction:   %shift
  ; XOP: Found an estimated cost of 8 for instruction:   %shift
    %shift = lshr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -626,8 +626,8 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
  ; SSE2: Found an estimated cost of 2 for instruction:   %shift
  ; SSE41: Found an estimated cost of 2 for instruction:   %shift
  ; AVX: Found an estimated cost of 2 for instruction:   %shift
-; AVX2: Found an estimated cost of 11 for instruction:   %shift
-; AVX512: Found an estimated cost of 11 for instruction:   %shift
+; AVX2: Found an estimated cost of 2 for instruction:   %shift
+; AVX512: Found an estimated cost of 2 for instruction:   %shift
  ; XOP: Found an estimated cost of 4 for instruction:   %shift
    %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
    ret <32 x i8> %shift
@@ -638,8 +638,8 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
  ; SSE2: Found an estimated cost of 4 for instruction:   %shift
  ; SSE41: Found an estimated cost of 4 for instruction:   %shift
  ; AVX: Found an estimated cost of 4 for instruction:   %shift
-; AVX2: Found an estimated cost of 22 for instruction:   %shift
-; AVX512F: Found an estimated cost of 22 for instruction:   %shift
+; AVX2: Found an estimated cost of 4 for instruction:   %shift
+; AVX512F: Found an estimated cost of 4 for instruction:   %shift
  ; AVX512BW: Found an estimated cost of 2 for instruction:   %shift
  ; XOP: Found an estimated cost of 8 for instruction:   %shift
    %shift = lshr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
diff --git a/test/Analysis/CostModel/X86/vshift-shl-cost.ll b/test/Analysis/CostModel/X86/vshift-shl-cost.ll

index 5bf43219f8e060ebdf54d547712334bf5b3cf24a..7090ae4a35e1a9ccc80fa365e0ed46904d7ab23a 100644 (file)
--- a/test/Analysis/CostModel/X86/vshift-shl-cost.ll
+++ b/test/Analysis/CostModel/X86/vshift-shl-cost.ll
@@ -631,8 +631,8 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
  ; SSE2: Found an estimated cost of 2 for instruction:   %shift
  ; SSE41: Found an estimated cost of 2 for instruction:   %shift
  ; AVX: Found an estimated cost of 2 for instruction:   %shift
-; AVX2: Found an estimated cost of 11 for instruction:   %shift
-; AVX512: Found an estimated cost of 11 for instruction:   %shift
+; AVX2: Found an estimated cost of 2 for instruction:   %shift
+; AVX512: Found an estimated cost of 2 for instruction:   %shift
  ; XOP: Found an estimated cost of 2 for instruction:   %shift
    %shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
    ret <32 x i8> %shift
@@ -643,8 +643,8 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
  ; SSE2: Found an estimated cost of 4 for instruction:   %shift
  ; SSE41: Found an estimated cost of 4 for instruction:   %shift
  ; AVX: Found an estimated cost of 4 for instruction:   %shift
-; AVX2: Found an estimated cost of 22 for instruction:   %shift
-; AVX512F: Found an estimated cost of 22 for instruction:   %shift
+; AVX2: Found an estimated cost of 4 for instruction:   %shift
+; AVX512F: Found an estimated cost of 4 for instruction:   %shift
  ; AVX512BW: Found an estimated cost of 2 for instruction:   %shift
  ; XOP: Found an estimated cost of 4 for instruction:   %shift
    %shift = shl <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
author	Simon Pilgrim <llvm-dev@redking.me.uk>
	Sat, 7 Jan 2017 21:47:10 +0000 (21:47 +0000)
committer	Simon Pilgrim <llvm-dev@redking.me.uk>
	Sat, 7 Jan 2017 21:47:10 +0000 (21:47 +0000)
lib/Target/X86/X86TargetTransformInfo.cpp		patch \| blob \| history
test/Analysis/CostModel/X86/vshift-ashr-cost.ll		patch \| blob \| history
test/Analysis/CostModel/X86/vshift-lshr-cost.ll		patch \| blob \| history
test/Analysis/CostModel/X86/vshift-shl-cost.ll		patch \| blob \| history