return !(ST->isAtom());
}
+// Get estimation for interleaved load/store operations for AVX2.
+// \p Factor is the interleaved-access factor (stride) - number of
+// (interleaved) elements in the group.
+// \p Indices contains the indices for a strided load: when the
+// interleaved load has gaps they indicate which elements are used.
+// If Indices is empty (or if the number of indices is equal to the size
+// of the interleaved-access as given in \p Factor) the access has no gaps.
+//
+// As opposed to AVX-512, AVX2 does not have generic shuffles that allow
+// computing the cost using a generic formula as a function of generic
+// shuffles. We therefore use a lookup table instead, filled according to
+// the instruction sequences that codegen currently generates.
+int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace) {
+
+ // We currently support only fully-interleaved groups, with no gaps.
+ // TODO: Support also strided loads (interleaved-groups with gaps).
+ if (Indices.size() && Indices.size() != Factor)
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
+ // VecTy for interleave memop is <VF*Factor x Elt>.
+ // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
+ // VecTy = <12 x i32>.
+ MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
+
+ // This function can be called with VecTy=<6xi128>, Factor=3, in which case
+ // the VF=2, while v2i128 is an unsupported MVT vector type
+ // (see MachineValueType.h::getVectorVT()).
+ if (!LegalVT.isVector())
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
+ unsigned VF = VecTy->getVectorNumElements() / Factor;
+ Type *ScalarTy = VecTy->getVectorElementType();
+
+ // Calculate the number of memory operations (NumOfMemOps), required
+ // for load/store the VecTy.
+ unsigned VecTySize = DL.getTypeStoreSize(VecTy);
+ unsigned LegalVTSize = LegalVT.getStoreSize();
+ unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+
+ // Get the cost of one memory operation.
+ Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
+ LegalVT.getVectorNumElements());
+ unsigned MemOpCost =
+ getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
+
+ // The type of a single interleaved-group member, <VF x ScalarTy>; together
+ // with Factor this is the key the cost tables below are indexed by.
+ VectorType *VT = VectorType::get(ScalarTy, VF);
+ EVT ETy = TLI->getValueType(DL, VT);
+ if (!ETy.isSimple())
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
+ // TODO: Complete for other data-types and strides.
+ // Each combination of Stride, ElementTy and VF results in a different
+ // sequence; The cost tables are therefore accessed with:
+ // Factor (stride) and VectorType=VFxElemType.
+ // The Cost accounts only for the shuffle sequence;
+ // The cost of the loads/stores is accounted for separately.
+ //
+ static const CostTblEntry AVX2InterleavedLoadTbl[] = {
+ { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8
+ { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8
+ { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
+ { 3, MVT::v16i8, 18}, //(load 48i8 and) deinterleave into 3 x 16i8
+ { 3, MVT::v32i8, 42 }, //(load 96i8 and) deinterleave into 3 x 32i8
+
+ { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
+ { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
+ { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
+ { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
+ { 4, MVT::v32i8, 80 } //(load 128i8 and) deinterleave into 4 x 32i8
+ };
+
+ static const CostTblEntry AVX2InterleavedStoreTbl[] = {
+ { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store)
+ { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store)
+ { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)
+ { 3, MVT::v16i8, 17 }, //interleave 3 x 16i8 into 48i8 (and store)
+ { 3, MVT::v32i8, 32 }, //interleave 3 x 32i8 into 96i8 (and store)
+
+ { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store)
+ { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store)
+ { 4, MVT::v8i8, 16 }, //interleave 4 x 8i8 into 32i8 (and store)
+ { 4, MVT::v16i8, 20 }, //interleave 4 x 16i8 into 64i8 (and store)
+ { 4, MVT::v32i8, 40 } //interleave 4 x 32i8 into 128i8 (and store)
+ };
+
+ // Total cost = memory-operation cost + table shuffle-sequence cost; fall
+ // back to the generic implementation when the table has no matching entry.
+ if (Opcode == Instruction::Load) {
+ if (const auto *Entry =
+ CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
+ return NumOfMemOps * MemOpCost + Entry->Cost;
+ } else {
+ assert(Opcode == Instruction::Store &&
+ "Expected Store Instruction at this point")

+ if (const auto *Entry =
+ CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
+ return NumOfMemOps * MemOpCost + Entry->Cost;
+ }
+
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+}
+
// Get estimation for interleaved load/store operations and strided load.
// \p Indices contains indices for strided load.
// \p Factor - the factor of interleaving.
if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI()))
return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace);
+ if (ST->hasAVX2())
+ return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace);
}
int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
unsigned Factor, ArrayRef<unsigned> Indices,
unsigned Alignment, unsigned AddressSpace);
+ int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
+ unsigned Factor, ArrayRef<unsigned> Indices,
+ unsigned Alignment, unsigned AddressSpace);
int getIntImmCost(int64_t);
--- /dev/null
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -S -mcpu=core-avx2 --debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind readonly uwtable
+; Sums three consecutive i8 loads per iteration, advancing the pointer by 3 -
+; a fully-interleaved stride-3 load group. For VF >= 2 the checked cost is
+; NumOfMemOps * MemOpCost plus the Factor=3 AVX2 load-table shuffle cost
+; (e.g. VF 4: 1 load + 4 shuffles = 5).
+define i32 @doit_stride3(i8* nocapture readonly %Ptr, i32 %Nels) {
+;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i8
+;CHECK: LV: Found an estimated cost of 11 for VF 2 For instruction: %0 = load i8
+;CHECK: LV: Found an estimated cost of 5 for VF 4 For instruction: %0 = load i8
+;CHECK: LV: Found an estimated cost of 10 for VF 8 For instruction: %0 = load i8
+;CHECK: LV: Found an estimated cost of 20 for VF 16 For instruction: %0 = load i8
+;CHECK: LV: Found an estimated cost of 45 for VF 32 For instruction: %0 = load i8
+entry:
+ %cmp13 = icmp sgt i32 %Nels, 0
+ br i1 %cmp13, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+ br label %for.body
+
+for.body:
+ %Ptr.addr.016 = phi i8* [ %incdec.ptr2, %for.body ], [ %Ptr, %for.body.preheader ]
+ %i.015 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %s.014 = phi i32 [ %add6, %for.body ], [ 0, %for.body.preheader ]
+ %incdec.ptr = getelementptr inbounds i8, i8* %Ptr.addr.016, i64 1
+ %0 = load i8, i8* %Ptr.addr.016, align 1
+ %incdec.ptr1 = getelementptr inbounds i8, i8* %Ptr.addr.016, i64 2
+ %1 = load i8, i8* %incdec.ptr, align 1
+ %incdec.ptr2 = getelementptr inbounds i8, i8* %Ptr.addr.016, i64 3
+ %2 = load i8, i8* %incdec.ptr1, align 1
+ %conv = zext i8 %0 to i32
+ %conv3 = zext i8 %1 to i32
+ %conv4 = zext i8 %2 to i32
+ %add = add i32 %s.014, %conv
+ %add5 = add i32 %add, %conv3
+ %add6 = add i32 %add5, %conv4
+ %inc = add nuw nsw i32 %i.015, 1
+ %exitcond = icmp eq i32 %inc, %Nels
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ %add6.lcssa = phi i32 [ %add6, %for.body ]
+ br label %for.end
+
+for.end:
+ %s.0.lcssa = phi i32 [ 0, %entry ], [ %add6.lcssa, %for.end.loopexit ]
+ ret i32 %s.0.lcssa
+}
+
+; Function Attrs: norecurse nounwind readonly uwtable
+; Computes a running minimum over groups of four consecutive i8 loads,
+; advancing the pointer by 4 - a fully-interleaved stride-4 load group;
+; exercises the Factor=4 entries of the AVX2 interleaved-load cost table.
+define i32 @doit_stride4(i8* nocapture readonly %Ptr, i32 %Nels) local_unnamed_addr {
+;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i8
+;CHECK: LV: Found an estimated cost of 13 for VF 2 For instruction: %0 = load i8
+;CHECK: LV: Found an estimated cost of 5 for VF 4 For instruction: %0 = load i8
+;CHECK: LV: Found an estimated cost of 21 for VF 8 For instruction: %0 = load i8
+;CHECK: LV: Found an estimated cost of 41 for VF 16 For instruction: %0 = load i8
+;CHECK: LV: Found an estimated cost of 84 for VF 32 For instruction: %0 = load i8
+entry:
+ %cmp59 = icmp sgt i32 %Nels, 0
+ br i1 %cmp59, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %Ptr.addr.062 = phi i8* [ %incdec.ptr3, %for.body ], [ %Ptr, %for.body.preheader ]
+ %i.061 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %s.060 = phi i32 [ %cond39, %for.body ], [ 0, %for.body.preheader ]
+ %incdec.ptr = getelementptr inbounds i8, i8* %Ptr.addr.062, i64 1
+ %0 = load i8, i8* %Ptr.addr.062, align 1
+ %incdec.ptr1 = getelementptr inbounds i8, i8* %Ptr.addr.062, i64 2
+ %1 = load i8, i8* %incdec.ptr, align 1
+ %incdec.ptr2 = getelementptr inbounds i8, i8* %Ptr.addr.062, i64 3
+ %2 = load i8, i8* %incdec.ptr1, align 1
+ %incdec.ptr3 = getelementptr inbounds i8, i8* %Ptr.addr.062, i64 4
+ %3 = load i8, i8* %incdec.ptr2, align 1
+ %cmp5 = icmp ult i8 %0, %1
+ %.sink = select i1 %cmp5, i8 %0, i8 %1
+ %cmp12 = icmp ult i8 %.sink, %2
+ %.sink40 = select i1 %cmp12, i8 %.sink, i8 %2
+ %cmp23 = icmp ult i8 %.sink40, %3
+ %.sink41 = select i1 %cmp23, i8 %.sink40, i8 %3
+ %conv28 = zext i8 %.sink41 to i32
+ %cmp33 = icmp slt i32 %s.060, %conv28
+ %cond39 = select i1 %cmp33, i32 %s.060, i32 %conv28
+ %inc = add nuw nsw i32 %i.061, 1
+ %exitcond = icmp eq i32 %inc, %Nels
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ %cond39.lcssa = phi i32 [ %cond39, %for.body ]
+ br label %for.end
+
+for.end:
+ %s.0.lcssa = phi i32 [ 0, %entry ], [ %cond39.lcssa, %for.end.loopexit ]
+ ret i32 %s.0.lcssa
+}
--- /dev/null
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -S -mcpu=core-avx2 --debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind uwtable
+; Stores three i8 values per iteration at offsets 0..2, advancing the pointer
+; by 3 - a fully-interleaved stride-3 store group; exercises the Factor=3
+; entries of the AVX2 interleaved-store cost table.
+define void @doit_stride3(i8* nocapture %Ptr, i32 %Nels) local_unnamed_addr {
+;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %conv4
+;CHECK: LV: Found an estimated cost of 8 for VF 2 For instruction: store i8 %conv4
+;CHECK: LV: Found an estimated cost of 9 for VF 4 For instruction: store i8 %conv4
+;CHECK: LV: Found an estimated cost of 12 for VF 8 For instruction: store i8 %conv4
+;CHECK: LV: Found an estimated cost of 19 for VF 16 For instruction: store i8 %conv4
+;CHECK: LV: Found an estimated cost of 35 for VF 32 For instruction: store i8 %conv4
+entry:
+ %cmp14 = icmp sgt i32 %Nels, 0
+ br i1 %cmp14, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+ %conv = trunc i32 %Nels to i8
+ %conv1 = shl i8 %conv, 1
+ %conv4 = shl i8 %conv, 2
+ br label %for.body
+
+for.body:
+ %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %Ptr.addr.015 = phi i8* [ %Ptr, %for.body.lr.ph ], [ %incdec.ptr5, %for.body ]
+ %incdec.ptr = getelementptr inbounds i8, i8* %Ptr.addr.015, i64 1
+ store i8 %conv, i8* %Ptr.addr.015, align 1
+ %incdec.ptr2 = getelementptr inbounds i8, i8* %Ptr.addr.015, i64 2
+ store i8 %conv1, i8* %incdec.ptr, align 1
+ %incdec.ptr5 = getelementptr inbounds i8, i8* %Ptr.addr.015, i64 3
+ store i8 %conv4, i8* %incdec.ptr2, align 1
+ %inc = add nuw nsw i32 %i.016, 1
+ %exitcond = icmp eq i32 %inc, %Nels
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+; Function Attrs: norecurse nounwind uwtable
+; Stores four i8 values per iteration at offsets 0..3, advancing the pointer
+; by 4 - a fully-interleaved stride-4 store group; exercises the Factor=4
+; entries of the AVX2 interleaved-store cost table.
+define void @doit_stride4(i8* nocapture %Ptr, i32 %Nels) local_unnamed_addr {
+;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %conv7
+;CHECK: LV: Found an estimated cost of 13 for VF 2 For instruction: store i8 %conv7
+;CHECK: LV: Found an estimated cost of 10 for VF 4 For instruction: store i8 %conv7
+;CHECK: LV: Found an estimated cost of 17 for VF 8 For instruction: store i8 %conv7
+;CHECK: LV: Found an estimated cost of 22 for VF 16 For instruction: store i8 %conv7
+;CHECK: LV: Found an estimated cost of 44 for VF 32 For instruction: store i8 %conv7
+entry:
+ %cmp19 = icmp sgt i32 %Nels, 0
+ br i1 %cmp19, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+ %conv = trunc i32 %Nels to i8
+ %conv1 = shl i8 %conv, 1
+ %conv4 = shl i8 %conv, 2
+ %mul6 = mul nsw i32 %Nels, 5
+ %conv7 = trunc i32 %mul6 to i8
+ br label %for.body
+
+for.body:
+ %i.021 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %Ptr.addr.020 = phi i8* [ %Ptr, %for.body.lr.ph ], [ %incdec.ptr8, %for.body ]
+ %incdec.ptr = getelementptr inbounds i8, i8* %Ptr.addr.020, i64 1
+ store i8 %conv, i8* %Ptr.addr.020, align 1
+ %incdec.ptr2 = getelementptr inbounds i8, i8* %Ptr.addr.020, i64 2
+ store i8 %conv1, i8* %incdec.ptr, align 1
+ %incdec.ptr5 = getelementptr inbounds i8, i8* %Ptr.addr.020, i64 3
+ store i8 %conv4, i8* %incdec.ptr2, align 1
+ %incdec.ptr8 = getelementptr inbounds i8, i8* %Ptr.addr.020, i64 4
+ store i8 %conv7, i8* %incdec.ptr5, align 1
+ %inc = add nuw nsw i32 %i.021, 1
+ %exitcond = icmp eq i32 %inc, %Nels
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}