return !(ST->isAtom());
}
+// Get estimation for interleaved load/store operations for AVX2.
+// \p Factor is the interleaved-access factor (stride) - number of
+// (interleaved) elements in the group.
+// \p Indices contains the indices for a strided load: when the
+// interleaved load has gaps they indicate which elements are used.
+// If Indices is empty (or if the number of indices is equal to the size
+// of the interleaved-access as given in \p Factor) the access has no gaps.
+//
+// As opposed to AVX-512, AVX2 does not have generic shuffles that allow
+// computing the cost using a generic formula as a function of generic
+// shuffles. We therefore use a lookup table instead, filled according to
+// the instruction sequences that codegen currently generates.
+int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace) {
+
+ // We currently support only fully-interleaved groups, with no gaps.
+ // TODO: Support also strided loads (interleaved-groups with gaps).
+ if (Indices.size() && Indices.size() != Factor)
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
+ // VecTy for interleave memop is <VF*Factor x Elt>.
+ // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
+ // VecTy = <12 x i32>.
+ MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
+
+ // This function can be called with VecTy=<6xi128>, Factor=3, in which case
+ // the VF=2, while v2i128 is an unsupported MVT vector type
+ // (see MachineValueType.h::getVectorVT()).
+ if (!LegalVT.isVector())
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
+ unsigned VF = VecTy->getVectorNumElements() / Factor;
+ Type *ScalarTy = VecTy->getVectorElementType();
+
+ // Calculate the number of memory operations (NumOfMemOps), required
+ // for load/store the VecTy.
+ unsigned VecTySize = DL.getTypeStoreSize(VecTy);
+ unsigned LegalVTSize = LegalVT.getStoreSize();
+ unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+
+ // Get the cost of one memory operation.
+ Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
+ LegalVT.getVectorNumElements());
+ unsigned MemOpCost =
+ getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
+
+ // The type of a single interleaved-group member, <VF x ScalarTy>; together
+ // with Factor this is the key the cost tables below are indexed by.
+ VectorType *VT = VectorType::get(ScalarTy, VF);
+ EVT ETy = TLI->getValueType(DL, VT);
+ if (!ETy.isSimple())
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
+ // TODO: Complete for other data-types and strides.
+ // Each combination of Stride, ElementTy and VF results in a different
+ // sequence; The cost tables are therefore accessed with:
+ // Factor (stride) and VectorType=VFxElemType.
+ // The Cost accounts only for the shuffle sequence;
+ // The cost of the loads/stores is accounted for separately.
+ //
+ static const CostTblEntry AVX2InterleavedLoadTbl[] = {
+ { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8
+ { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8
+ { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
+ { 3, MVT::v16i8, 18}, //(load 48i8 and) deinterleave into 3 x 16i8
+ { 3, MVT::v32i8, 42 }, //(load 96i8 and) deinterleave into 3 x 32i8
+
+ { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
+ { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
+ { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
+ { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
+ { 4, MVT::v32i8, 80 } //(load 128i8 and) deinterleave into 4 x 32i8
+ };
+
+ static const CostTblEntry AVX2InterleavedStoreTbl[] = {
+ { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store)
+ { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store)
+ { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)
+ { 3, MVT::v16i8, 17 }, //interleave 3 x 16i8 into 48i8 (and store)
+ { 3, MVT::v32i8, 32 }, //interleave 3 x 32i8 into 96i8 (and store)
+
+ { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store)
+ { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store)
+ { 4, MVT::v8i8, 16 }, //interleave 4 x 8i8 into 32i8 (and store)
+ { 4, MVT::v16i8, 20 }, //interleave 4 x 16i8 into 64i8 (and store)
+ { 4, MVT::v32i8, 40 } //interleave 4 x 32i8 into 128i8 (and store)
+ };
+
+ // Total cost = memory-operation cost + table shuffle-sequence cost; fall
+ // back to the generic implementation when the table has no matching entry.
+ if (Opcode == Instruction::Load) {
+ if (const auto *Entry =
+ CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
+ return NumOfMemOps * MemOpCost + Entry->Cost;
+ } else {
+ assert(Opcode == Instruction::Store &&
+ "Expected Store Instruction at this point")

+ if (const auto *Entry =
+ CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
+ return NumOfMemOps * MemOpCost + Entry->Cost;
+ }
+
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+}
+
// Get estimation for interleaved load/store operations and strided load.
// \p Indices contains indices for strided load.
// \p Factor - the factor of interleaving.
if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI()))
return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace);
+ if (ST->hasAVX2())
+ return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace);
}
int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
unsigned Factor, ArrayRef<unsigned> Indices,
unsigned Alignment, unsigned AddressSpace);
+ int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
+ unsigned Factor, ArrayRef<unsigned> Indices,
+ unsigned Alignment, unsigned AddressSpace);
int getIntImmCost(int64_t);
--- /dev/null
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -S -mcpu=core-avx2 --debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind readonly uwtable
+; Sums three consecutive i8 loads per iteration, advancing the pointer by 3 -
+; a fully-interleaved stride-3 load group. For VF >= 2 the checked cost is
+; NumOfMemOps * MemOpCost plus the Factor=3 AVX2 load-table shuffle cost
+; (e.g. VF 4: 1 load + 4 shuffles = 5).
+define i32 @doit_stride3(i8* nocapture readonly %Ptr, i32 %Nels) {
+;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i8
+;CHECK: LV: Found an estimated cost of 11 for VF 2 For instruction: %0 = load i8
+;CHECK: LV: Found an estimated cost of 5 for VF 4 For instruction: %0 = load i8
+;CHECK: LV: Found an estimated cost of 10 for VF 8 For instruction: %0 = load i8
+;CHECK: LV: Found an estimated cost of 20 for VF 16 For instruction: %0 = load i8
+;CHECK: LV: Found an estimated cost of 45 for VF 32 For instruction: %0 = load i8
+entry:
+ %cmp13 = icmp sgt i32 %Nels, 0
+ br i1 %cmp13, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+ br label %for.body
+
+for.body:
+ %Ptr.addr.016 = phi i8* [ %incdec.ptr2, %for.body ], [ %Ptr, %for.body.preheader ]
+ %i.015 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %s.014 = phi i32 [ %add6, %for.body ], [ 0, %for.body.preheader ]
+ %incdec.ptr = getelementptr inbounds i8, i8* %Ptr.addr.016, i64 1
+ %0 = load i8, i8* %Ptr.addr.016, align 1
+ %incdec.ptr1 = getelementptr inbounds i8, i8* %Ptr.addr.016, i64 2
+ %1 = load i8, i8* %incdec.ptr, align 1
+ %incdec.ptr2 = getelementptr inbounds i8, i8* %Ptr.addr.016, i64 3
+ %2 = load i8, i8* %incdec.ptr1, align 1
+ %conv = zext i8 %0 to i32
+ %conv3 = zext i8 %1 to i32
+ %conv4 = zext i8 %2 to i32
+ %add = add i32 %s.014, %conv
+ %add5 = add i32 %add, %conv3
+ %add6 = add i32 %add5, %conv4
+ %inc = add nuw nsw i32 %i.015, 1
+ %exitcond = icmp eq i32 %inc, %Nels
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ %add6.lcssa = phi i32 [ %add6, %for.body ]
+ br label %for.end
+
+for.end:
+ %s.0.lcssa = phi i32 [ 0, %entry ], [ %add6.lcssa, %for.end.loopexit ]
+ ret i32 %s.0.lcssa
+}
+
+; Function Attrs: norecurse nounwind readonly uwtable
+; Computes a running minimum over groups of four consecutive i8 loads,
+; advancing the pointer by 4 - a fully-interleaved stride-4 load group;
+; exercises the Factor=4 entries of the AVX2 interleaved-load cost table.
+define i32 @doit_stride4(i8* nocapture readonly %Ptr, i32 %Nels) local_unnamed_addr {
+;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i8
+;CHECK: LV: Found an estimated cost of 13 for VF 2 For instruction: %0 = load i8
+;CHECK: LV: Found an estimated cost of 5 for VF 4 For instruction: %0 = load i8
+;CHECK: LV: Found an estimated cost of 21 for VF 8 For instruction: %0 = load i8
+;CHECK: LV: Found an estimated cost of 41 for VF 16 For instruction: %0 = load i8
+;CHECK: LV: Found an estimated cost of 84 for VF 32 For instruction: %0 = load i8
+entry:
+ %cmp59 = icmp sgt i32 %Nels, 0
+ br i1 %cmp59, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %Ptr.addr.062 = phi i8* [ %incdec.ptr3, %for.body ], [ %Ptr, %for.body.preheader ]
+ %i.061 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %s.060 = phi i32 [ %cond39, %for.body ], [ 0, %for.body.preheader ]
+ %incdec.ptr = getelementptr inbounds i8, i8* %Ptr.addr.062, i64 1
+ %0 = load i8, i8* %Ptr.addr.062, align 1
+ %incdec.ptr1 = getelementptr inbounds i8, i8* %Ptr.addr.062, i64 2
+ %1 = load i8, i8* %incdec.ptr, align 1
+ %incdec.ptr2 = getelementptr inbounds i8, i8* %Ptr.addr.062, i64 3
+ %2 = load i8, i8* %incdec.ptr1, align 1
+ %incdec.ptr3 = getelementptr inbounds i8, i8* %Ptr.addr.062, i64 4
+ %3 = load i8, i8* %incdec.ptr2, align 1
+ %cmp5 = icmp ult i8 %0, %1
+ %.sink = select i1 %cmp5, i8 %0, i8 %1
+ %cmp12 = icmp ult i8 %.sink, %2
+ %.sink40 = select i1 %cmp12, i8 %.sink, i8 %2
+ %cmp23 = icmp ult i8 %.sink40, %3
+ %.sink41 = select i1 %cmp23, i8 %.sink40, i8 %3
+ %conv28 = zext i8 %.sink41 to i32
+ %cmp33 = icmp slt i32 %s.060, %conv28
+ %cond39 = select i1 %cmp33, i32 %s.060, i32 %conv28
+ %inc = add nuw nsw i32 %i.061, 1
+ %exitcond = icmp eq i32 %inc, %Nels
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ %cond39.lcssa = phi i32 [ %cond39, %for.body ]
+ br label %for.end
+
+for.end:
+ %s.0.lcssa = phi i32 [ 0, %entry ], [ %cond39.lcssa, %for.end.loopexit ]
+ ret i32 %s.0.lcssa
+}
--- /dev/null
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -S -mcpu=core-avx2 --debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind uwtable
+; Stores three i8 values per iteration at offsets 0..2, advancing the pointer
+; by 3 - a fully-interleaved stride-3 store group; exercises the Factor=3
+; entries of the AVX2 interleaved-store cost table.
+define void @doit_stride3(i8* nocapture %Ptr, i32 %Nels) local_unnamed_addr {
+;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %conv4
+;CHECK: LV: Found an estimated cost of 8 for VF 2 For instruction: store i8 %conv4
+;CHECK: LV: Found an estimated cost of 9 for VF 4 For instruction: store i8 %conv4
+;CHECK: LV: Found an estimated cost of 12 for VF 8 For instruction: store i8 %conv4
+;CHECK: LV: Found an estimated cost of 19 for VF 16 For instruction: store i8 %conv4
+;CHECK: LV: Found an estimated cost of 35 for VF 32 For instruction: store i8 %conv4
+entry:
+ %cmp14 = icmp sgt i32 %Nels, 0
+ br i1 %cmp14, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+ %conv = trunc i32 %Nels to i8
+ %conv1 = shl i8 %conv, 1
+ %conv4 = shl i8 %conv, 2
+ br label %for.body
+
+for.body:
+ %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %Ptr.addr.015 = phi i8* [ %Ptr, %for.body.lr.ph ], [ %incdec.ptr5, %for.body ]
+ %incdec.ptr = getelementptr inbounds i8, i8* %Ptr.addr.015, i64 1
+ store i8 %conv, i8* %Ptr.addr.015, align 1
+ %incdec.ptr2 = getelementptr inbounds i8, i8* %Ptr.addr.015, i64 2
+ store i8 %conv1, i8* %incdec.ptr, align 1
+ %incdec.ptr5 = getelementptr inbounds i8, i8* %Ptr.addr.015, i64 3
+ store i8 %conv4, i8* %incdec.ptr2, align 1
+ %inc = add nuw nsw i32 %i.016, 1
+ %exitcond = icmp eq i32 %inc, %Nels
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+; Function Attrs: norecurse nounwind uwtable
+; Stores four i8 values per iteration at offsets 0..3, advancing the pointer
+; by 4 - a fully-interleaved stride-4 store group; exercises the Factor=4
+; entries of the AVX2 interleaved-store cost table.
+define void @doit_stride4(i8* nocapture %Ptr, i32 %Nels) local_unnamed_addr {
+;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %conv7
+;CHECK: LV: Found an estimated cost of 13 for VF 2 For instruction: store i8 %conv7
+;CHECK: LV: Found an estimated cost of 10 for VF 4 For instruction: store i8 %conv7
+;CHECK: LV: Found an estimated cost of 17 for VF 8 For instruction: store i8 %conv7
+;CHECK: LV: Found an estimated cost of 22 for VF 16 For instruction: store i8 %conv7
+;CHECK: LV: Found an estimated cost of 44 for VF 32 For instruction: store i8 %conv7
+entry:
+ %cmp19 = icmp sgt i32 %Nels, 0
+ br i1 %cmp19, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+ %conv = trunc i32 %Nels to i8
+ %conv1 = shl i8 %conv, 1
+ %conv4 = shl i8 %conv, 2
+ %mul6 = mul nsw i32 %Nels, 5
+ %conv7 = trunc i32 %mul6 to i8
+ br label %for.body
+
+for.body:
+ %i.021 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %Ptr.addr.020 = phi i8* [ %Ptr, %for.body.lr.ph ], [ %incdec.ptr8, %for.body ]
+ %incdec.ptr = getelementptr inbounds i8, i8* %Ptr.addr.020, i64 1
+ store i8 %conv, i8* %Ptr.addr.020, align 1
+ %incdec.ptr2 = getelementptr inbounds i8, i8* %Ptr.addr.020, i64 2
+ store i8 %conv1, i8* %incdec.ptr, align 1
+ %incdec.ptr5 = getelementptr inbounds i8, i8* %Ptr.addr.020, i64 3
+ store i8 %conv4, i8* %incdec.ptr2, align 1
+ %incdec.ptr8 = getelementptr inbounds i8, i8* %Ptr.addr.020, i64 4
+ store i8 %conv7, i8* %incdec.ptr5, align 1
+ %inc = add nuw nsw i32 %i.021, 1
+ %exitcond = icmp eq i32 %inc, %Nels
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}