return LT.first * Entry->Cost;
}
+ static const CostTblEntry AVX512BWCostTable[] = {
+ // Vectorizing division is a bad idea. See the SSE2 table for more comments.
+ { ISD::SDIV, MVT::v64i8, 64*20 },
+ { ISD::SDIV, MVT::v32i16, 32*20 },
+ { ISD::SDIV, MVT::v16i32, 16*20 },
+ { ISD::SDIV, MVT::v8i64, 8*20 },
+ { ISD::UDIV, MVT::v64i8, 64*20 },
+ { ISD::UDIV, MVT::v32i16, 32*20 },
+ { ISD::UDIV, MVT::v16i32, 16*20 },
+ { ISD::UDIV, MVT::v8i64, 8*20 },
+ };
+
+ // Look for AVX512BW lowering tricks for custom cases.
+ if (ST->hasBWI()) {
+ if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+ }
+
static const CostTblEntry AVX512CostTable[] = {
{ ISD::SHL, MVT::v16i32, 1 },
{ ISD::SRL, MVT::v16i32, 1 },
{ ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
{ ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
{ ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
+ };
+
+ // Look for AVX2 lowering tricks for custom cases.
+ if (ST->hasAVX2()) {
+ if (const auto *Entry = CostTableLookup(AVX2CustomCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+ }
+ static const CostTblEntry AVXCustomCostTable[] = {
// Vectorizing division is a bad idea. See the SSE2 table for more comments.
{ ISD::SDIV, MVT::v32i8, 32*20 },
{ ISD::SDIV, MVT::v16i16, 16*20 },
};
// Look for AVX lowering tricks for custom cases.
- if (ST->hasAVX2()) {
- if (const auto *Entry = CostTableLookup(AVX2CustomCostTable, ISD,
+ if (ST->hasAVX()) {
+ if (const auto *Entry = CostTableLookup(AVXCustomCostTable, ISD,
LT.second))
return LT.first * Entry->Cost;
}
; CHECK: cost of 1 {{.*}} %I64 = sdiv
%I64 = sdiv i64 undef, undef
; SSE: cost of 40 {{.*}} %V2i64 = sdiv
- ; AVX1: cost of 40 {{.*}} %V2i64 = sdiv
- ; AVX2: cost of 40 {{.*}} %V2i64 = sdiv
- ; AVX512: cost of 40 {{.*}} %V2i64 = sdiv
+ ; AVX: cost of 40 {{.*}} %V2i64 = sdiv
%V2i64 = sdiv <2 x i64> undef, undef
; SSE: cost of 80 {{.*}} %V4i64 = sdiv
- ; AVX1: cost of 12 {{.*}} %V4i64 = sdiv
- ; AVX2: cost of 80 {{.*}} %V4i64 = sdiv
- ; AVX512: cost of 80 {{.*}} %V4i64 = sdiv
+ ; AVX: cost of 80 {{.*}} %V4i64 = sdiv
%V4i64 = sdiv <4 x i64> undef, undef
; SSE: cost of 160 {{.*}} %V8i64 = sdiv
- ; AVX1: cost of 24 {{.*}} %V8i64 = sdiv
- ; AVX2: cost of 160 {{.*}} %V8i64 = sdiv
- ; AVX512: cost of 24 {{.*}} %V8i64 = sdiv
+ ; AVX: cost of 160 {{.*}} %V8i64 = sdiv
%V8i64 = sdiv <8 x i64> undef, undef
; CHECK: cost of 1 {{.*}} %I32 = sdiv
%I32 = sdiv i32 undef, undef
; SSE: cost of 80 {{.*}} %V4i32 = sdiv
- ; AVX1: cost of 80 {{.*}} %V4i32 = sdiv
- ; AVX2: cost of 80 {{.*}} %V4i32 = sdiv
- ; AVX512: cost of 80 {{.*}} %V4i32 = sdiv
+ ; AVX: cost of 80 {{.*}} %V4i32 = sdiv
%V4i32 = sdiv <4 x i32> undef, undef
; SSE: cost of 160 {{.*}} %V8i32 = sdiv
- ; AVX1: cost of 24 {{.*}} %V8i32 = sdiv
- ; AVX2: cost of 160 {{.*}} %V8i32 = sdiv
- ; AVX512: cost of 160 {{.*}} %V8i32 = sdiv
+ ; AVX: cost of 160 {{.*}} %V8i32 = sdiv
%V8i32 = sdiv <8 x i32> undef, undef
; SSE: cost of 320 {{.*}} %V16i32 = sdiv
- ; AVX1: cost of 48 {{.*}} %V16i32 = sdiv
- ; AVX2: cost of 320 {{.*}} %V16i32 = sdiv
- ; AVX512: cost of 48 {{.*}} %V16i32 = sdiv
+ ; AVX: cost of 320 {{.*}} %V16i32 = sdiv
%V16i32 = sdiv <16 x i32> undef, undef
; CHECK: cost of 1 {{.*}} %I16 = sdiv
%I16 = sdiv i16 undef, undef
; SSE: cost of 160 {{.*}} %V8i16 = sdiv
- ; AVX1: cost of 160 {{.*}} %V8i16 = sdiv
- ; AVX2: cost of 160 {{.*}} %V8i16 = sdiv
- ; AVX512: cost of 160 {{.*}} %V8i16 = sdiv
+ ; AVX: cost of 160 {{.*}} %V8i16 = sdiv
%V8i16 = sdiv <8 x i16> undef, undef
; SSE: cost of 320 {{.*}} %V16i16 = sdiv
- ; AVX1: cost of 48 {{.*}} %V16i16 = sdiv
- ; AVX2: cost of 320 {{.*}} %V16i16 = sdiv
- ; AVX512: cost of 320 {{.*}} %V16i16 = sdiv
+ ; AVX: cost of 320 {{.*}} %V16i16 = sdiv
%V16i16 = sdiv <16 x i16> undef, undef
; SSE: cost of 640 {{.*}} %V32i16 = sdiv
- ; AVX1: cost of 96 {{.*}} %V32i16 = sdiv
- ; AVX2: cost of 640 {{.*}} %V32i16 = sdiv
- ; AVX512F: cost of 640 {{.*}} %V32i16 = sdiv
- ; AVX512BW: cost of 96 {{.*}} %V32i16 = sdiv
+ ; AVX: cost of 640 {{.*}} %V32i16 = sdiv
%V32i16 = sdiv <32 x i16> undef, undef
; CHECK: cost of 1 {{.*}} %I8 = sdiv
%I8 = sdiv i8 undef, undef
; SSE: cost of 320 {{.*}} %V16i8 = sdiv
- ; AVX1: cost of 320 {{.*}} %V16i8 = sdiv
- ; AVX2: cost of 320 {{.*}} %V16i8 = sdiv
- ; AVX512: cost of 320 {{.*}} %V16i8 = sdiv
+ ; AVX: cost of 320 {{.*}} %V16i8 = sdiv
%V16i8 = sdiv <16 x i8> undef, undef
; SSE: cost of 640 {{.*}} %V32i8 = sdiv
- ; AVX1: cost of 96 {{.*}} %V32i8 = sdiv
- ; AVX2: cost of 640 {{.*}} %V32i8 = sdiv
- ; AVX512: cost of 640 {{.*}} %V32i8 = sdiv
+ ; AVX: cost of 640 {{.*}} %V32i8 = sdiv
%V32i8 = sdiv <32 x i8> undef, undef
; SSE: cost of 1280 {{.*}} %V64i8 = sdiv
- ; AVX1: cost of 192 {{.*}} %V64i8 = sdiv
- ; AVX2: cost of 1280 {{.*}} %V64i8 = sdiv
- ; AVX512F: cost of 1280 {{.*}} %V64i8 = sdiv
- ; AVX512BW: cost of 192 {{.*}} %V64i8 = sdiv
+ ; AVX: cost of 1280 {{.*}} %V64i8 = sdiv
%V64i8 = sdiv <64 x i8> undef, undef
ret i32 undef
; CHECK: cost of 1 {{.*}} %I64 = udiv
%I64 = udiv i64 undef, undef
; SSE: cost of 40 {{.*}} %V2i64 = udiv
- ; AVX1: cost of 40 {{.*}} %V2i64 = udiv
- ; AVX2: cost of 40 {{.*}} %V2i64 = udiv
- ; AVX512: cost of 40 {{.*}} %V2i64 = udiv
+ ; AVX: cost of 40 {{.*}} %V2i64 = udiv
%V2i64 = udiv <2 x i64> undef, undef
; SSE: cost of 80 {{.*}} %V4i64 = udiv
- ; AVX1: cost of 12 {{.*}} %V4i64 = udiv
- ; AVX2: cost of 80 {{.*}} %V4i64 = udiv
- ; AVX512: cost of 80 {{.*}} %V4i64 = udiv
+ ; AVX: cost of 80 {{.*}} %V4i64 = udiv
%V4i64 = udiv <4 x i64> undef, undef
; SSE: cost of 160 {{.*}} %V8i64 = udiv
- ; AVX1: cost of 24 {{.*}} %V8i64 = udiv
- ; AVX2: cost of 160 {{.*}} %V8i64 = udiv
- ; AVX512: cost of 24 {{.*}} %V8i64 = udiv
+ ; AVX: cost of 160 {{.*}} %V8i64 = udiv
%V8i64 = udiv <8 x i64> undef, undef
; CHECK: cost of 1 {{.*}} %I32 = udiv
%I32 = udiv i32 undef, undef
; SSE: cost of 80 {{.*}} %V4i32 = udiv
- ; AVX1: cost of 80 {{.*}} %V4i32 = udiv
- ; AVX2: cost of 80 {{.*}} %V4i32 = udiv
- ; AVX512: cost of 80 {{.*}} %V4i32 = udiv
+ ; AVX: cost of 80 {{.*}} %V4i32 = udiv
%V4i32 = udiv <4 x i32> undef, undef
; SSE: cost of 160 {{.*}} %V8i32 = udiv
- ; AVX1: cost of 24 {{.*}} %V8i32 = udiv
- ; AVX2: cost of 160 {{.*}} %V8i32 = udiv
- ; AVX512: cost of 160 {{.*}} %V8i32 = udiv
+ ; AVX: cost of 160 {{.*}} %V8i32 = udiv
%V8i32 = udiv <8 x i32> undef, undef
; SSE: cost of 320 {{.*}} %V16i32 = udiv
- ; AVX1: cost of 48 {{.*}} %V16i32 = udiv
- ; AVX2: cost of 320 {{.*}} %V16i32 = udiv
- ; AVX512: cost of 48 {{.*}} %V16i32 = udiv
+ ; AVX: cost of 320 {{.*}} %V16i32 = udiv
%V16i32 = udiv <16 x i32> undef, undef
; CHECK: cost of 1 {{.*}} %I16 = udiv
%I16 = udiv i16 undef, undef
; SSE: cost of 160 {{.*}} %V8i16 = udiv
- ; AVX1: cost of 160 {{.*}} %V8i16 = udiv
- ; AVX2: cost of 160 {{.*}} %V8i16 = udiv
- ; AVX512: cost of 160 {{.*}} %V8i16 = udiv
+ ; AVX: cost of 160 {{.*}} %V8i16 = udiv
%V8i16 = udiv <8 x i16> undef, undef
; SSE: cost of 320 {{.*}} %V16i16 = udiv
- ; AVX1: cost of 48 {{.*}} %V16i16 = udiv
- ; AVX2: cost of 320 {{.*}} %V16i16 = udiv
- ; AVX512: cost of 320 {{.*}} %V16i16 = udiv
+ ; AVX: cost of 320 {{.*}} %V16i16 = udiv
%V16i16 = udiv <16 x i16> undef, undef
; SSE: cost of 640 {{.*}} %V32i16 = udiv
- ; AVX1: cost of 96 {{.*}} %V32i16 = udiv
- ; AVX2: cost of 640 {{.*}} %V32i16 = udiv
- ; AVX512F: cost of 640 {{.*}} %V32i16 = udiv
- ; AVX512BW: cost of 96 {{.*}} %V32i16 = udiv
+ ; AVX: cost of 640 {{.*}} %V32i16 = udiv
%V32i16 = udiv <32 x i16> undef, undef
; CHECK: cost of 1 {{.*}} %I8 = udiv
%I8 = udiv i8 undef, undef
; SSE: cost of 320 {{.*}} %V16i8 = udiv
- ; AVX1: cost of 320 {{.*}} %V16i8 = udiv
- ; AVX2: cost of 320 {{.*}} %V16i8 = udiv
- ; AVX512: cost of 320 {{.*}} %V16i8 = udiv
+ ; AVX: cost of 320 {{.*}} %V16i8 = udiv
%V16i8 = udiv <16 x i8> undef, undef
; SSE: cost of 640 {{.*}} %V32i8 = udiv
- ; AVX1: cost of 96 {{.*}} %V32i8 = udiv
- ; AVX2: cost of 640 {{.*}} %V32i8 = udiv
- ; AVX512: cost of 640 {{.*}} %V32i8 = udiv
+ ; AVX: cost of 640 {{.*}} %V32i8 = udiv
%V32i8 = udiv <32 x i8> undef, undef
; SSE: cost of 1280 {{.*}} %V64i8 = udiv
- ; AVX1: cost of 192 {{.*}} %V64i8 = udiv
- ; AVX2: cost of 1280 {{.*}} %V64i8 = udiv
- ; AVX512F: cost of 1280 {{.*}} %V64i8 = udiv
- ; AVX512BW: cost of 192 {{.*}} %V64i8 = udiv
+ ; AVX: cost of 1280 {{.*}} %V64i8 = udiv
%V64i8 = udiv <64 x i8> undef, undef
ret i32 undef