return Cost;
}
+ static const CostTblEntry AVX512BWUniformConstCostTable[] = {
+ { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
+ { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
+ };
+
+ if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ ST->hasBWI()) {
+ if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry AVX512UniformConstCostTable[] = {
+ { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
+ { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
+ };
+
+ if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ ST->hasAVX512()) {
+ if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+ }
+
static const CostTblEntry AVX2UniformConstCostTable[] = {
{ ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
return LT.first * Entry->Cost;
}
+ static const CostTblEntry SSE2UniformConstCostTable[] = {
+ { ISD::SDIV, MVT::v16i16, 12 }, // pmulhw sequence
+ { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
+ { ISD::UDIV, MVT::v16i16, 12 }, // pmulhuw sequence
+ { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
+ { ISD::SDIV, MVT::v8i32, 38 }, // pmuludq sequence
+ { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
+ { ISD::UDIV, MVT::v8i32, 30 }, // pmuludq sequence
+ { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
+ };
+
+ if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ ST->hasSSE2()) {
+ // pmuldq sequence.
+ if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
+ return LT.first * 30;
+ if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
+ return LT.first * 15;
+
+ if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+ }
+
static const CostTblEntry AVX512BWCostTable[] = {
// Vectorizing division is a bad idea. See the SSE2 table for more comments.
{ ISD::SDIV, MVT::v64i8, 64*20 },
return LT.first * Entry->Cost;
}
- static const CostTblEntry
- SSE2UniformConstCostTable[] = {
- // Constant splats are cheaper for the following instructions.
- { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
- { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
- { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
- { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
- };
-
static const CostTblEntry
SSE2UniformCostTable[] = {
// Uniform splats are cheaper for the following instructions.
if (ST->hasSSE2() &&
((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
(Op2Info == TargetTransformInfo::OK_UniformValue))) {
- if (Op2Info == TargetTransformInfo::OK_UniformConstantValue) {
- // pmuldq sequence.
- if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
- return LT.first * 15;
- if (const auto *Entry =
- CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
- return LT.first * Entry->Cost;
- }
if (const auto *Entry =
CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
%V64i8 = udiv <64 x i8> undef, undef
ret i32 undef
-}\r
-\r
+}
+
; CHECK-LABEL: 'sdiv_uniformconst'
define i32 @sdiv_uniformconst() {
; CHECK: cost of 1 {{.*}} %I64 = sdiv
; SSE2: cost of 38 {{.*}} %V8i32 = sdiv
; SSSE3: cost of 38 {{.*}} %V8i32 = sdiv
; SSE42: cost of 30 {{.*}} %V8i32 = sdiv
- ; AVX1: cost of 160 {{.*}} %V8i32 = sdiv
+ ; AVX1: cost of 30 {{.*}} %V8i32 = sdiv
; AVX2: cost of 15 {{.*}} %V8i32 = sdiv
; AVX512: cost of 15 {{.*}} %V8i32 = sdiv
%V8i32 = sdiv <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
; SSE2: cost of 76 {{.*}} %V16i32 = sdiv
; SSSE3: cost of 76 {{.*}} %V16i32 = sdiv
; SSE42: cost of 60 {{.*}} %V16i32 = sdiv
- ; AVX1: cost of 320 {{.*}} %V16i32 = sdiv
+ ; AVX1: cost of 60 {{.*}} %V16i32 = sdiv
; AVX2: cost of 30 {{.*}} %V16i32 = sdiv
- ; AVX512F: cost of 48 {{.*}} %V16i32 = sdiv
- ; AVX512BW: cost of 320 {{.*}} %V16i32 = sdiv
+ ; AVX512: cost of 15 {{.*}} %V16i32 = sdiv
%V16i32 = sdiv <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
; CHECK: cost of 1 {{.*}} %I16 = sdiv
; AVX: cost of 6 {{.*}} %V8i16 = sdiv
%V8i16 = sdiv <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
; SSE: cost of 12 {{.*}} %V16i16 = sdiv
- ; AVX1: cost of 320 {{.*}} %V16i16 = sdiv
+ ; AVX1: cost of 12 {{.*}} %V16i16 = sdiv
; AVX2: cost of 6 {{.*}} %V16i16 = sdiv
; AVX512: cost of 6 {{.*}} %V16i16 = sdiv
%V16i16 = sdiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
; SSE: cost of 24 {{.*}} %V32i16 = sdiv
- ; AVX1: cost of 640 {{.*}} %V32i16 = sdiv
+ ; AVX1: cost of 24 {{.*}} %V32i16 = sdiv
; AVX2: cost of 12 {{.*}} %V32i16 = sdiv
; AVX512F: cost of 12 {{.*}} %V32i16 = sdiv
- ; AVX512BW: cost of 640 {{.*}} %V32i16 = sdiv
+ ; AVX512BW: cost of 6 {{.*}} %V32i16 = sdiv
%V32i16 = sdiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
; CHECK: cost of 1 {{.*}} %I8 = sdiv
%V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
ret i32 undef
-}\r
-\r
+}
+
; CHECK-LABEL: 'udiv_uniformconst'
define i32 @udiv_uniformconst() {
; CHECK: cost of 1 {{.*}} %I64 = udiv
; AVX: cost of 15 {{.*}} %V4i32 = udiv
%V4i32 = udiv <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7>
; SSE: cost of 30 {{.*}} %V8i32 = udiv
- ; AVX1: cost of 160 {{.*}} %V8i32 = udiv
+ ; AVX1: cost of 30 {{.*}} %V8i32 = udiv
; AVX2: cost of 15 {{.*}} %V8i32 = udiv
; AVX512: cost of 15 {{.*}} %V8i32 = udiv
%V8i32 = udiv <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
; SSE: cost of 60 {{.*}} %V16i32 = udiv
- ; AVX1: cost of 320 {{.*}} %V16i32 = udiv
+ ; AVX1: cost of 60 {{.*}} %V16i32 = udiv
; AVX2: cost of 30 {{.*}} %V16i32 = udiv
- ; AVX512F: cost of 48 {{.*}} %V16i32 = udiv
- ; AVX512BW: cost of 320 {{.*}} %V16i32 = udiv
+ ; AVX512: cost of 15 {{.*}} %V16i32 = udiv
%V16i32 = udiv <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
; CHECK: cost of 1 {{.*}} %I16 = udiv
; AVX: cost of 6 {{.*}} %V8i16 = udiv
%V8i16 = udiv <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
; SSE: cost of 12 {{.*}} %V16i16 = udiv
- ; AVX1: cost of 320 {{.*}} %V16i16 = udiv
+ ; AVX1: cost of 12 {{.*}} %V16i16 = udiv
; AVX2: cost of 6 {{.*}} %V16i16 = udiv
; AVX512: cost of 6 {{.*}} %V16i16 = udiv
%V16i16 = udiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
; SSE: cost of 24 {{.*}} %V32i16 = udiv
- ; AVX1: cost of 640 {{.*}} %V32i16 = udiv
+ ; AVX1: cost of 24 {{.*}} %V32i16 = udiv
; AVX2: cost of 12 {{.*}} %V32i16 = udiv
; AVX512F: cost of 12 {{.*}} %V32i16 = udiv
- ; AVX512BW: cost of 640 {{.*}} %V32i16 = udiv
+ ; AVX512BW: cost of 6 {{.*}} %V32i16 = udiv
%V32i16 = udiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
; CHECK: cost of 1 {{.*}} %I8 = udiv
%V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
ret i32 undef
-}\r
-\r
+}
+
; CHECK-LABEL: 'sdiv_uniformconstpow2'
define i32 @sdiv_uniformconstpow2() {
; CHECK: cost of 1 {{.*}} %I64 = sdiv
; SSE2: cost of 38 {{.*}} %V8i32 = sdiv
; SSSE3: cost of 38 {{.*}} %V8i32 = sdiv
; SSE42: cost of 30 {{.*}} %V8i32 = sdiv
- ; AVX1: cost of 160 {{.*}} %V8i32 = sdiv
+ ; AVX1: cost of 30 {{.*}} %V8i32 = sdiv
; AVX2: cost of 15 {{.*}} %V8i32 = sdiv
; AVX512: cost of 15 {{.*}} %V8i32 = sdiv
%V8i32 = sdiv <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
; SSE2: cost of 76 {{.*}} %V16i32 = sdiv
; SSSE3: cost of 76 {{.*}} %V16i32 = sdiv
; SSE42: cost of 60 {{.*}} %V16i32 = sdiv
- ; AVX1: cost of 320 {{.*}} %V16i32 = sdiv
+ ; AVX1: cost of 60 {{.*}} %V16i32 = sdiv
; AVX2: cost of 30 {{.*}} %V16i32 = sdiv
- ; AVX512F: cost of 48 {{.*}} %V16i32 = sdiv
- ; AVX512BW: cost of 320 {{.*}} %V16i32 = sdiv
+ ; AVX512: cost of 15 {{.*}} %V16i32 = sdiv
%V16i32 = sdiv <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
; CHECK: cost of 1 {{.*}} %I16 = sdiv
; AVX: cost of 6 {{.*}} %V8i16 = sdiv
%V8i16 = sdiv <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; SSE: cost of 12 {{.*}} %V16i16 = sdiv
- ; AVX1: cost of 320 {{.*}} %V16i16 = sdiv
+ ; AVX1: cost of 12 {{.*}} %V16i16 = sdiv
; AVX2: cost of 6 {{.*}} %V16i16 = sdiv
; AVX512: cost of 6 {{.*}} %V16i16 = sdiv
%V16i16 = sdiv <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; SSE: cost of 24 {{.*}} %V32i16 = sdiv
- ; AVX1: cost of 640 {{.*}} %V32i16 = sdiv
+ ; AVX1: cost of 24 {{.*}} %V32i16 = sdiv
; AVX2: cost of 12 {{.*}} %V32i16 = sdiv
; AVX512F: cost of 12 {{.*}} %V32i16 = sdiv
- ; AVX512BW: cost of 640 {{.*}} %V32i16 = sdiv
+ ; AVX512BW: cost of 6 {{.*}} %V32i16 = sdiv
%V32i16 = sdiv <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; CHECK: cost of 1 {{.*}} %I8 = sdiv
ret i32 undef
}
-\r
+
; CHECK-LABEL: 'udiv_uniformconstpow2'
define i32 @udiv_uniformconstpow2() {
; CHECK: cost of 1 {{.*}} %I64 = udiv
; AVX: cost of 15 {{.*}} %V4i32 = udiv
%V4i32 = udiv <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
; SSE: cost of 30 {{.*}} %V8i32 = udiv
- ; AVX1: cost of 160 {{.*}} %V8i32 = udiv
+ ; AVX1: cost of 30 {{.*}} %V8i32 = udiv
; AVX2: cost of 15 {{.*}} %V8i32 = udiv
; AVX512: cost of 15 {{.*}} %V8i32 = udiv
%V8i32 = udiv <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
; SSE: cost of 60 {{.*}} %V16i32 = udiv
- ; AVX1: cost of 320 {{.*}} %V16i32 = udiv
+ ; AVX1: cost of 60 {{.*}} %V16i32 = udiv
; AVX2: cost of 30 {{.*}} %V16i32 = udiv
- ; AVX512F: cost of 48 {{.*}} %V16i32 = udiv
- ; AVX512BW: cost of 320 {{.*}} %V16i32 = udiv
+ ; AVX512: cost of 15 {{.*}} %V16i32 = udiv
%V16i32 = udiv <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
; CHECK: cost of 1 {{.*}} %I16 = udiv
; AVX: cost of 6 {{.*}} %V8i16 = udiv
%V8i16 = udiv <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; SSE: cost of 12 {{.*}} %V16i16 = udiv
- ; AVX1: cost of 320 {{.*}} %V16i16 = udiv
+ ; AVX1: cost of 12 {{.*}} %V16i16 = udiv
; AVX2: cost of 6 {{.*}} %V16i16 = udiv
; AVX512: cost of 6 {{.*}} %V16i16 = udiv
%V16i16 = udiv <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; SSE: cost of 24 {{.*}} %V32i16 = udiv
- ; AVX1: cost of 640 {{.*}} %V32i16 = udiv
+ ; AVX1: cost of 24 {{.*}} %V32i16 = udiv
; AVX2: cost of 12 {{.*}} %V32i16 = udiv
; AVX512F: cost of 12 {{.*}} %V32i16 = udiv
- ; AVX512BW: cost of 640 {{.*}} %V32i16 = udiv
+ ; AVX512BW: cost of 6 {{.*}} %V32i16 = udiv
%V32i16 = udiv <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; CHECK: cost of 1 {{.*}} %I8 = udiv
%V64i8 = udiv <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
ret i32 undef
-}\r
+}