From 13da61c8c413bbf0c69f2e4f6f3c2472b7cc2ecd Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 14 Oct 2019 14:07:43 +0000 Subject: [PATCH] [CostModel][X86] Add CTPOP scalar costs (PR43656) Add specific scalar costs for ctpop instructions; these are based on llvm-mca's SLM throughput numbers (the oldest model we have). For targets supporting POPCNT, we provide overrides that assume 1-cycle costs. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@374775 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86TargetTransformInfo.cpp | 23 ++++++++ test/Analysis/CostModel/X86/ctpop.ll | 8 +-- test/Transforms/SLPVectorizer/X86/ctpop.ll | 68 +++++++++++++++------- 3 files changed, 73 insertions(+), 26 deletions(-) diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index dc58c98118f..2f4b55e8aaa 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2103,8 +2103,17 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/ { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/ }; + static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets + { ISD::CTPOP, MVT::i64, 1 }, + }; + static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets + { ISD::CTPOP, MVT::i32, 1 }, + { ISD::CTPOP, MVT::i16, 1 }, + { ISD::CTPOP, MVT::i8, 1 }, + }; static const CostTblEntry X64CostTbl[] = { // 64-bit targets { ISD::BITREVERSE, MVT::i64, 14 }, + { ISD::CTPOP, MVT::i64, 10 }, { ISD::SADDO, MVT::i64, 1 }, { ISD::UADDO, MVT::i64, 1 }, }; @@ -2112,6 +2121,9 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::BITREVERSE, MVT::i32, 14 }, { ISD::BITREVERSE, MVT::i16, 14 }, { ISD::BITREVERSE, MVT::i8, 11 }, + { ISD::CTPOP, MVT::i32, 8 }, + { ISD::CTPOP, MVT::i16, 9 }, + { ISD::CTPOP, MVT::i8, 7 }, { ISD::SADDO, MVT::i32, 1 }, { 
ISD::SADDO, MVT::i16, 1 }, { ISD::SADDO, MVT::i8, 1 }, @@ -2223,6 +2235,17 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) return LT.first * Entry->Cost; + if (ST->hasPOPCNT()) { + if (ST->is64Bit()) + if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + + if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + } + + // TODO - add LZCNT and BMI (TZCNT) scalar handling + if (ST->is64Bit()) if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) return LT.first * Entry->Cost; diff --git a/test/Analysis/CostModel/X86/ctpop.ll b/test/Analysis/CostModel/X86/ctpop.ll index bb59558e458..e9cf666af7b 100644 --- a/test/Analysis/CostModel/X86/ctpop.ll +++ b/test/Analysis/CostModel/X86/ctpop.ll @@ -16,7 +16,7 @@ declare i8 @llvm.ctpop.i8(i8) define i64 @var_ctpop_i64(i64 %a) { ; NOPOPCNT-LABEL: 'var_ctpop_i64' -; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctpop = call i64 @llvm.ctpop.i64(i64 %a) +; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %ctpop = call i64 @llvm.ctpop.i64(i64 %a) ; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %ctpop ; ; POPCNT-LABEL: 'var_ctpop_i64' @@ -29,7 +29,7 @@ define i64 @var_ctpop_i64(i64 %a) { define i32 @var_ctpop_i32(i32 %a) { ; NOPOPCNT-LABEL: 'var_ctpop_i32' -; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctpop = call i32 @llvm.ctpop.i32(i32 %a) +; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %ctpop = call i32 @llvm.ctpop.i32(i32 %a) ; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %ctpop ; ; POPCNT-LABEL: 'var_ctpop_i32' @@ -42,7 +42,7 @@ define i32 @var_ctpop_i32(i32 %a) { define i16 @var_ctpop_i16(i16 %a) { ; NOPOPCNT-LABEL: 'var_ctpop_i16' -; NOPOPCNT-NEXT: Cost 
Model: Found an estimated cost of 4 for instruction: %ctpop = call i16 @llvm.ctpop.i16(i16 %a) +; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %ctpop = call i16 @llvm.ctpop.i16(i16 %a) ; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %ctpop ; ; POPCNT-LABEL: 'var_ctpop_i16' @@ -55,7 +55,7 @@ define i16 @var_ctpop_i16(i16 %a) { define i8 @var_ctpop_i8(i8 %a) { ; NOPOPCNT-LABEL: 'var_ctpop_i8' -; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctpop = call i8 @llvm.ctpop.i8(i8 %a) +; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %ctpop = call i8 @llvm.ctpop.i8(i8 %a) ; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %ctpop ; ; POPCNT-LABEL: 'var_ctpop_i8' diff --git a/test/Transforms/SLPVectorizer/X86/ctpop.ll b/test/Transforms/SLPVectorizer/X86/ctpop.ll index 42f19c8ddc8..f3e4a31691b 100644 --- a/test/Transforms/SLPVectorizer/X86/ctpop.ll +++ b/test/Transforms/SLPVectorizer/X86/ctpop.ll @@ -21,14 +21,29 @@ declare i16 @llvm.ctpop.i16(i16) declare i8 @llvm.ctpop.i8(i8) define void @ctpop_2i64() #0 { -; CHECK-LABEL: @ctpop_2i64( -; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 -; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8 -; CHECK-NEXT: [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]]) -; CHECK-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]]) -; CHECK-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8 -; CHECK-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8 -; CHECK-NEXT: ret void +; SSE2-LABEL: @ctpop_2i64( +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8 +; SSE2-NEXT: [[TMP2:%.*]] = 
call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP1]]) +; SSE2-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8 +; SSE2-NEXT: ret void +; +; SSE42-LABEL: @ctpop_2i64( +; SSE42-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 +; SSE42-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8 +; SSE42-NEXT: [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]]) +; SSE42-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]]) +; SSE42-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8 +; SSE42-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8 +; SSE42-NEXT: ret void +; +; AVX-LABEL: @ctpop_2i64( +; AVX-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 +; AVX-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8 +; AVX-NEXT: [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]]) +; AVX-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]]) +; AVX-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8 +; AVX-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8 +; AVX-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8 %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8 @@ -40,20 +55,29 @@ define void @ctpop_2i64() #0 { } define void @ctpop_4i64() #0 { -; SSE-LABEL: @ctpop_4i64( -; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 -; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr 
inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 -; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 -; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 -; SSE-NEXT: [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]]) -; SSE-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]]) -; SSE-NEXT: [[CTPOP2:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD2]]) -; SSE-NEXT: [[CTPOP3:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD3]]) -; SSE-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4 -; SSE-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4 -; SSE-NEXT: store i64 [[CTPOP2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4 -; SSE-NEXT: store i64 [[CTPOP3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4 -; SSE-NEXT: ret void +; SSE2-LABEL: @ctpop_4i64( +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2) to <2 x i64>*), align 4 +; SSE2-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP1]]) +; SSE2-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP2]]) +; SSE2-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4 +; SSE2-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4 +; SSE2-NEXT: ret void +; +; SSE42-LABEL: @ctpop_4i64( +; SSE42-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 +; SSE42-NEXT: [[LD1:%.*]] = load i64, 
i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4 +; SSE42-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4 +; SSE42-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4 +; SSE42-NEXT: [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]]) +; SSE42-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]]) +; SSE42-NEXT: [[CTPOP2:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD2]]) +; SSE42-NEXT: [[CTPOP3:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD3]]) +; SSE42-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4 +; SSE42-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4 +; SSE42-NEXT: store i64 [[CTPOP2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4 +; SSE42-NEXT: store i64 [[CTPOP3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4 +; SSE42-NEXT: ret void ; ; AVX1-LABEL: @ctpop_4i64( ; AVX1-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4 -- 2.49.0