From 91921e0980dbccecb26ffc66e795d2ddcba82dbd Mon Sep 17 00:00:00 2001 From: Clement Courbet Date: Wed, 30 Jan 2019 16:02:20 +0000 Subject: [PATCH] [llvm-exegesis] Add throughput mode. Summary: This just uses the latency benchmark runner on the parallel uops snippet generator. Fixes PR37698. Reviewers: gchatelet Subscribers: tschuett, RKSimon, llvm-commits Differential Revision: https://reviews.llvm.org/D57000 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@352632 91177308-0d34-0410-b5e6-96231b3b80d8 --- docs/CommandGuide/llvm-exegesis.rst | 18 ++++++++------- .../X86/inverse_throughput-by-opcode-name.s | 8 +++++++ tools/llvm-exegesis/lib/BenchmarkResult.cpp | 2 ++ tools/llvm-exegesis/lib/BenchmarkResult.h | 2 +- tools/llvm-exegesis/lib/BenchmarkRunner.h | 2 +- tools/llvm-exegesis/lib/Latency.cpp | 21 +++++++++++++++-- tools/llvm-exegesis/lib/Latency.h | 4 ++-- tools/llvm-exegesis/lib/Target.cpp | 10 ++++---- tools/llvm-exegesis/lib/Target.h | 2 +- tools/llvm-exegesis/llvm-exegesis.cpp | 23 +++++++++++-------- 10 files changed, 63 insertions(+), 29 deletions(-) create mode 100644 test/tools/llvm-exegesis/X86/inverse_throughput-by-opcode-name.s diff --git a/docs/CommandGuide/llvm-exegesis.rst b/docs/CommandGuide/llvm-exegesis.rst index f27db9e57ed..bbd90563005 100644 --- a/docs/CommandGuide/llvm-exegesis.rst +++ b/docs/CommandGuide/llvm-exegesis.rst @@ -10,13 +10,13 @@ DESCRIPTION ----------- :program:`llvm-exegesis` is a benchmarking tool that uses information available -in LLVM to measure host machine instruction characteristics like latency or port -decomposition. +in LLVM to measure host machine instruction characteristics like latency, +throughput, or port decomposition. Given an LLVM opcode name and a benchmarking mode, :program:`llvm-exegesis` generates a code snippet that makes execution as serial (resp. as parallel) as -possible so that we can measure the latency (resp. uop decomposition) of the -instruction. +possible so that we can measure the latency (resp. inverse throughput/uop decomposition) +of the instruction. The code snippet is jitted and executed on the host subtarget. The time taken (resp. resource usage) is measured using hardware performance counters. The result is printed out as YAML to the standard output. @@ -37,11 +37,13 @@ instruction, run: $ llvm-exegesis -mode=latency -opcode-name=ADD64rr -Measuring the uop decomposition of an instruction works similarly: +Measuring the uop decomposition or inverse throughput of an instruction works similarly: .. code-block:: bash $ llvm-exegesis -mode=uops -opcode-name=ADD64rr + $ llvm-exegesis -mode=inverse_throughput -opcode-name=ADD64rr + The output is a YAML document (the default is to write to stdout, but you can redirect the output to a file using `-benchmarks-file`): @@ -186,7 +188,7 @@ OPTIONS Specify the custom code snippet to measure. See example 2 for details. Either `opcode-index`, `opcode-name` or `snippets-file` must be set. -.. option:: -mode=[latency|uops|analysis] +.. option:: -mode=[latency|uops|inverse_throughput|analysis] Specify the run mode. @@ -197,8 +199,8 @@ OPTIONS .. option:: -benchmarks-file= - File to read (`analysis` mode) or write (`latency`/`uops` modes) benchmark - results. "-" uses stdin/stdout. + File to read (`analysis` mode) or write (`latency`/`uops`/`inverse_throughput` + modes) benchmark results. "-" uses stdin/stdout. .. option:: -analysis-clusters-output-file= diff --git a/test/tools/llvm-exegesis/X86/inverse_throughput-by-opcode-name.s b/test/tools/llvm-exegesis/X86/inverse_throughput-by-opcode-name.s new file mode 100644 index 00000000000..49cb8479203 --- /dev/null +++ b/test/tools/llvm-exegesis/X86/inverse_throughput-by-opcode-name.s @@ -0,0 +1,8 @@ +# RUN: llvm-exegesis -mode=inverse_throughput -opcode-name=ADD32rr | FileCheck %s + +CHECK: --- +CHECK-NEXT: mode: inverse_throughput +CHECK-NEXT: key: +CHECK-NEXT: instructions: +CHECK-NEXT: ADD32rr +CHECK: key: inverse_throughput diff --git a/tools/llvm-exegesis/lib/BenchmarkResult.cpp b/tools/llvm-exegesis/lib/BenchmarkResult.cpp index 01748d3c45b..e18fdf332ae 100644 --- a/tools/llvm-exegesis/lib/BenchmarkResult.cpp +++ b/tools/llvm-exegesis/lib/BenchmarkResult.cpp @@ -209,6 +209,8 @@ struct ScalarEnumerationTraits { Io.enumCase(Value, "", exegesis::InstructionBenchmark::Unknown); Io.enumCase(Value, "latency", exegesis::InstructionBenchmark::Latency); Io.enumCase(Value, "uops", exegesis::InstructionBenchmark::Uops); + Io.enumCase(Value, "inverse_throughput", + exegesis::InstructionBenchmark::InverseThroughput); } }; diff --git a/tools/llvm-exegesis/lib/BenchmarkResult.h b/tools/llvm-exegesis/lib/BenchmarkResult.h index c0050054273..0ef4fb3caa9 100644 --- a/tools/llvm-exegesis/lib/BenchmarkResult.h +++ b/tools/llvm-exegesis/lib/BenchmarkResult.h @@ -57,7 +57,7 @@ struct BenchmarkMeasure { // The result of an instruction benchmark. struct InstructionBenchmark { InstructionBenchmarkKey Key; - enum ModeE { Unknown, Latency, Uops }; + enum ModeE { Unknown, Latency, Uops, InverseThroughput }; ModeE Mode; std::string CpuName; std::string LLVMTriple; diff --git a/tools/llvm-exegesis/lib/BenchmarkRunner.h b/tools/llvm-exegesis/lib/BenchmarkRunner.h index b2637788278..4387bc8456e 100644 --- a/tools/llvm-exegesis/lib/BenchmarkRunner.h +++ b/tools/llvm-exegesis/lib/BenchmarkRunner.h @@ -75,6 +75,7 @@ public: protected: const LLVMState &State; + const InstructionBenchmark::ModeE Mode; private: virtual llvm::Expected> @@ -84,7 +85,6 @@ private: writeObjectFile(const BenchmarkCode &Configuration, llvm::ArrayRef Code) const; - const InstructionBenchmark::ModeE Mode; const std::unique_ptr Scratch; }; diff --git a/tools/llvm-exegesis/lib/Latency.cpp b/tools/llvm-exegesis/lib/Latency.cpp index 4be615323d8..2f3fbaef4f4 100644 --- a/tools/llvm-exegesis/lib/Latency.cpp +++ b/tools/llvm-exegesis/lib/Latency.cpp @@ -165,6 +165,14 @@ LatencySnippetGenerator::generateCodeTemplates(const Instruction &Instr) const { return std::move(Results); } +LatencyBenchmarkRunner::LatencyBenchmarkRunner(const LLVMState &State, + InstructionBenchmark::ModeE Mode) + : BenchmarkRunner(State, Mode) { + assert((Mode == InstructionBenchmark::Latency || + Mode == InstructionBenchmark::InverseThroughput) && + "invalid mode"); +} + LatencyBenchmarkRunner::~LatencyBenchmarkRunner() = default; llvm::Expected> @@ -184,8 +192,17 @@ LatencyBenchmarkRunner::runMeasurements( if (*ExpectedCounterValue < MinValue) MinValue = *ExpectedCounterValue; } - std::vector Result = { - BenchmarkMeasure::Create("latency", MinValue)}; + std::vector Result; + switch (Mode) { + case InstructionBenchmark::Latency: + Result = {BenchmarkMeasure::Create("latency", MinValue)}; + break; + case InstructionBenchmark::InverseThroughput: + Result = {BenchmarkMeasure::Create("inverse_throughput", MinValue)}; + break; + default: + break; + } return std::move(Result); } diff --git a/tools/llvm-exegesis/lib/Latency.h b/tools/llvm-exegesis/lib/Latency.h index 5ab16df11a2..7d6d96a195e 100644 --- a/tools/llvm-exegesis/lib/Latency.h +++ b/tools/llvm-exegesis/lib/Latency.h @@ -32,8 +32,8 @@ public: class LatencyBenchmarkRunner : public BenchmarkRunner { public: - LatencyBenchmarkRunner(const LLVMState &State) - : BenchmarkRunner(State, InstructionBenchmark::Latency) {} + LatencyBenchmarkRunner(const LLVMState &State, + InstructionBenchmark::ModeE Mode); ~LatencyBenchmarkRunner() override; private: diff --git a/tools/llvm-exegesis/lib/Target.cpp b/tools/llvm-exegesis/lib/Target.cpp index 1fb0237d461..c662f1f5566 100644 --- a/tools/llvm-exegesis/lib/Target.cpp +++ b/tools/llvm-exegesis/lib/Target.cpp @@ -45,6 +45,7 @@ ExegesisTarget::createSnippetGenerator(InstructionBenchmark::ModeE Mode, case InstructionBenchmark::Latency: return createLatencySnippetGenerator(State); case InstructionBenchmark::Uops: + case InstructionBenchmark::InverseThroughput: return createUopsSnippetGenerator(State); } return nullptr; @@ -57,7 +58,8 @@ ExegesisTarget::createBenchmarkRunner(InstructionBenchmark::ModeE Mode, case InstructionBenchmark::Unknown: return nullptr; case InstructionBenchmark::Latency: - return createLatencyBenchmarkRunner(State); + case InstructionBenchmark::InverseThroughput: + return createLatencyBenchmarkRunner(State, Mode); case InstructionBenchmark::Uops: return createUopsBenchmarkRunner(State); } @@ -74,9 +76,9 @@ ExegesisTarget::createUopsSnippetGenerator(const LLVMState &State) const { return llvm::make_unique(State); } -std::unique_ptr -ExegesisTarget::createLatencyBenchmarkRunner(const LLVMState &State) const { - return llvm::make_unique(State); +std::unique_ptr ExegesisTarget::createLatencyBenchmarkRunner( + const LLVMState &State, InstructionBenchmark::ModeE Mode) const { + return llvm::make_unique(State, Mode); } std::unique_ptr diff --git a/tools/llvm-exegesis/lib/Target.h b/tools/llvm-exegesis/lib/Target.h index da3441e67b2..f3429b79a34 100644 --- a/tools/llvm-exegesis/lib/Target.h +++ b/tools/llvm-exegesis/lib/Target.h @@ -130,7 +130,7 @@ private: std::unique_ptr virtual createUopsSnippetGenerator( const LLVMState &State) const; std::unique_ptr virtual createLatencyBenchmarkRunner( - const LLVMState &State) const; + const LLVMState &State, InstructionBenchmark::ModeE Mode) const; std::unique_ptr virtual createUopsBenchmarkRunner( const LLVMState &State) const; diff --git a/tools/llvm-exegesis/llvm-exegesis.cpp b/tools/llvm-exegesis/llvm-exegesis.cpp index 145fea55beb..ce11fadbf6e 100644 --- a/tools/llvm-exegesis/llvm-exegesis.cpp +++ b/tools/llvm-exegesis/llvm-exegesis.cpp @@ -56,16 +56,19 @@ static cl::opt SnippetsFile("snippets-file", static cl::opt BenchmarkFile("benchmarks-file", cl::desc(""), cl::init("")); -static cl::opt - BenchmarkMode("mode", cl::desc("the mode to run"), - cl::values(clEnumValN(exegesis::InstructionBenchmark::Latency, - "latency", "Instruction Latency"), - clEnumValN(exegesis::InstructionBenchmark::Uops, - "uops", "Uop Decomposition"), - // When not asking for a specific benchmark mode, - // we'll analyse the results. - clEnumValN(exegesis::InstructionBenchmark::Unknown, - "analysis", "Analysis"))); +static cl::opt BenchmarkMode( + "mode", cl::desc("the mode to run"), + cl::values(clEnumValN(exegesis::InstructionBenchmark::Latency, "latency", + "Instruction Latency"), + clEnumValN(exegesis::InstructionBenchmark::InverseThroughput, + "inverse_throughput", + "Instruction Inverse Throughput"), + clEnumValN(exegesis::InstructionBenchmark::Uops, "uops", + "Uop Decomposition"), + // When not asking for a specific benchmark mode, + // we'll analyse the results. + clEnumValN(exegesis::InstructionBenchmark::Unknown, "analysis", + "Analysis"))); static cl::opt NumRepetitions("num-repetitions", -- 2.40.0