From: Marlin Cremers Date: Fri, 31 May 2019 14:21:18 +0000 (+0200) Subject: dnsdist: Add Prometheus latency histogram support X-Git-Tag: dnsdist-1.4.0-beta1~12^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=eb0335ff288778903ed5383fd8403234077d03d8;p=pdns dnsdist: Add Prometheus latency histogram support --- diff --git a/pdns/dnsdist-web.cc b/pdns/dnsdist-web.cc index 704c57517..b9faa5583 100644 --- a/pdns/dnsdist-web.cc +++ b/pdns/dnsdist-web.cc @@ -443,6 +443,22 @@ static void connectionThread(int sock, ComboAddress remote) output << "\n"; } + // Latency histogram buckets + output << "# HELP dnsdist_latency Histogram of responses by latency\n"; + output << "# TYPE dnsdist_latency histogram\n"; + uint64_t latency_amounts = g_stats.latency0_1; + output << "dnsdist_latency_bucket{le=\"1\"} " << latency_amounts << "\n"; + latency_amounts += g_stats.latency1_10; + output << "dnsdist_latency_bucket{le=\"10\"} " << latency_amounts << "\n"; + latency_amounts += g_stats.latency10_50; + output << "dnsdist_latency_bucket{le=\"50\"} " << latency_amounts << "\n"; + latency_amounts += g_stats.latency50_100; + output << "dnsdist_latency_bucket{le=\"100\"} " << latency_amounts << "\n"; + latency_amounts += g_stats.latency100_1000; + output << "dnsdist_latency_bucket{le=\"1000\"} " << latency_amounts << "\n"; + latency_amounts += g_stats.latencySlow; // Should be the same as latency_count + output << "dnsdist_latency_bucket{le=\"+Inf\"} " << latency_amounts << "\n"; + auto states = g_dstates.getLocal(); const string statesbase = "dnsdist_server_"; diff --git a/pdns/dnsdist.cc b/pdns/dnsdist.cc index 95b1d7f8f..d5c17029a 100644 --- a/pdns/dnsdist.cc +++ b/pdns/dnsdist.cc @@ -200,6 +200,7 @@ void doLatencyStats(double udiff) else if(udiff < 100000) ++g_stats.latency50_100; else if(udiff < 1000000) ++g_stats.latency100_1000; else ++g_stats.latencySlow; + g_stats.latencySum += udiff / 1000; auto doAvg = [](double& var, double n, double weight) { var = (weight -1) 
* var/weight + n/weight; @@ -2792,3 +2793,8 @@ catch(PDNSException &ae) errlog("Fatal pdns error: %s", ae.reason); _exit(EXIT_FAILURE); } + +uint64_t getLatencyCount(const std::string&) +{ + return g_stats.responses + g_stats.selfAnswered + g_stats.cacheHits; +} diff --git a/pdns/dnsdist.hh b/pdns/dnsdist.hh index a74d16467..a96a87b43 100644 --- a/pdns/dnsdist.hh +++ b/pdns/dnsdist.hh @@ -225,6 +225,8 @@ extern GlobalStateHolder> g_dynblockNMG; extern vector > g_confDelta; +extern uint64_t getLatencyCount(const std::string&); + struct DNSDistStats { using stat_t=std::atomic; // aww yiss ;-) @@ -251,7 +253,7 @@ struct DNSDistStats stat_t noPolicy{0}; stat_t cacheHits{0}; stat_t cacheMisses{0}; - stat_t latency0_1{0}, latency1_10{0}, latency10_50{0}, latency50_100{0}, latency100_1000{0}, latencySlow{0}; + stat_t latency0_1{0}, latency1_10{0}, latency10_50{0}, latency50_100{0}, latency100_1000{0}, latencySlow{0}, latencySum{0}; stat_t securityStatus{0}; double latencyAvg100{0}, latencyAvg1000{0}, latencyAvg10000{0}, latencyAvg1000000{0}; @@ -298,7 +300,10 @@ struct DNSDistStats {"fd-usage", getOpenFileDescriptors}, {"dyn-blocked", &dynBlocked}, {"dyn-block-nmg-size", [](const std::string&) { return g_dynblockNMG.getLocal()->size(); }}, - {"security-status", &securityStatus} + {"security-status", &securityStatus}, + // Latency histogram + {"latency-sum", &latencySum}, + {"latency-count", getLatencyCount}, }; }; @@ -390,6 +395,9 @@ struct MetricDefinitionStorage { { "dyn-blocked", MetricDefinition(PrometheusMetricType::counter, "Number of queries dropped because of a dynamic block")}, { "dyn-block-nmg-size", MetricDefinition(PrometheusMetricType::gauge, "Number of dynamic blocks entries") }, { "security-status", MetricDefinition(PrometheusMetricType::gauge, "Security status of this software. 
0=unknown, 1=OK, 2=upgrade recommended, 3=upgrade mandatory") }, + // Latency histogram + { "latency-sum", MetricDefinition(PrometheusMetricType::counter, "Total response time in milliseconds")}, + { "latency-count", MetricDefinition(PrometheusMetricType::counter, "Number of queries contributing to response time histogram")}, }; }; diff --git a/pdns/dnsdistdist/docs/statistics.rst b/pdns/dnsdistdist/docs/statistics.rst index 10f7de4ff..ee806bb42 100644 --- a/pdns/dnsdistdist/docs/statistics.rst +++ b/pdns/dnsdistdist/docs/statistics.rst @@ -99,6 +99,19 @@ latency-slow ------------ Number of queries answered in more than 1 second. +latency-sum +----------- +Total response time of all queries combined in milliseconds since the start of dnsdist. Can be used to calculate the +average response time over all queries. + +latency-count +------------- +Number of queries contributing to the response time histogram. + +latency-bucket +-------------- +Number of queries contributing to the response time histogram per latency bucket. + latency0-1 ---------- Number of queries answered in less than 1 ms. 
diff --git a/regression-tests.dnsdist/test_API.py b/regression-tests.dnsdist/test_API.py index 8ec87804a..ecad521c9 100644 --- a/regression-tests.dnsdist/test_API.py +++ b/regression-tests.dnsdist/test_API.py @@ -230,8 +230,8 @@ class TestAPIBasics(DNSDistTest): 'rule-drop', 'rule-nxdomain', 'rule-refused', 'self-answered', 'downstream-timeouts', 'downstream-send-errors', 'trunc-failures', 'no-policy', 'latency0-1', 'latency1-10', 'latency10-50', 'latency50-100', 'latency100-1000', - 'latency-slow', 'latency-avg100', 'latency-avg1000', 'latency-avg10000', - 'latency-avg1000000', 'uptime', 'real-memory-usage', 'noncompliant-queries', + 'latency-slow', 'latency-sum', 'latency-count', 'latency-avg100', 'latency-avg1000', + 'latency-avg10000', 'latency-avg1000000', 'uptime', 'real-memory-usage', 'noncompliant-queries', 'noncompliant-responses', 'rdqueries', 'empty-queries', 'cache-hits', 'cache-misses', 'cpu-user-msec', 'cpu-sys-msec', 'fd-usage', 'dyn-blocked', 'dyn-block-nmg-size', 'rule-servfail', 'security-status']