]> granicus.if.org Git - pdns/commitdiff
dnsdist: Add Prometheus latency histogram support
authorMarlin Cremers <git@marlinc.nl>
Fri, 31 May 2019 14:21:18 +0000 (16:21 +0200)
committerMarlin Cremers <git@marlinc.nl>
Fri, 31 May 2019 14:21:18 +0000 (16:21 +0200)
pdns/dnsdist-web.cc
pdns/dnsdist.cc
pdns/dnsdist.hh
pdns/dnsdistdist/docs/statistics.rst
regression-tests.dnsdist/test_API.py

index 704c575172a0533690e185b4ea777bc56f7c635d..b9faa558374723b6f7ea69fe81643f8d628d3211 100644 (file)
@@ -443,6 +443,22 @@ static void connectionThread(int sock, ComboAddress remote)
           output << "\n";
         }
 
+        // Latency histogram buckets
+        output << "# HELP dnsdist_latency Histogram of responses by latency\n";
+        output << "# TYPE dnsdist_latency histogram\n";
+        uint64_t latency_amounts = g_stats.latency0_1;
+        output << "dnsdist_latency_bucket{le=\"1\"} " << latency_amounts << "\n";
+        latency_amounts += g_stats.latency1_10;
+        output << "dnsdist_latency_bucket{le=\"10\"} " << latency_amounts << "\n";
+        latency_amounts += g_stats.latency10_50;
+        output << "dnsdist_latency_bucket{le=\"50\"} " << latency_amounts << "\n";
+        latency_amounts += g_stats.latency50_100;
+        output << "dnsdist_latency_bucket{le=\"100\"} " << latency_amounts << "\n";
+        latency_amounts += g_stats.latency100_1000;
+        output << "dnsdist_latency_bucket{le=\"1000\"} " << latency_amounts << "\n";
+        latency_amounts += g_stats.latencySlow; // Should be the same as latency_count
+        output << "dnsdist_latency_bucket{le=\"+Inf\"} " << latency_amounts << "\n";
+
         auto states = g_dstates.getLocal();
         const string statesbase = "dnsdist_server_";
 
index 95b1d7f8f2689efce3add9eacb41d6b370225935..d5c17029a986ca9f5ebabb205317182dcd1f3a8a 100644 (file)
@@ -200,6 +200,7 @@ void doLatencyStats(double udiff)
   else if(udiff < 100000) ++g_stats.latency50_100;
   else if(udiff < 1000000) ++g_stats.latency100_1000;
   else ++g_stats.latencySlow;
+  g_stats.latencySum += udiff / 1000;
 
   auto doAvg = [](double& var, double n, double weight) {
     var = (weight -1) * var/weight + n/weight;
@@ -2792,3 +2793,8 @@ catch(PDNSException &ae)
   errlog("Fatal pdns error: %s", ae.reason);
   _exit(EXIT_FAILURE);
 }
+
+uint64_t getLatencyCount(const std::string&)
+{
+    return g_stats.responses + g_stats.selfAnswered + g_stats.cacheHits;
+}
index a74d1646736541acab3284156f461c6c7e5fe4d4..a96a87b439d17ff920bc7e65eb0ae8d642b9eb99 100644 (file)
@@ -225,6 +225,8 @@ extern GlobalStateHolder<NetmaskTree<DynBlock>> g_dynblockNMG;
 
 extern vector<pair<struct timeval, std::string> > g_confDelta;
 
+extern uint64_t getLatencyCount(const std::string&);
+
 struct DNSDistStats
 {
   using stat_t=std::atomic<uint64_t>; // aww yiss ;-)
@@ -251,7 +253,7 @@ struct DNSDistStats
   stat_t noPolicy{0};
   stat_t cacheHits{0};
   stat_t cacheMisses{0};
-  stat_t latency0_1{0}, latency1_10{0}, latency10_50{0}, latency50_100{0}, latency100_1000{0}, latencySlow{0};
+  stat_t latency0_1{0}, latency1_10{0}, latency10_50{0}, latency50_100{0}, latency100_1000{0}, latencySlow{0}, latencySum{0};
   stat_t securityStatus{0};
 
   double latencyAvg100{0}, latencyAvg1000{0}, latencyAvg10000{0}, latencyAvg1000000{0};
@@ -298,7 +300,10 @@ struct DNSDistStats
     {"fd-usage", getOpenFileDescriptors},
     {"dyn-blocked", &dynBlocked},
     {"dyn-block-nmg-size", [](const std::string&) { return g_dynblockNMG.getLocal()->size(); }},
-    {"security-status", &securityStatus}
+    {"security-status", &securityStatus},
+    // Latency histogram
+    {"latency-sum", &latencySum},
+    {"latency-count", getLatencyCount},
   };
 };
 
@@ -390,6 +395,9 @@ struct MetricDefinitionStorage {
     { "dyn-blocked",            MetricDefinition(PrometheusMetricType::counter, "Number of queries dropped because of a dynamic block")},
     { "dyn-block-nmg-size",     MetricDefinition(PrometheusMetricType::gauge,   "Number of dynamic blocks entries") },
     { "security-status",        MetricDefinition(PrometheusMetricType::gauge,   "Security status of this software. 0=unknown, 1=OK, 2=upgrade recommended, 3=upgrade mandatory") },
+    // Latency histogram
+    { "latency-sum",            MetricDefinition(PrometheusMetricType::counter, "Total response time in milliseconds")},
+    { "latency-count",          MetricDefinition(PrometheusMetricType::counter, "Number of queries contributing to response time histogram")},
   };
 };
 
index 10f7de4ff1ba49c53c23e7887f9bf64228a9cdc5..ee806bb425c2390d558452ecf04011d2dcfbe9a4 100644 (file)
@@ -99,6 +99,19 @@ latency-slow
 ------------
 Number of queries answered in more than 1 second.
 
+latency-sum
+-----------
+Total response time of all queries combined in milliseconds since the start of dnsdist. Can be used to calculate the
+average response time over all queries.
+
+latency-count
+-------------
+Number of queries contributing to response time histogram
+
+latency-bucket
+--------------
+Number of queries contributing to response time histogram per latency bucket
+
 latency0-1
 ----------
 Number of queries answered in less than 1 ms.
index 8ec87804a7f352e00a4a611830e668d210db6149..ecad521c9d7bbd05bde2e3099e814471d84658f2 100644 (file)
@@ -230,8 +230,8 @@ class TestAPIBasics(DNSDistTest):
                     'rule-drop', 'rule-nxdomain', 'rule-refused', 'self-answered', 'downstream-timeouts',
                     'downstream-send-errors', 'trunc-failures', 'no-policy', 'latency0-1',
                     'latency1-10', 'latency10-50', 'latency50-100', 'latency100-1000',
-                    'latency-slow', 'latency-avg100', 'latency-avg1000', 'latency-avg10000',
-                    'latency-avg1000000', 'uptime', 'real-memory-usage', 'noncompliant-queries',
+                    'latency-slow', 'latency-sum', 'latency-count', 'latency-avg100', 'latency-avg1000',
+                    'latency-avg10000', 'latency-avg1000000', 'uptime', 'real-memory-usage', 'noncompliant-queries',
                     'noncompliant-responses', 'rdqueries', 'empty-queries', 'cache-hits',
                     'cache-misses', 'cpu-user-msec', 'cpu-sys-msec', 'fd-usage', 'dyn-blocked',
                     'dyn-block-nmg-size', 'rule-servfail', 'security-status']