]> granicus.if.org Git - pdns/commitdiff
Introduced metric description storage and improved Prometheus support
author Pavel Odintsov <pavel@cloudflare.com>
Thu, 30 Aug 2018 18:00:37 +0000 (19:00 +0100)
committer Pavel Odintsov <pavel@cloudflare.com>
Thu, 30 Aug 2018 18:45:10 +0000 (19:45 +0100)
pdns/dnsdist-web.cc
pdns/dnsdist.cc
pdns/dnsdist.hh
pdns/dnsdistdist/docs/guides/webserver.rst

index f1d11693f7c90c84aa30bd86bb606d01593bfea7..d9ed73341369f097376e41c75457d45f99bb7fa6 100644 (file)
@@ -128,7 +128,7 @@ static bool isAnAPIRequestAllowedWithWebAuth(const YaHTTP::Request& req)
 
 static bool isAStatsRequest(const YaHTTP::Request& req)
 {
-  return req.url.path == "/jsonstat" || req.url.path == "/prometheus";
+  return req.url.path == "/jsonstat" || req.url.path == "/metrics";
 }
 
 static bool compareAuthorization(const YaHTTP::Request& req, const string &expected_password, const string& expectedApiKey)
@@ -386,70 +386,98 @@ static void connectionThread(int sock, ComboAddress remote, string password, str
         resp.status=404;
       }
     }
-    else if(req.url.path=="/prometheus") {
+    else if (req.url.path == "/metrics") {
         handleCORS(req, resp);
-        resp.status=200;
+        resp.status = 200;
 
-        ostringstream str;
-        for(const auto& e : g_stats.entries) {
-          string metricName = "dnsdist_main_" + std::get<0>(e);
-          boost::replace_all(metricName, "-", "_");
+        std::ostringstream output;
+        for (const auto& e : g_stats.entries) {
+          std::string metricName = std::get<0>(e);
+
+          // Prometheus metric names may not contain '-', so replace it with '_'
+          std::string prometheusMetricName = "dnsdist_main_" + boost::replace_all_copy(metricName, "-", "_");
+
+          MetricDefinition metricDetails; 
+
+          if (!g_metricDefinitions.getMetricDetails(metricName, metricDetails)) {
+              warnlog("Do not have metric details for %s", metricName);
+              continue;
+          }
 
           // for these we have the help and types encoded in the sources:
-          str<<"# HELP "<<metricName<<' '<< std::get<3>(e)<<"\n";
-          str<<"# TYPE "<<metricName<<' '<< std::get<2>(e)<<"\n";
-          str<<metricName<<' ';
-          if(const auto& val = boost::get<DNSDistStats::stat_t*>(&std::get<1>(e)))
-            str<<(*val)->load();
+          output << "# HELP " << prometheusMetricName << " " << metricDetails.description    << "\n";
+          output << "# TYPE " << prometheusMetricName << " " << metricDetails.prometheusType << "\n";
+          output << prometheusMetricName << " ";
+
+          if (const auto& val = boost::get<DNSDistStats::stat_t*>(&std::get<1>(e)))
+            output << (*val)->load();
           else if (const auto& dval = boost::get<double*>(&std::get<1>(e)))
-            str<<**dval;
+            output << **dval;
           else
-            str<<(*boost::get<DNSDistStats::statfunction_t>(&std::get<1>(e)))(std::get<0>(e));
-          str<<"\n";
+            output << (*boost::get<DNSDistStats::statfunction_t>(&std::get<1>(e)))(std::get<0>(e));
+          
+          output << "\n";
         }
+
         const auto states = g_dstates.getCopy();
         const string statesbase = "dnsdist_main_servers_";
-        for(const auto& state : states) {
-          string serverName = state->name.empty() ? (state->remote.toString() + ":" + std::to_string(state->remote.getPort())) : state->getName();
+        
+        for (const auto& state : states) {
+          string serverName;
+           
+          if (state->name.empty())
+              serverName = state->remote.toString() + ":" + std::to_string(state->remote.getPort());
+          else
+              serverName = state->getName();
+
           boost::replace_all(serverName, ".", "_");
+
           const string label = "{server=\"" + serverName + "\"}";
-          str<<statesbase<<"queries"<<label<<' '<< state->queries.load() <<"\n";
-          str<<statesbase<<"drops"<<label<<' '<< state->reuseds.load() << "\n";
-          str<<statesbase<<"latency"<<label<<' '<< state->latencyUsec/1000.0 << "\n";
-          str<<statesbase<<"senderrors"<<label<<' '<< state->sendErrors.load() << "\n";
-          str<<statesbase<<"outstanding"<<label<<' '<< state->outstanding.load() << "\n";
+          output << statesbase << "queries"     << label << " " << state->queries.load()     << "\n";
+          output << statesbase << "drops"       << label << " " << state->reuseds.load()     << "\n";
+          output << statesbase << "latency"     << label << " " << state->latencyUsec/1000.0 << "\n";
+          output << statesbase << "senderrors"  << label << " " << state->sendErrors.load()  << "\n";
+          output << statesbase << "outstanding" << label << " " << state->outstanding.load() << "\n";
         }
-        for(const auto& front : g_frontends) {
+
+        for (const auto& front : g_frontends) {
           if (front->udpFD == -1 && front->tcpFD == -1)
             continue;
 
           string frontName = front->local.toString() + ":" + std::to_string(front->local.getPort());
           boost::replace_all(frontName, ".", "_");
           string proto = (front->udpFD >= 0 ? "udp" : "tcp");
-          str<<"dnsdist_main_frontend_queries{frontend=\""<<frontName<<"\",proto=\""<<proto<<"\"} "<< front->queries.load() << "\n";
+
+          output << "dnsdist_main_frontend_queries{frontend=\"" << frontName << "\",proto=\"" << proto
+              << "\"} " << front->queries.load() << "\n";
         }
+
         const auto localPools = g_pools.getCopy();
         const string cachebase = "dnsdist_pool_";
+        
         for (const auto& entry : localPools) {
           string poolName = entry.first;
           boost::replace_all(poolName, ".", "_");
+          
           if (poolName.empty()) {
             poolName = "_default_";
           }
           const string label = "{pool=\"" + poolName + "\"}";
           const std::shared_ptr<ServerPool> pool = entry.second;
-          str<<"dnsdist_main_pools_servers"<<label<< ' ' << pool->servers.size() <<"\n";
+          output << "dnsdist_main_pools_servers" << label << " " << pool->countServers(false) << "\n";
+
           if (pool->packetCache != nullptr) {
             const auto& cache = pool->packetCache;
-            str<<cachebase<<"cache_size"<<label << ' ' << cache->getMaxEntries() << "\n";
-            str<<cachebase<<"cache_entries"<<label << ' ' << cache->getEntriesCount() << "\n";
-            str<<cachebase<<"cache_hits"<<label << ' ' << cache->getHits() << "\n";
-            str<<cachebase<<"cache_misses"<<label << ' ' << cache->getMisses() << "\n";
-            str<<cachebase<<"cache_deferred_inserts"<<label << ' ' << cache->getDeferredInserts() << "\n";
-            str<<cachebase<<"cache_deferred_lookups"<<label << ' ' << cache->getDeferredLookups() << "\n";
-            str<<cachebase<<"cache_lookup_collisions"<<label << ' ' << cache->getLookupCollisions() << "\n";
-            str<<cachebase<<"cache_insert_collisions"<<label << ' ' << cache->getInsertCollisions() << "\n";
-            str<<cachebase<<"cache_ttl_too_shorts"<<label << ' ' << cache->getTTLTooShorts() << "\n";
+
+            output << cachebase << "cache_size"              <<label << " " << cache->getMaxEntries()       << "\n";
+            output << cachebase << "cache_entries"           <<label << " " << cache->getEntriesCount()     << "\n";
+            output << cachebase << "cache_hits"              <<label << " " << cache->getHits()             << "\n";
+            output << cachebase << "cache_misses"            <<label << " " << cache->getMisses()           << "\n";
+            output << cachebase << "cache_deferred_inserts"  <<label << " " << cache->getDeferredInserts()  << "\n";
+            output << cachebase << "cache_deferred_lookups"  <<label << " " << cache->getDeferredLookups()  << "\n";
+            output << cachebase << "cache_lookup_collisions" <<label << " " << cache->getLookupCollisions() << "\n";
+            output << cachebase << "cache_insert_collisions" <<label << " " << cache->getInsertCollisions() << "\n";
+            output << cachebase << "cache_ttl_too_shorts"    <<label << " " << cache->getTTLTooShorts()     << "\n";
           }
         }
 
@@ -457,15 +485,18 @@ static void connectionThread(int sock, ComboAddress remote, string password, str
           WriteLock wl(&g_qcount.queryLock);
           std::string qname;
           const string qnamebase = "dnsdist_querycount_queries";
+
           for(auto &record: g_qcount.records) {
             qname = record.first;
             boost::replace_all(qname, ".", "_");
-           const string label = "{qname=\"" + qname + "\"}";
-            str<<qnamebase<<label<<' '<<record.second<<"\n";
+           
+            const std::string label = "{qname=\"" + qname + "\"}";
+            output << qnamebase << label << " " << record.second << "\n";
           }
           g_qcount.records.clear();
         }
-        resp.body=str.str();
+
+        resp.body = output.str();
         resp.headers["Content-Type"] = "text/plain";
     }
 
index 2cf2c97fb62b2d142dc3dde8db4601de06ea571d..b0c17bde51fe4555e9d9a154910f4f8744ade2b5 100644 (file)
@@ -80,6 +80,8 @@ using std::thread;
 bool g_verbose;
 
 struct DNSDistStats g_stats;
+MetricDefinitionStorage g_metricDefinitions;
+
 uint16_t g_maxOutstanding{10240};
 bool g_verboseHealthChecks{false};
 uint32_t g_staleCacheEntriesTTL{0};
index 4cb88b4e655da04967b5c9bfdfcf5e090995d42f..fc7e606135d399fa0352bf3cf3b67cbc26cd5dcf 100644 (file)
@@ -262,7 +262,75 @@ struct DNSDistStats
   };
 };
 
+// Extra per-metric data (Prometheus type and help text) used when rendering the /metrics endpoint
+struct MetricDefinition {
+  // NB: the type comes first, then the description, matching the table in MetricDefinitionStorage
+  MetricDefinition(const std::string& prometheusType, const std::string& description):
+    description(description), prometheusType(prometheusType) {
+  }
+  MetricDefinition() = default;
+
+  // Human-readable text emitted on the "# HELP" line
+  std::string description;
+  // Prometheus metric type ("counter" or "gauge") emitted on the "# TYPE" line
+  std::string prometheusType;
+};
+
+struct MetricDefinitionStorage {
+  // Looks up metricName in the table below. On a hit, copies the entry into metric and
+  // returns true; on a miss, returns false and leaves metric untouched.
+  bool getMetricDetails(const std::string& metricName, MetricDefinition& metric) const {
+    const auto metricDetailsIter = metrics.find(metricName);
+    if (metricDetailsIter == metrics.end()) {
+      return false;
+    }
+    metric = metricDetailsIter->second;
+    return true;
+  }
+
+  // Keyed by stat name; the two string literals are the Prometheus type, then the help text.
+  std::map<std::string, MetricDefinition> metrics = {
+    { "responses",              MetricDefinition("counter", "Number of responses received from backends") },
+    { "servfail-responses",     MetricDefinition("counter", "Number of SERVFAIL answers received from backends") },
+    { "queries",                MetricDefinition("counter", "Number of received queries")},
+    { "acl-drops",              MetricDefinition("counter", "Number of packets dropped because of the ACL")},
+    { "rule-drop",              MetricDefinition("counter", "Number of queries dropped because of a rule")},
+    { "rule-nxdomain",          MetricDefinition("counter", "Number of NXDomain answers returned because of a rule")},
+    { "rule-refused",           MetricDefinition("counter", "Number of Refused answers returned because of a rule")},
+    { "rule-servfail",          MetricDefinition("counter", "Number of SERVFAIL answers returned because of a rule")},
+    { "self-answered",          MetricDefinition("counter", "Number of self-answered responses")},
+    { "downstream-timeouts",    MetricDefinition("counter", "Number of queries not answered in time by a backend")},
+    { "downstream-send-errors", MetricDefinition("counter", "Number of errors when sending a query to a backend")},
+    { "trunc-failures",         MetricDefinition("counter", "Number of errors encountered while truncating an answer")},
+    { "no-policy",              MetricDefinition("counter", "Number of queries dropped because no server was available")},
+    { "latency0-1",             MetricDefinition("counter", "Number of queries answered in less than 1ms")},
+    { "latency1-10",            MetricDefinition("counter", "Number of queries answered in 1-10 ms")},
+    { "latency10-50",           MetricDefinition("counter", "Number of queries answered in 10-50 ms")},
+    { "latency50-100",          MetricDefinition("counter", "Number of queries answered in 50-100 ms")},
+    { "latency100-1000",        MetricDefinition("counter", "Number of queries answered in 100-1000 ms")},
+    { "latency-slow",           MetricDefinition("counter", "Number of queries answered in more than 1 second")},
+    { "latency-avg100",         MetricDefinition("gauge",   "Average response latency in microseconds of the last 100 packets")},
+    { "latency-avg1000",        MetricDefinition("gauge",   "Average response latency in microseconds of the last 1000 packets")},
+    { "latency-avg10000",       MetricDefinition("gauge",   "Average response latency in microseconds of the last 10000 packets")},
+    { "latency-avg1000000",     MetricDefinition("gauge",   "Average response latency in microseconds of the last 1000000 packets")},
+    { "uptime",                 MetricDefinition("gauge",   "Uptime of the dnsdist process in seconds")},
+    { "real-memory-usage",      MetricDefinition("gauge",   "Current memory usage in bytes")},
+    { "noncompliant-queries",   MetricDefinition("counter", "Number of queries dropped as non-compliant")},
+    { "noncompliant-responses", MetricDefinition("counter", "Number of answers from a backend dropped as non-compliant")},
+    { "rdqueries",              MetricDefinition("counter", "Number of received queries with the recursion desired bit set")},
+    { "empty-queries",          MetricDefinition("counter", "Number of empty queries received from clients")},
+    { "cache-hits",             MetricDefinition("counter", "Number of times an answer was retrieved from cache")},
+    { "cache-misses",           MetricDefinition("counter", "Number of times an answer was not found in the cache")},
+    { "cpu-user-msec",          MetricDefinition("counter", "Milliseconds spent by dnsdist in the user state")},
+    { "cpu-sys-msec",           MetricDefinition("counter", "Milliseconds spent by dnsdist in the system state")},
+    { "fd-usage",               MetricDefinition("gauge",   "Number of currently used file descriptors")},
+    { "dyn-blocked",            MetricDefinition("counter", "Number of queries dropped because of a dynamic block")},
+    { "dyn-block-nmg-size",     MetricDefinition("gauge",   "Number of dynamic blocks entries") },
+  };
+};
 
+extern MetricDefinitionStorage g_metricDefinitions;
 extern struct DNSDistStats g_stats;
 void doLatencyStats(double udiff);
 
index a8e73a1d20143d3a1594d6caaa1c04e886ada567..989fb4422359d7533a3b1ba81f5ca325e484c646 100644 (file)
@@ -99,7 +99,7 @@ URL Endpoints
 
   :query command: one of ``stats``, ``dynblocklist`` or ``ebpfblocklist``
 
-.. http:get:: /prometheus
+.. http:get:: /metrics
 
   Get statistics from dnsdist in `Prometheus <https://prometheus.io>`_ format.
 
@@ -107,7 +107,7 @@ URL Endpoints
 
    .. sourcecode:: http
 
-      GET /prometheus
+      GET /metrics
 
   **Example response**:
    .. sourcecode:: http
@@ -271,7 +271,7 @@ URL Endpoints
       job_name: dnsdist
       scrape_interval: 10s
       scrape_timeout: 2s
-      metrics_path: /prometheus
+      metrics_path: /metrics
       basic_auth:
         username: dontcare
         password: yoursecret