]> granicus.if.org Git - pdns/commitdiff
Recursor: Throttle all queries to a server after N consecutive timeouts (or unreachables)
authorPaulo Anes <pmca31@gmail.com>
Mon, 24 Feb 2014 18:22:21 +0000 (18:22 +0000)
committerPaulo Anes <paulo.anes@zonoptimus.pt>
Mon, 24 Feb 2014 18:22:21 +0000 (18:22 +0000)
This feature saves valuable resources by avoiding queries to servers from which we don't receive any answer. It's disabled by default.

pdns/pdns_recursor.cc
pdns/syncres.cc
pdns/syncres.hh

index a4726d718115109ca2c728c5a95de0763e30bfc8..bead667525f705682617897d42d4313b08259e1e 100644 (file)
@@ -1798,6 +1798,8 @@ int serviceMain(int argc, char*argv[])
   SyncRes::s_maxcachettl=::arg().asNum("max-cache-ttl");
   SyncRes::s_packetcachettl=::arg().asNum("packetcache-ttl");
   SyncRes::s_packetcacheservfailttl=::arg().asNum("packetcache-servfail-ttl");
+  SyncRes::s_serverdownmaxfails=::arg().asNum("server-down-max-fails");
+  SyncRes::s_serverdownthrottletime=::arg().asNum("server-down-throttle-time");
   SyncRes::s_serverID=::arg()["server-id"];
   if(SyncRes::s_serverID.empty()) {
     char tmp[128];
@@ -2080,6 +2082,8 @@ int main(int argc, char **argv)
     ::arg().set("client-tcp-timeout","Timeout in seconds when talking to TCP clients")="2";
     ::arg().set("max-mthreads", "Maximum number of simultaneous Mtasker threads")="2048";
     ::arg().set("max-tcp-clients","Maximum number of simultaneous TCP clients")="128";
+    // Server-down detection: "server-down-max-fails" defaults to 0 (feature disabled);
+    // "server-down-throttle-time" is how long, in seconds, a server marked as down stays throttled.
+    ::arg().set("server-down-max-fails","Maximum number of consecutive timeouts (and unreachables) to mark a server as down ( 0 => disabled )")="0";
+    ::arg().set("server-down-throttle-time","Number of seconds to throttle all queries to a server after being marked as down")="60";
     ::arg().set("hint-file", "If set, load root hints from this file")="";
     ::arg().set("max-cache-entries", "If set, maximum number of entries in the main cache")="1000000";
     ::arg().set("max-negative-ttl", "maximum number of seconds to keep a negative cached entry in memory")="3600";
index 7a6b7b403131405df8b8e53349cca7c2bea63222..9e3ae6c748ae4401e537febeb8a9f6f6240b0ff1 100644 (file)
@@ -49,6 +49,8 @@ unsigned int SyncRes::s_maxnegttl;
 unsigned int SyncRes::s_maxcachettl;
 unsigned int SyncRes::s_packetcachettl;
 unsigned int SyncRes::s_packetcacheservfailttl;
+unsigned int SyncRes::s_serverdownmaxfails;
+unsigned int SyncRes::s_serverdownthrottletime;
 unsigned int SyncRes::s_queries;
 unsigned int SyncRes::s_outgoingtimeouts;
 unsigned int SyncRes::s_outqueries;
@@ -907,7 +909,12 @@ int SyncRes::doResolveAt(set<string, CIStringCompare> nameservers, string auth,
           LOG(prefix<<qname<<": Trying IP "<< remoteIP->toStringWithPort() <<", asking '"<<qname<<"|"<<qtype.getName()<<"'"<<endl);
           extern NetmaskGroup* g_dontQuery;
           
-          if(t_sstorage->throttle.shouldThrottle(d_now.tv_sec, make_tuple(*remoteIP, qname, qtype.getCode()))) {
+          if(t_sstorage->throttle.shouldThrottle(d_now.tv_sec, make_tuple(*remoteIP, "", 0))) {
+            LOG(prefix<<qname<<": server throttled "<<endl);
+            s_throttledqueries++; d_throttledqueries++;
+            continue;
+          }
+          else if(t_sstorage->throttle.shouldThrottle(d_now.tv_sec, make_tuple(*remoteIP, qname, qtype.getCode()))) {
             LOG(prefix<<qname<<": query throttled "<<endl);
             s_throttledqueries++; d_throttledqueries++;
             continue;
@@ -947,7 +954,10 @@ int SyncRes::doResolveAt(set<string, CIStringCompare> nameservers, string auth,
                   
                   t_sstorage->nsSpeeds[*tns].submit(*remoteIP, 1000000, &d_now); // 1 sec
                 }
-                if(resolveret==-1)
+                if (s_serverdownmaxfails > 0 && t_sstorage->fails.incr(*remoteIP) >= s_serverdownmaxfails) {
+                  LOG(prefix<<qname<<": Max fails reached resolving on "<< remoteIP->toString() <<". Going full throttle for 1 minute" <<endl);
+                  t_sstorage->throttle.throttle(d_now.tv_sec, make_tuple(*remoteIP, "", 0), s_serverdownthrottletime, 10000); // mark server as down
+                } else if(resolveret==-1)
                   t_sstorage->throttle.throttle(d_now.tv_sec, make_tuple(*remoteIP, qname, qtype.getCode()), 60, 100); // unreachable, 1 minute or 100 queries
                 else
                   t_sstorage->throttle.throttle(d_now.tv_sec, make_tuple(*remoteIP, qname, qtype.getCode()), 10, 5);  // timeout
@@ -961,6 +971,9 @@ int SyncRes::doResolveAt(set<string, CIStringCompare> nameservers, string auth,
               continue;
             }
             
+            if(s_serverdownmaxfails > 0)
+              t_sstorage->fails.clear(*remoteIP);
+
             break;  // this IP address worked!
           wasLame:; // well, it didn't
             LOG(prefix<<qname<<": status=NS "<<*tns<<" ("<< remoteIP->toString() <<") is lame for '"<<auth<<"', trying sibling IP or NS"<<endl);
index 0b6d7c1e11ee4d6c4aedcecfd14136ae81c45237..67beafcee360bff924ca4590a6160323ff2f00e7 100644 (file)
@@ -173,6 +173,59 @@ private:
   bool d_needinit;
 };
 
+template<class Thing> class Counters : public boost::noncopyable
+{
+public:
+  Counters()
+  {
+  }
+  unsigned long value(const Thing& t)
+  {
+    typename cont_t::iterator i=d_cont.find(t);
+
+    if(i==d_cont.end()) {
+      return 0;
+    }
+    return (unsigned long)i->second;
+  }
+  unsigned long incr(const Thing& t)
+  {
+    typename cont_t::iterator i=d_cont.find(t);
+
+    if(i==d_cont.end()) {
+      d_cont[t]=1;
+      return 1;
+    }
+    else {
+      if (i->second < std::numeric_limits<unsigned long>::max())
+        i->second++;
+      return (unsigned long)i->second;
+   }
+  }
+  unsigned long decr(const Thing& t)
+  {
+    typename cont_t::iterator i=d_cont.find(t);
+
+    if(i!=d_cont.end() && --i->second == 0) {
+      d_cont.erase(i);
+      return 0;
+    } else
+      return (unsigned long)i->second;
+  }
+  void clear(const Thing& t)
+  {
+    typename cont_t::iterator i=d_cont.find(t);
+
+    if(i!=d_cont.end()) {
+      d_cont.erase(i);
+    }
+  }
+
+private:
+  typedef map<Thing,unsigned long> cont_t;
+  cont_t d_cont;
+};
+
 
 class SyncRes : public boost::noncopyable
 {
@@ -348,12 +401,16 @@ public:
   
 
   typedef Throttle<tuple<ComboAddress,string,uint16_t> > throttle_t;
+
+  typedef Counters<ComboAddress> fails_t;
   
   struct timeval d_now;
   static unsigned int s_maxnegttl;
   static unsigned int s_maxcachettl;
   static unsigned int s_packetcachettl;
   static unsigned int s_packetcacheservfailttl;
+  static unsigned int s_serverdownmaxfails;
+  static unsigned int s_serverdownthrottletime;
   static bool s_nopacketcache;
   static string s_serverID;
   
@@ -363,6 +420,7 @@ public:
     nsspeeds_t nsSpeeds;
     ednsstatus_t ednsstatus;
     throttle_t throttle;
+    fails_t fails;
     domainmap_t* domainmap;
   };