From: Paulo Anes
Date: Mon, 24 Feb 2014 18:22:21 +0000 (+0000)
Subject: Recursor: Throttle all queries to a server after N consecutive timeouts (or unreachables).
X-Git-Tag: rec-3.6.0-rc1~161^2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=628e2c7b985bab0caaa6244877bada3f7b06f71a;p=pdns

Recursor: Throttle all queries to a server after N consecutive timeouts (or unreachables).

This feature saves valuable resources by avoiding queries to servers from which we don't receive any answer. It's disabled by default.
---

diff --git a/pdns/pdns_recursor.cc b/pdns/pdns_recursor.cc
index a4726d718..bead66752 100644
--- a/pdns/pdns_recursor.cc
+++ b/pdns/pdns_recursor.cc
@@ -1798,6 +1798,8 @@ int serviceMain(int argc, char*argv[])
   SyncRes::s_maxcachettl=::arg().asNum("max-cache-ttl");
   SyncRes::s_packetcachettl=::arg().asNum("packetcache-ttl");
   SyncRes::s_packetcacheservfailttl=::arg().asNum("packetcache-servfail-ttl");
+  SyncRes::s_serverdownmaxfails=::arg().asNum("server-down-max-fails");
+  SyncRes::s_serverdownthrottletime=::arg().asNum("server-down-throttle-time");
   SyncRes::s_serverID=::arg()["server-id"];
   if(SyncRes::s_serverID.empty()) {
     char tmp[128];
@@ -2080,6 +2082,8 @@ int main(int argc, char **argv)
     ::arg().set("client-tcp-timeout","Timeout in seconds when talking to TCP clients")="2";
     ::arg().set("max-mthreads", "Maximum number of simultaneous Mtasker threads")="2048";
     ::arg().set("max-tcp-clients","Maximum number of simultaneous TCP clients")="128";
+    ::arg().set("server-down-max-fails","Maximum number of consecutive timeouts (and unreachables) to mark a server as down ( 0 => disabled )")="0";
+    ::arg().set("server-down-throttle-time","Number of seconds to throttle all queries to a server after being maked as down")="60";
     ::arg().set("hint-file", "If set, load root hints from this file")="";
     ::arg().set("max-cache-entries", "If set, maximum number of entries in the main cache")="1000000";
     ::arg().set("max-negative-ttl", "maximum number of seconds 
to keep a negative cached entry in memory")="3600";
diff --git a/pdns/syncres.cc b/pdns/syncres.cc
index 7a6b7b403..9e3ae6c74 100644
--- a/pdns/syncres.cc
+++ b/pdns/syncres.cc
@@ -49,6 +49,8 @@ unsigned int SyncRes::s_maxnegttl;
 unsigned int SyncRes::s_maxcachettl;
 unsigned int SyncRes::s_packetcachettl;
 unsigned int SyncRes::s_packetcacheservfailttl;
+unsigned int SyncRes::s_serverdownmaxfails;
+unsigned int SyncRes::s_serverdownthrottletime;
 unsigned int SyncRes::s_queries;
 unsigned int SyncRes::s_outgoingtimeouts;
 unsigned int SyncRes::s_outqueries;
@@ -907,7 +909,12 @@ int SyncRes::doResolveAt(set nameservers, string auth,
         LOG(prefix<toStringWithPort() <<", asking '"<throttle.shouldThrottle(d_now.tv_sec, make_tuple(*remoteIP, qname, qtype.getCode()))) {
+        if(t_sstorage->throttle.shouldThrottle(d_now.tv_sec, make_tuple(*remoteIP, "", 0))) {
+          LOG(prefix<throttle.shouldThrottle(d_now.tv_sec, make_tuple(*remoteIP, qname, qtype.getCode()))) {
           LOG(prefix< nameservers, string auth,
               t_sstorage->nsSpeeds[*tns].submit(*remoteIP, 1000000, &d_now); // 1 sec
             }
-            if(resolveret==-1)
+            if (s_serverdownmaxfails > 0 && t_sstorage->fails.incr(*remoteIP) >= s_serverdownmaxfails) {
+              LOG(prefix<toString() <<". Going full throttle for 1 minute" <throttle.throttle(d_now.tv_sec, make_tuple(*remoteIP, "", 0), s_serverdownthrottletime, 10000); // mark server as down
+            } else if(resolveret==-1)
               t_sstorage->throttle.throttle(d_now.tv_sec, make_tuple(*remoteIP, qname, qtype.getCode()), 60, 100); // unreachable, 1 minute or 100 queries
             else
               t_sstorage->throttle.throttle(d_now.tv_sec, make_tuple(*remoteIP, qname, qtype.getCode()), 10, 5); // timeout
@@ -961,6 +971,9 @@ int SyncRes::doResolveAt(set nameservers, string auth,
             continue;
           }
 
+          if(s_serverdownmaxfails > 0)
+            t_sstorage->fails.clear(*remoteIP);
+
           break; // this IP address worked!
         wasLame:; // well, it didn't
           LOG(prefix<toString() <<") is lame for '"< class Counters : public boost::noncopyable
+{
+public:
+  Counters()
+  {
+  }
+  unsigned long value(const Thing& t)
+  {
+    typename cont_t::iterator i=d_cont.find(t);
+
+    if(i==d_cont.end()) {
+      return 0;
+    }
+    return (unsigned long)i->second;
+  }
+  unsigned long incr(const Thing& t)
+  {
+    typename cont_t::iterator i=d_cont.find(t);
+
+    if(i==d_cont.end()) {
+      d_cont[t]=1;
+      return 1;
+    }
+    else {
+      if (i->second < std::numeric_limits::max())
+        i->second++;
+      return (unsigned long)i->second;
+    }
+  }
+  unsigned long decr(const Thing& t)
+  {
+    typename cont_t::iterator i=d_cont.find(t);
+
+    if(i!=d_cont.end() && --i->second == 0) {
+      d_cont.erase(i);
+      return 0;
+    } else
+      return (unsigned long)i->second;
+  }
+  void clear(const Thing& t)
+  {
+    typename cont_t::iterator i=d_cont.find(t);
+
+    if(i!=d_cont.end()) {
+      d_cont.erase(i);
+    }
+  }
+
+private:
+  typedef map cont_t;
+  cont_t d_cont;
+};
+
 class SyncRes : public boost::noncopyable
 {
@@ -348,12 +401,16 @@ public:
   typedef Throttle > throttle_t;
+
+  typedef Counters fails_t;
   struct timeval d_now;
   static unsigned int s_maxnegttl;
   static unsigned int s_maxcachettl;
   static unsigned int s_packetcachettl;
   static unsigned int s_packetcacheservfailttl;
+  static unsigned int s_serverdownmaxfails;
+  static unsigned int s_serverdownthrottletime;
   static bool s_nopacketcache;
   static string s_serverID;
@@ -363,6 +420,7 @@ public:
     nsspeeds_t nsSpeeds;
     ednsstatus_t ednsstatus;
     throttle_t throttle;
+    fails_t fails;
     domainmap_t* domainmap;
   };