From 9e87dcb83e33f0185192b5cff979e38baf65d11e Mon Sep 17 00:00:00 2001 From: Remi Gacogne Date: Thu, 18 Feb 2016 17:36:25 +0100 Subject: [PATCH] dnsdist: Add health check logging, `maxCheckFailures` to backend `maxCheckFailures` allows waiting for several health check failures before marking a downstream server down. Health check errors are logged only in verbose mode and if `setVerboseHealthChecks()` is set to true. --- pdns/README-dnsdist.md | 16 ++++++---- pdns/dnsdist-console.cc | 2 +- pdns/dnsdist-lua.cc | 4 +++ pdns/dnsdist-lua2.cc | 2 ++ pdns/dnsdist.cc | 66 ++++++++++++++++++++++++++++++++++++----- pdns/dnsdist.hh | 3 ++ 6 files changed, 78 insertions(+), 15 deletions(-) diff --git a/pdns/README-dnsdist.md b/pdns/README-dnsdist.md index ea72728bf..dca06adf9 100644 --- a/pdns/README-dnsdist.md +++ b/pdns/README-dnsdist.md @@ -127,11 +127,13 @@ To change the QPS for a server: By default, the availability of a downstream server is checked by regularly sending an A query for "a.root-servers.net.". A different query type and target -can be specified by passing, respectively, the 'checkType' and 'checkName' +can be specified by passing, respectively, the `checkType` and `checkName` parameters to `newServer`. The default behavior is to consider any valid response -with a RCODE different from ServFail as valid. If the 'mustResolve' parameter +with a RCODE different from ServFail as valid. If the `mustResolve` parameter of `newServer` is set to true, a response will only be considered valid if its RCODE differs from NXDomain, ServFail and Refused. +The number of health check failures before a server is considered down is +configurable via the `maxCheckFailures` parameter, defaulting to 1. 
``` newServer({address="192.0.2.1", checkType="AAAA", checkName="a.root-servers.net.", mustResolve=true}) @@ -141,7 +143,7 @@ In order to provide the downstream server with the address of the real client, or at least the one talking to `dnsdist`, the 'useClientSubnet' parameter can be used when declaring a new server. This parameter indicates whether an EDNS Client Subnet option should be added to the request. If the incoming request already contains an EDNS Client Subnet value, -it will not be overriden unless setECSOverride is set to true. The source prefix-length may be +it will not be overridden unless `setECSOverride()` is set to true. The source prefix-length may be configured with: ``` > setECSSourcePrefixV4(24) @@ -161,9 +163,10 @@ or if the answer can't be sent in less than 2s. This can be configured with: The same kind of timeouts is enforced on the TCP connections to the downstream servers. The default value of 30s can be modified by passing the `tcpRecvTimeout` and `tcpSendTimeout` -parameters to `newServer`: +parameters to `newServer`. If the TCP connection to a downstream server fails, `dnsdist` +will try to establish a new one up to `retries` times before giving up. ``` -newServer({address="192.0.2.1", tcpRecvTimeout=10, tcpSendTimeout=10}) +newServer({address="192.0.2.1", tcpRecvTimeout=10, tcpSendTimeout=10, retries=5}) ``` Source address @@ -827,9 +830,10 @@ Here are all functions: * `infolog(string)`: log at level info * `warnlog(string)`: log at level warning * `errlog(string)`: log at level error + * `setVerboseHealthChecks(bool)`: whether health check errors will be logged. Note that even if set to true, health check errors will be logged at verbose level only. 
* Server related: * `newServer("ip:port")`: instantiate a new downstream server with default settings - * `newServer({address="ip:port", qps=1000, order=1, weight=10, pool="abuse", retries=5, tcpSendTimeout=30, tcpRecvTimeout=30, checkName="a.root-servers.net.", checkType="A", mustResolve=false, useClientSubnet=true, source="address|interface name|address@interface"})`: + * `newServer({address="ip:port", qps=1000, order=1, weight=10, pool="abuse", retries=5, tcpSendTimeout=30, tcpRecvTimeout=30, checkName="a.root-servers.net.", checkType="A", maxCheckFailures=1, mustResolve=false, useClientSubnet=true, source="address|interface name|address@interface"})`: instantiate a server with additional parameters * `showServers()`: output all servers * `getServer(n)`: returns server with index n diff --git a/pdns/dnsdist-console.cc b/pdns/dnsdist-console.cc index 3838b7478..a293890c8 100644 --- a/pdns/dnsdist-console.cc +++ b/pdns/dnsdist-console.cc @@ -207,7 +207,7 @@ char* my_generator(const char* text, int state) "setACL(", "setDNSSECPool(", "setECSOverride(", "setECSSourcePrefixV4(", "setECSSourcePrefixV6(", "setKey(", "setLocal(", "setMaxTCPClientThreads(", "setMaxUDPOutstanding(", "setServerPolicy(", "setServerPolicyLua(", - "setTCPRecvTimeout(", "setTCPSendTimeout(", "show(", "showACL()", + "setTCPRecvTimeout(", "setTCPSendTimeout(", "setVerboseHealthChecks(", "show(", "showACL()", "showDNSCryptBinds()", "showDynBlocks()", "showResponseLatency()", "showRules()", "showServerPolicy()", "showServers()", "shutdown()", "SpoofAction(", "TCAction(", "testCrypto()", "topBandwidth(", "topClients(", diff --git a/pdns/dnsdist-lua.cc b/pdns/dnsdist-lua.cc index 32fc83282..86488d8c4 100644 --- a/pdns/dnsdist-lua.cc +++ b/pdns/dnsdist-lua.cc @@ -302,6 +302,10 @@ vector> setupLua(bool client, const std::string& confi ret->useECS=boost::get(vars["useClientSubnet"]); } + if(vars.count("maxCheckFailures")) { + ret->maxCheckFailures=std::stoi(boost::get(vars["maxCheckFailures"])); + } + 
if(g_launchWork) { g_launchWork->push_back([ret]() { ret->tid = move(thread(responderThread, ret)); diff --git a/pdns/dnsdist-lua2.cc b/pdns/dnsdist-lua2.cc index 68cab6987..a78a1e556 100644 --- a/pdns/dnsdist-lua2.cc +++ b/pdns/dnsdist-lua2.cc @@ -558,4 +558,6 @@ void moreLua(bool client) g_pools.setState(localPools); return pool; }); + + g_lua.writeFunction("setVerboseHealthChecks", [](bool verbose) { g_verboseHealthChecks=verbose; }); } diff --git a/pdns/dnsdist.cc b/pdns/dnsdist.cc index e5c2402a0..d8d0cff61 100644 --- a/pdns/dnsdist.cc +++ b/pdns/dnsdist.cc @@ -61,6 +61,7 @@ bool g_verbose; struct DNSDistStats g_stats; uint16_t g_maxOutstanding; bool g_console; +bool g_verboseHealthChecks{false}; GlobalStateHolder g_ACL; string g_outputBuffer; @@ -904,34 +905,75 @@ try } sock.connect(ds.remote); ssize_t sent = udpClientSendRequestToBackend(&ds, sock.getHandle(), (char*)&packet[0], packet.size()); - if (sent < 0) + if (sent < 0) { + int ret = errno; + if (g_verboseHealthChecks) + vinfolog("Error while sending a health check query to backend %s: %d", ds.getNameWithAddr(), ret); return false; + } int ret=waitForRWData(sock.getHandle(), true, 1, 0); - if(ret < 0 || !ret) // error, timeout, both are down! + if(ret < 0 || !ret) { // error, timeout, both are down! 
+ if (ret < 0) { + ret = errno; + if (g_verboseHealthChecks) + vinfolog("Error while waiting for the health check response from backend %s: %d", ds.getNameWithAddr(), ret); + } + else { + if (g_verboseHealthChecks) + vinfolog("Timeout while waiting for the health check response from backend %s", ds.getNameWithAddr()); + } return false; + } + string reply; sock.recvFrom(reply, ds.remote); const dnsheader * responseHeader = (const dnsheader *) reply.c_str(); - if (reply.size() < sizeof(*responseHeader)) + if (reply.size() < sizeof(*responseHeader)) { + if (g_verboseHealthChecks) + vinfolog("Invalid health check response of size %d from backend %s, expecting at least %d", reply.size(), ds.getNameWithAddr(), sizeof(*responseHeader)); return false; + } - if (responseHeader->id != requestHeader->id) + if (responseHeader->id != requestHeader->id) { + if (g_verboseHealthChecks) + vinfolog("Invalid health check response id %d from backend %s, expecting %d", responseHeader->id, ds.getNameWithAddr(), requestHeader->id); return false; - if (!responseHeader->qr) + } + + if (!responseHeader->qr) { + if (g_verboseHealthChecks) + vinfolog("Invalid health check response from backend %s, expecting QR to be set", ds.getNameWithAddr()); return false; - if (responseHeader->rcode == RCode::ServFail) + } + + if (responseHeader->rcode == RCode::ServFail) { + if (g_verboseHealthChecks) + vinfolog("Backend %s responded to health check with ServFail", ds.getNameWithAddr()); return false; - if (ds.mustResolve && (responseHeader->rcode == RCode::NXDomain || responseHeader->rcode == RCode::Refused)) + } + + if (ds.mustResolve && (responseHeader->rcode == RCode::NXDomain || responseHeader->rcode == RCode::Refused)) { + if (g_verboseHealthChecks) + vinfolog("Backend %s responded to health check with %s while mustResolve is set", ds.getNameWithAddr(), responseHeader->rcode == RCode::NXDomain ? 
"NXDomain" : "Refused"); return false; + } // XXX fixme do bunch of checking here etc return true; } +catch(const std::exception& e) +{ + if (g_verboseHealthChecks) + vinfolog("Error checking the health of backend %s: %s", ds.getNameWithAddr(), e.what()); + return false; +} catch(...) { + if (g_verboseHealthChecks) + vinfolog("Unknown exception while checking the health of backend %s", ds.getNameWithAddr()); return false; } @@ -980,10 +1022,18 @@ void* healthChecksThread() for(auto& dss : g_dstates.getCopy()) { // this points to the actual shared_ptrs! if(dss->availability==DownstreamState::Availability::Auto) { bool newState=upCheck(*dss); + if (!newState && dss->upStatus) { + dss->currentCheckFailures++; + if (dss->currentCheckFailures < dss->maxCheckFailures) { + newState = true; + } + } + if(newState != dss->upStatus) { warnlog("Marking downstream %s as '%s'", dss->getNameWithAddr(), newState ? "up" : "down"); + dss->upStatus = newState; + dss->currentCheckFailures = 0; } - dss->upStatus = newState; } auto delta = dss->sw.udiffAndSet()/1000000.0; diff --git a/pdns/dnsdist.hh b/pdns/dnsdist.hh index f44fbbad5..d979d4e5e 100644 --- a/pdns/dnsdist.hh +++ b/pdns/dnsdist.hh @@ -325,6 +325,8 @@ struct DownstreamState int tcpSendTimeout{30}; unsigned int sourceItf{0}; uint16_t retries{5}; + uint8_t currentCheckFailures{0}; + uint8_t maxCheckFailures{1}; StopWatch sw; set pools; enum class Availability { Up, Down, Auto} availability{Availability::Auto}; @@ -466,6 +468,7 @@ extern std::atomic g_cacheCleaningDelay; extern uint16_t g_ECSSourcePrefixV4; extern uint16_t g_ECSSourcePrefixV6; extern bool g_ECSOverride; +extern bool g_verboseHealthChecks; struct dnsheader; -- 2.40.0