str<<base<<"latency" << ' ' << (state->availability != DownstreamState::Availability::Down ? state->latencyUsec/1000.0 : 0) << " " << now << "\r\n";
str<<base<<"senderrors" << ' ' << state->sendErrors.load() << " " << now << "\r\n";
str<<base<<"outstanding" << ' ' << state->outstanding.load() << " " << now << "\r\n";
+ str<<base<<"tcpdiedsendingquery" << ' ' << state->tcpDiedSendingQuery.load() << " " << now << "\r\n"; /* per-server TCP counters; single-space separator, same "<name> <value> <now>" layout as the lines above */
+ str<<base<<"tcpdiedreadingresponse" << ' ' << state->tcpDiedReadingResponse.load() << " " << now << "\r\n";
+ str<<base<<"tcpgaveup" << ' ' << state->tcpGaveUp.load() << " " << now << "\r\n";
+ str<<base<<"tcpreadtimeouts" << ' ' << state->tcpReadTimeouts.load() << " " << now << "\r\n";
+ str<<base<<"tcpwritetimeouts" << ' ' << state->tcpWriteTimeouts.load() << " " << now << "\r\n";
}
for(const auto& front : g_frontends) {
if (front->udpFD == -1 && front->tcpFD == -1)
boost::replace_all(frontName, ".", "_");
const string base = namespace_name + "." + hostname + "." + instance_name + ".frontends." + frontName + ".";
str<<base<<"queries" << ' ' << front->queries.load() << " " << now << "\r\n";
+ str<<base<<"tcpdiedreadingquery" << ' ' << front->tcpDiedReadingQuery.load() << " " << now << "\r\n"; /* per-frontend TCP counters; single-space separator, same layout as the "queries" line above */
+ str<<base<<"tcpdiedsendingresponse" << ' ' << front->tcpDiedSendingResponse.load() << " " << now << "\r\n";
+ str<<base<<"tcpgaveup" << ' ' << front->tcpGaveUp.load() << " " << now << "\r\n";
+ str<<base<<"tcpclienttimeouts" << ' ' << front->tcpClientTimeouts.load() << " " << now << "\r\n";
+ str<<base<<"tcpdownstreamtimeouts" << ' ' << front->tcpDownstreamTimeouts.load() << " " << now << "\r\n";
}
auto localPools = g_pools.getLocal();
for (const auto& entry : *localPools) {
static void releaseDownstreamConnection(std::shared_ptr<DownstreamState>& ds, std::unique_ptr<Socket>&& socket)
{
+ if (socket == nullptr) {
+ return;
+ }
+
const auto& it = t_downstreamSockets.find(ds->remote);
if (it != t_downstreamSockets.end()) {
auto& list = it->second;
}
catch (const std::exception& e) {
vinfolog("Got an exception while writing TCP response to %s: %s", state->d_ci.remote.toStringWithPort(), e.what());
+ ++state->d_ci.cs->tcpDiedSendingResponse;
handleNewIOState(state, IOState::Done, state->d_ci.fd, handleIOCallback);
}
}
state->d_downstreamSocket = getConnectionToDownstream(ds, state->d_downstreamFailures, state->d_freshDownstreamConnection);
if (!state->d_downstreamSocket) {
+ ++ds->tcpGaveUp;
+ ++state->d_ci.cs->tcpGaveUp;
vinfolog("Downstream connection to %s failed %d times in a row, giving up.", ds->getName(), state->d_downstreamFailures);
return;
}
return;
}
+ ++ds->tcpGaveUp;
+ ++state->d_ci.cs->tcpGaveUp;
vinfolog("Downstream connection to %s failed %u times in a row, giving up.", ds->getName(), state->d_downstreamFailures);
}
Let's just drop the connection
*/
vinfolog("Got an exception while handling (%s backend) TCP query from %s: %s", (state->d_lastIOState == IOState::NeedRead ? "reading from" : "writing to"), state->d_ci.remote.toStringWithPort(), e.what());
+ if (state->d_state == IncomingTCPConnectionState::State::sendingQueryToBackend) {
+ ++state->d_ds->tcpDiedSendingQuery;
+ }
+ else {
+ ++state->d_ds->tcpDiedReadingResponse;
+ }
+
/* don't increase this counter when reusing connections */
if (state->d_freshDownstreamConnection) {
++state->d_downstreamFailures;
but it might also be a real IO error or something else.
Let's just drop the connection
*/
+ if (state->d_state == IncomingTCPConnectionState::State::doingHandshake ||
+ state->d_state == IncomingTCPConnectionState::State::readingQuerySize ||
+ state->d_state == IncomingTCPConnectionState::State::readingQuery) {
+ ++state->d_ci.cs->tcpDiedReadingQuery;
+ }
+ else if (state->d_state == IncomingTCPConnectionState::State::sendingResponse) {
+ ++state->d_ci.cs->tcpDiedSendingResponse;
+ }
+
if (state->d_lastIOState == IOState::NeedWrite || state->d_readingFirstQuery) {
vinfolog("Got an exception while handling (%s) TCP query from %s: %s", (state->d_lastIOState == IOState::NeedRead ? "reading" : "writing"), state->d_ci.remote.toStringWithPort(), e.what());
}
auto state = boost::any_cast<std::shared_ptr<IncomingTCPConnectionState>>(conn.second);
if (conn.first == state->d_ci.fd) {
vinfolog("Timeout (read) from remote TCP client %s", state->d_ci.remote.toStringWithPort());
+ ++state->d_ci.cs->tcpClientTimeouts;
}
else if (state->d_ds) {
vinfolog("Timeout (read) from remote backend %s", state->d_ds->getName());
+ ++state->d_ci.cs->tcpDownstreamTimeouts;
+ ++state->d_ds->tcpReadTimeouts;
}
data.mplexer->removeReadFD(conn.first);
state->d_lastIOState = IOState::Done;
auto state = boost::any_cast<std::shared_ptr<IncomingTCPConnectionState>>(conn.second);
if (conn.first == state->d_ci.fd) {
vinfolog("Timeout (write) from remote TCP client %s", state->d_ci.remote.toStringWithPort());
+ ++state->d_ci.cs->tcpClientTimeouts;
}
else if (state->d_ds) {
vinfolog("Timeout (write) from remote backend %s", state->d_ds->getName());
+ ++state->d_ci.cs->tcpDownstreamTimeouts;
+ ++state->d_ds->tcpWriteTimeouts;
}
data.mplexer->removeWriteFD(conn.first);
state->d_lastIOState = IOState::Done;
auto states = g_dstates.getLocal();
const string statesbase = "dnsdist_server_";
- output << "# HELP " << statesbase << "queries " << "Amount of queries relayed to server" << "\n";
- output << "# TYPE " << statesbase << "queries " << "counter" << "\n";
- output << "# HELP " << statesbase << "drops " << "Amount of queries not answered by server" << "\n";
- output << "# TYPE " << statesbase << "drops " << "counter" << "\n";
- output << "# HELP " << statesbase << "latency " << "Server's latency when answering questions in miliseconds" << "\n";
- output << "# TYPE " << statesbase << "latency " << "gauge" << "\n";
- output << "# HELP " << statesbase << "senderrors " << "Total number of OS snd errors while relaying queries" << "\n";
- output << "# TYPE " << statesbase << "senderrors " << "counter" << "\n";
- output << "# HELP " << statesbase << "outstanding " << "Current number of queries that are waiting for a backend response" << "\n";
- output << "# TYPE " << statesbase << "outstanding " << "gauge" << "\n";
- output << "# HELP " << statesbase << "order " << "The order in which this server is picked" << "\n";
- output << "# TYPE " << statesbase << "order " << "gauge" << "\n";
- output << "# HELP " << statesbase << "weight " << "The weight within the order in which this server is picked" << "\n";
- output << "# TYPE " << statesbase << "weight " << "gauge" << "\n";
+ output << "# HELP " << statesbase << "queries " << "Amount of queries relayed to server" << "\n";
+ output << "# TYPE " << statesbase << "queries " << "counter" << "\n";
+ output << "# HELP " << statesbase << "drops " << "Amount of queries not answered by server" << "\n";
+ output << "# TYPE " << statesbase << "drops " << "counter" << "\n";
+ output << "# HELP " << statesbase << "latency " << "Server's latency when answering questions in milliseconds" << "\n"; /* value emitted below is latencyUsec/1000.0 */
+ output << "# TYPE " << statesbase << "latency " << "gauge" << "\n";
+ output << "# HELP " << statesbase << "senderrors " << "Total number of OS snd errors while relaying queries" << "\n";
+ output << "# TYPE " << statesbase << "senderrors " << "counter" << "\n";
+ output << "# HELP " << statesbase << "outstanding " << "Current number of queries that are waiting for a backend response" << "\n";
+ output << "# TYPE " << statesbase << "outstanding " << "gauge" << "\n";
+ output << "# HELP " << statesbase << "order " << "The order in which this server is picked" << "\n";
+ output << "# TYPE " << statesbase << "order " << "gauge" << "\n";
+ output << "# HELP " << statesbase << "weight " << "The weight within the order in which this server is picked" << "\n";
+ output << "# TYPE " << statesbase << "weight " << "gauge" << "\n";
+ output << "# HELP " << statesbase << "tcpdiedsendingquery " << "The number of TCP I/O errors while sending the query" << "\n";
+ output << "# TYPE " << statesbase << "tcpdiedsendingquery " << "counter" << "\n";
+ output << "# HELP " << statesbase << "tcpdiedreadingresponse " << "The number of TCP I/O errors while reading the response" << "\n";
+ output << "# TYPE " << statesbase << "tcpdiedreadingresponse " << "counter" << "\n";
+ output << "# HELP " << statesbase << "tcpgaveup " << "The number of TCP connections failing after too many attempts" << "\n";
+ output << "# TYPE " << statesbase << "tcpgaveup " << "counter" << "\n";
+ output << "# HELP " << statesbase << "tcpreadtimeouts " << "The number of TCP read timeouts" << "\n";
+ output << "# TYPE " << statesbase << "tcpreadtimeouts " << "counter" << "\n";
+ output << "# HELP " << statesbase << "tcpwritetimeouts " << "The number of TCP write timeouts" << "\n";
+ output << "# TYPE " << statesbase << "tcpwritetimeouts " << "counter" << "\n";
for (const auto& state : *states) {
string serverName;
const std::string label = boost::str(boost::format("{server=\"%1%\",address=\"%2%\"}")
% serverName % state->remote.toStringWithPort());
- output << statesbase << "queries" << label << " " << state->queries.load() << "\n";
- output << statesbase << "drops" << label << " " << state->reuseds.load() << "\n";
- output << statesbase << "latency" << label << " " << state->latencyUsec/1000.0 << "\n";
- output << statesbase << "senderrors" << label << " " << state->sendErrors.load() << "\n";
- output << statesbase << "outstanding" << label << " " << state->outstanding.load() << "\n";
- output << statesbase << "order" << label << " " << state->order << "\n";
- output << statesbase << "weight" << label << " " << state->weight << "\n";
+ output << statesbase << "queries" << label << " " << state->queries.load() << "\n";
+ output << statesbase << "drops" << label << " " << state->reuseds.load() << "\n";
+ output << statesbase << "latency" << label << " " << state->latencyUsec/1000.0 << "\n";
+ output << statesbase << "senderrors" << label << " " << state->sendErrors.load() << "\n";
+ output << statesbase << "outstanding" << label << " " << state->outstanding.load() << "\n";
+ output << statesbase << "order" << label << " " << state->order << "\n";
+ output << statesbase << "weight" << label << " " << state->weight << "\n";
+ output << statesbase << "tcpdiedsendingquery" << label << " " << state->tcpDiedSendingQuery.load() << "\n"; /* explicit .load(), consistent with the counters printed above */
+ output << statesbase << "tcpdiedreadingresponse" << label << " " << state->tcpDiedReadingResponse.load() << "\n";
+ output << statesbase << "tcpgaveup" << label << " " << state->tcpGaveUp.load() << "\n";
+ output << statesbase << "tcpreadtimeouts" << label << " " << state->tcpReadTimeouts.load() << "\n";
+ output << statesbase << "tcpwritetimeouts" << label << " " << state->tcpWriteTimeouts.load() << "\n";
}
for (const auto& front : g_frontends) {
{"latency", (double)(a->latencyUsec/1000.0)},
{"queries", (double)a->queries},
{"sendErrors", (double)a->sendErrors},
+ {"tcpDiedSendingQuery", (double)a->tcpDiedSendingQuery},
+ {"tcpDiedReadingResponse", (double)a->tcpDiedReadingResponse},
+ {"tcpGaveUp", (double)a->tcpGaveUp},
+ {"tcpReadTimeouts", (double)a->tcpReadTimeouts},
+ {"tcpWriteTimeouts", (double)a->tcpWriteTimeouts},
{"dropRate", (double)a->dropRate}
};
{ "address", front->local.toStringWithPort() },
{ "udp", front->udpFD >= 0 },
{ "tcp", front->tcpFD >= 0 },
- { "queries", (double) front->queries.load() }
+ { "queries", (double) front->queries.load() },
+ { "tcpDiedReadingQuery", (double) front->tcpDiedReadingQuery.load() },
+ { "tcpDiedSendingResponse", (double) front->tcpDiedSendingResponse.load() },
+ { "tcpGaveUp", (double) front->tcpGaveUp.load() },
+ { "tcpClientTimeouts", (double) front->tcpClientTimeouts.load() }, /* explicit .load(), matching the entries above */
+ { "tcpDownstreamTimeouts", (double) front->tcpDownstreamTimeouts.load() },
};
frontends.push_back(frontend);
}