From 840ed663290f106dfcc37b7eaf20653d8bc5f328 Mon Sep 17 00:00:00 2001 From: Remi Gacogne Date: Thu, 16 Mar 2017 18:05:59 +0100 Subject: [PATCH] dnsdist: Cleanup closed TCP downstream connections Regularly walk the TCP downstream connections to properly close those that have been shut down by the other end. It occurs only after a TCP client connection has been closed and at most every `setTCPDownstreamCleanupInterval()` seconds, defaulting to 60s. Until now we only detected that the other end closed the connection when we tried to reuse it. While this is not an issue with a small number of backends because the connections are reused pretty quickly, with a large number of backends dnsdist might end up with thousands of idle TCP connections to downstream servers in `CLOSE_WAIT` state, wasting open file descriptors. --- pdns/README-dnsdist.md | 1 + pdns/dnsdist-console.cc | 1 + pdns/dnsdist-lua2.cc | 5 +++++ pdns/dnsdist-tcp.cc | 20 ++++++++++++++++++++ pdns/dnsdist.hh | 1 + pdns/iputils.cc | 42 +++++++++++++++++++++++++++++++++++++++++ pdns/iputils.hh | 2 ++ 7 files changed, 72 insertions(+) diff --git a/pdns/README-dnsdist.md b/pdns/README-dnsdist.md index c3d045074..348cc7d33 100644 --- a/pdns/README-dnsdist.md +++ b/pdns/README-dnsdist.md @@ -1614,6 +1614,7 @@ instantiate a server with additional parameters * `setCacheCleaningDelay(n)`: set the interval in seconds between two runs of the cache cleaning algorithm, removing expired entries * `setCacheCleaningPercentage(n)`: set the percentage of the cache that the cache cleaning algorithm will try to free by removing expired entries. By default (100), all expired entries are removed * `setStaleCacheEntriesTTL(n)`: allows using cache entries expired for at most `n` seconds when no backend available to answer for a query + * `setTCPDownstreamCleanupInterval(interval)`: minimum interval in seconds between two cleanups of the idle TCP downstream connections. 
Defaults to 60s * `setTCPUseSinglePipe(bool)`: whether the incoming TCP connections should be put into a single queue instead of using per-thread queues. Defaults to false * `setTCPRecvTimeout(n)`: set the read timeout on TCP connections from the client, in seconds * `setTCPSendTimeout(n)`: set the write timeout on TCP connections from the client, in seconds diff --git a/pdns/dnsdist-console.cc b/pdns/dnsdist-console.cc index e73b9541c..5da8dc332 100644 --- a/pdns/dnsdist-console.cc +++ b/pdns/dnsdist-console.cc @@ -375,6 +375,7 @@ const std::vector g_consoleKeywords{ { "setServerPolicy", true, "policy", "set server selection policy to that policy" }, { "setServerPolicyLua", true, "name, function", "set server selection policy to one named 'name' and provided by 'function'" }, { "setServFailWhenNoServer", true, "bool", "if set, return a ServFail when no servers are available, instead of the default behaviour of dropping the query" }, + { "setTCPDownstreamCleanupInterval", true, "interval", "minimum interval in seconds between two cleanups of the idle TCP downstream connections" }, { "setTCPUseSinglePipe", true, "bool", "whether the incoming TCP connections should be put into a single queue instead of using per-thread queues. 
Defaults to false" }, { "setTCPRecvTimeout", true, "n", "set the read timeout on TCP connections from the client, in seconds" }, { "setTCPSendTimeout", true, "n", "set the write timeout on TCP connections from the client, in seconds" }, diff --git a/pdns/dnsdist-lua2.cc b/pdns/dnsdist-lua2.cc index 0c3ebc919..d698f5a27 100644 --- a/pdns/dnsdist-lua2.cc +++ b/pdns/dnsdist-lua2.cc @@ -1327,4 +1327,9 @@ void moreLua(bool client) g_outputBuffer=poolObj->policy->name+"\n"; } }); + + g_lua.writeFunction("setTCPDownstreamCleanupInterval", [](uint16_t interval) { + setLuaSideEffect(); + g_downstreamTCPCleanupInterval = interval; + }); } diff --git a/pdns/dnsdist-tcp.cc b/pdns/dnsdist-tcp.cc index 75d1caf74..95e3ac204 100644 --- a/pdns/dnsdist-tcp.cc +++ b/pdns/dnsdist-tcp.cc @@ -93,6 +93,7 @@ size_t g_maxTCPConnectionsPerClient{0}; static std::mutex tcpClientsCountMutex; static std::map tcpClientsCount; bool g_useTCPSinglePipe{false}; +std::atomic g_downstreamTCPCleanupInterval{60}; void* tcpClientThread(int pipefd); @@ -194,6 +195,19 @@ static bool maxConnectionDurationReached(unsigned int maxConnectionDuration, tim return false; } +void cleanupClosedTCPConnections(std::map& sockets) +{ + for(auto it = sockets.begin(); it != sockets.end(); ) { + if (isTCPSocketUsable(it->second)) { + ++it; + } + else { + close(it->second); + it = sockets.erase(it); + } + } +} + std::shared_ptr g_tcpclientthreads; void* tcpClientThread(int pipefd) @@ -203,6 +217,7 @@ void* tcpClientThread(int pipefd) bool outstanding = false; blockfilter_t blockFilter = 0; + time_t lastTCPCleanup = time(nullptr); { std::lock_guard lock(g_luamutex); @@ -602,6 +617,11 @@ void* tcpClientThread(int pipefd) --ds->outstanding; } decrementTCPClientCount(ci.remote); + + if (g_downstreamTCPCleanupInterval > 0 && (connectionStartTime > (lastTCPCleanup + g_downstreamTCPCleanupInterval))) { + cleanupClosedTCPConnections(sockets); + lastTCPCleanup = time(nullptr); + } } return 0; } diff --git a/pdns/dnsdist.hh 
b/pdns/dnsdist.hh index 100dd43b9..36823820d 100644 --- a/pdns/dnsdist.hh +++ b/pdns/dnsdist.hh @@ -673,6 +673,7 @@ extern std::string g_apiConfigDirectory; extern bool g_servFailOnNoPolicy; extern uint32_t g_hashperturb; extern bool g_useTCPSinglePipe; +extern std::atomic g_downstreamTCPCleanupInterval; struct ConsoleKeyword { std::string name; diff --git a/pdns/iputils.cc b/pdns/iputils.cc index f06e08e0b..8b8601966 100644 --- a/pdns/iputils.cc +++ b/pdns/iputils.cc @@ -401,3 +401,45 @@ bool sendSizeAndMsgWithTimeout(int sock, uint16_t bufferLen, const char* buffer, return false; } + +/* requires a non-blocking socket. + On Linux, we could use MSG_DONTWAIT on a blocking socket + but this is not portable. +*/ +bool isTCPSocketUsable(int sock) +{ + int err = 0; + char buf = '\0'; + size_t buf_size = sizeof(buf); + + do { + ssize_t got = recv(sock, &buf, buf_size, MSG_PEEK); + + if (got > 0) { + /* socket is usable, some data is even waiting to be read */ + return true; + } + else if (got == 0) { + /* other end has closed the socket */ + return false; + } + else { + /* assign the outer err (no shadowing) so the EINTR retry loop below actually retries */ + err = errno; + + if (err == EAGAIN || err == EWOULDBLOCK) { + /* socket is usable, no data waiting */ + return true; + } + else { + if (err != EINTR) { + /* something is wrong, could be ECONNRESET, + ENOTCONN, EPIPE, but anyway this socket is + not usable. 
*/ + return false; + } + } + } + } while (err == EINTR); + + return false; +} diff --git a/pdns/iputils.hh b/pdns/iputils.hh index 15be20d83..5c53fa6d9 100644 --- a/pdns/iputils.hh +++ b/pdns/iputils.hh @@ -911,6 +911,8 @@ void fillMSGHdr(struct msghdr* msgh, struct iovec* iov, char* cbuf, size_t cbufs ssize_t sendfromto(int sock, const char* data, size_t len, int flags, const ComboAddress& from, const ComboAddress& to); ssize_t sendMsgWithTimeout(int fd, const char* buffer, size_t len, int timeout, ComboAddress& dest, const ComboAddress& local, unsigned int localItf); bool sendSizeAndMsgWithTimeout(int sock, uint16_t bufferLen, const char* buffer, int idleTimeout, const ComboAddress* dest, const ComboAddress* local, unsigned int localItf, int totalTimeout, int flags); +/* requires a non-blocking, connected TCP socket */ +bool isTCPSocketUsable(int sock); extern template class NetmaskTree; -- 2.40.0