From 06e381ceeae5199fa197542a4384e060126cf46b Mon Sep 17 00:00:00 2001 From: Noah Hilverling Date: Thu, 18 Jan 2018 15:22:16 +0100 Subject: [PATCH] Add metrics about RemoteCheckQueue to Icinga check, API and logs refs #4841 --- doc/09-object-types.md | 2 +- lib/icinga/cib.cpp | 3 +++ lib/icinga/clusterevents-check.cpp | 41 +++++++++++++++++++++++++++++- lib/icinga/clusterevents.hpp | 6 +++++ lib/methods/icingachecktask.cpp | 3 +++ 5 files changed, 53 insertions(+), 2 deletions(-) diff --git a/doc/09-object-types.md b/doc/09-object-types.md index cb8f35ccf..9e2b5b3c6 100644 --- a/doc/09-object-types.md +++ b/doc/09-object-types.md @@ -256,7 +256,7 @@ Configuration Attributes: Name | Type | Description --------------------------|-----------------------|---------------------------------- - concurrent\_checks | Number | **Optional and Deprecated.** The maximum number of concurrent checks. Was replaced by global constant `MaxConcurrentChecks` which will be set if you still use `concurrent_checks`. + concurrent\_checks | Number | **Optional and deprecated.** The maximum number of concurrent checks. It was replaced by the global constant `MaxConcurrentChecks`, which will be set if you still use `concurrent_checks`.
## CheckResultReader diff --git a/lib/icinga/cib.cpp b/lib/icinga/cib.cpp index 7da5e3c8d..e68df57ad 100644 --- a/lib/icinga/cib.cpp +++ b/lib/icinga/cib.cpp @@ -20,6 +20,7 @@ #include "icinga/cib.hpp" #include "icinga/host.hpp" #include "icinga/service.hpp" +#include "icinga/clusterevents.hpp" #include "base/objectlock.hpp" #include "base/utility.hpp" #include "base/perfdatavalue.hpp" @@ -305,6 +306,8 @@ void CIB::StatsFunc(const Dictionary::Ptr& status, const Array::Ptr& perfdata) { status->Set("active_service_checks_15min", GetActiveServiceChecksStatistics(60 * 15)); status->Set("passive_service_checks_15min", GetPassiveServiceChecksStatistics(60 * 15)); + status->Set("remote_check_queue", ClusterEvents::GetCheckRequestQueueSize()); + CheckableCheckStatistics scs = CalculateServiceCheckStats(); status->Set("min_latency", scs.min_latency); diff --git a/lib/icinga/clusterevents-check.cpp b/lib/icinga/clusterevents-check.cpp index 41e2be237..58d983eb8 100644 --- a/lib/icinga/clusterevents-check.cpp +++ b/lib/icinga/clusterevents-check.cpp @@ -21,6 +21,7 @@ #include "remote/apilistener.hpp" #include "base/serializer.hpp" #include "base/exception.hpp" +#include #include using namespace icinga; @@ -28,6 +29,9 @@ using namespace icinga; boost::mutex ClusterEvents::m_Mutex; std::deque> ClusterEvents::m_CheckRequestQueue; bool ClusterEvents::m_CheckSchedulerRunning; +int ClusterEvents::m_ChecksExecutedDuringInterval; +int ClusterEvents::m_ChecksDroppedDuringInterval; +Timer::Ptr ClusterEvents::m_LogTimer; void ClusterEvents::RemoteCheckThreadProc() { @@ -45,6 +49,7 @@ void ClusterEvents::RemoteCheckThreadProc() auto callback = m_CheckRequestQueue.front(); m_CheckRequestQueue.pop_front(); + m_ChecksExecutedDuringInterval++; lock.unlock(); callback(); @@ -58,10 +63,19 @@ void ClusterEvents::RemoteCheckThreadProc() void ClusterEvents::EnqueueCheck(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params) { + static boost::once_flag once = BOOST_ONCE_INIT; + + 
boost::call_once(once, []() { + m_LogTimer = new Timer(); + m_LogTimer->SetInterval(10); + m_LogTimer->OnTimerExpired.connect(std::bind(ClusterEvents::LogRemoteCheckQueueInformation)); + m_LogTimer->Start(); + }); + boost::mutex::scoped_lock lock(m_Mutex); if (m_CheckRequestQueue.size() >= 25000) { - Log(LogCritical, "ClusterEvents", "Remote check queue ran out of slots. Discarding remote check request."); + m_ChecksDroppedDuringInterval++; return; } @@ -184,3 +198,28 @@ void ClusterEvents::ExecuteCheckFromQueue(const MessageOrigin::Ptr& origin, cons } } +int ClusterEvents::GetCheckRequestQueueSize() +{ + return m_CheckRequestQueue.size(); +} + +void ClusterEvents::LogRemoteCheckQueueInformation() { + if (m_ChecksDroppedDuringInterval > 0) { + Log(LogCritical, "ClusterEvents") + << "Remote check queue ran out of slots. " + << m_ChecksDroppedDuringInterval << " checks dropped."; + m_ChecksDroppedDuringInterval = 0; + } + + if (m_ChecksExecutedDuringInterval == 0) + return; + + Log(LogInformation, "RemoteCheckQueue") + << "items: " << m_CheckRequestQueue.size() + << ", rate: " << m_ChecksExecutedDuringInterval / 10 << "/s " + << "(" << m_ChecksExecutedDuringInterval * 6 << "/min " + << m_ChecksExecutedDuringInterval * 6 * 5 << "/5min " + << m_ChecksExecutedDuringInterval * 6 * 15 << "/15min" << ");"; + + m_ChecksExecutedDuringInterval = 0; +} \ No newline at end of file diff --git a/lib/icinga/clusterevents.hpp b/lib/icinga/clusterevents.hpp index d712b7fb8..5b8acf78a 100644 --- a/lib/icinga/clusterevents.hpp +++ b/lib/icinga/clusterevents.hpp @@ -75,10 +75,16 @@ public: NotificationType notificationType, const CheckResult::Ptr& cr, const String& author, const String& commentText, const MessageOrigin::Ptr& origin); static Value NotificationSentToAllUsersAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params); + static int GetCheckRequestQueueSize(); + static void LogRemoteCheckQueueInformation(); + private: static boost::mutex m_Mutex; static 
std::deque> m_CheckRequestQueue; static bool m_CheckSchedulerRunning; + static int m_ChecksExecutedDuringInterval; + static int m_ChecksDroppedDuringInterval; + static Timer::Ptr m_LogTimer; static void RemoteCheckThreadProc(); static void EnqueueCheck(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params); diff --git a/lib/methods/icingachecktask.cpp b/lib/methods/icingachecktask.cpp index 90e9390bd..16f58e71d 100644 --- a/lib/methods/icingachecktask.cpp +++ b/lib/methods/icingachecktask.cpp @@ -23,6 +23,7 @@ #include "icinga/checkcommand.hpp" #include "icinga/macroprocessor.hpp" #include "icinga/icingaapplication.hpp" +#include "icinga/clusterevents.hpp" #include "base/application.hpp" #include "base/objectlock.hpp" #include "base/utility.hpp" @@ -84,6 +85,8 @@ void IcingaCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckRes perfdata->Add(new PerfdataValue("active_service_checks_15min", CIB::GetActiveServiceChecksStatistics(60 * 15))); perfdata->Add(new PerfdataValue("passive_service_checks_15min", CIB::GetPassiveServiceChecksStatistics(60 * 15))); + perfdata->Add(new PerfdataValue("remote_check_queue", ClusterEvents::GetCheckRequestQueueSize())); + CheckableCheckStatistics scs = CIB::CalculateServiceCheckStats(); perfdata->Add(new PerfdataValue("min_latency", scs.min_latency)); -- 2.40.0