granicus.if.org Git - icinga2/commitdiff
Add metrics about RemoteCheckQueue to Icinga check, API and logs
author Noah Hilverling <noah.hilverling@icinga.com>
Thu, 18 Jan 2018 14:22:16 +0000 (15:22 +0100)
committer Noah Hilverling <noah.hilverling@icinga.com>
Mon, 29 Jan 2018 15:07:35 +0000 (16:07 +0100)
refs #4841

doc/09-object-types.md
lib/icinga/cib.cpp
lib/icinga/clusterevents-check.cpp
lib/icinga/clusterevents.hpp
lib/methods/icingachecktask.cpp

index cb8f35ccfe9e383740acf5ef23e1a99ff3c003b7..9e2b5b3c601d83ccde94a2279ecad76aad274c66 100644 (file)
@@ -256,7 +256,7 @@ Configuration Attributes:
 
   Name                      | Type                  | Description
   --------------------------|-----------------------|----------------------------------
-  concurrent\_checks        | Number                | **Optional and Deprecated.** The maximum number of concurrent checks. Was replaced by global constant `MaxConcurrentChecks` which will be set if you still use `concurrent_checks`.
+  concurrent\_checks        | Number                | **Optional and deprecated.** The maximum number of concurrent checks. Was replaced by global constant `MaxConcurrentChecks` which will be set if you still use `concurrent_checks`.
 
 ## CheckResultReader <a id="objecttype-checkresultreader"></a>
 
index 7da5e3c8d0e36c8c2962bf43fab219de98faa341..e68df57aded2c0ed42a156d415480bd01f0c02cf 100644 (file)
@@ -20,6 +20,7 @@
 #include "icinga/cib.hpp"
 #include "icinga/host.hpp"
 #include "icinga/service.hpp"
+#include "icinga/clusterevents.hpp"
 #include "base/objectlock.hpp"
 #include "base/utility.hpp"
 #include "base/perfdatavalue.hpp"
@@ -305,6 +306,8 @@ void CIB::StatsFunc(const Dictionary::Ptr& status, const Array::Ptr& perfdata) {
        status->Set("active_service_checks_15min", GetActiveServiceChecksStatistics(60 * 15));
        status->Set("passive_service_checks_15min", GetPassiveServiceChecksStatistics(60 * 15));
 
+       status->Set("remote_check_queue", ClusterEvents::GetCheckRequestQueueSize());
+
        CheckableCheckStatistics scs = CalculateServiceCheckStats();
 
        status->Set("min_latency", scs.min_latency);
index 41e2be2378037b294693fdbc3c4fe98360eaa688..58d983eb863677c981bec5cfd8396140af89821b 100644 (file)
@@ -21,6 +21,7 @@
 #include "remote/apilistener.hpp"
 #include "base/serializer.hpp"
 #include "base/exception.hpp"
+#include <boost/thread/once.hpp>
 #include <thread>
 
 using namespace icinga;
@@ -28,6 +29,9 @@ using namespace icinga;
 boost::mutex ClusterEvents::m_Mutex;
 std::deque<std::function<void ()>> ClusterEvents::m_CheckRequestQueue;
 bool ClusterEvents::m_CheckSchedulerRunning;
+int ClusterEvents::m_ChecksExecutedDuringInterval; // Incremented in RemoteCheckThreadProc() per dequeued callback; reset in LogRemoteCheckQueueInformation().
+int ClusterEvents::m_ChecksDroppedDuringInterval; // Incremented in EnqueueCheck() when the queue already holds >= 25000 entries; reset in LogRemoteCheckQueueInformation().
+Timer::Ptr ClusterEvents::m_LogTimer; // Created and started once (boost::call_once) on the first EnqueueCheck() call.
 
 void ClusterEvents::RemoteCheckThreadProc()
 {
@@ -45,6 +49,7 @@ void ClusterEvents::RemoteCheckThreadProc()
 
                auto callback = m_CheckRequestQueue.front();
                m_CheckRequestQueue.pop_front();
+               m_ChecksExecutedDuringInterval++;
                lock.unlock();
 
                callback();
@@ -58,10 +63,19 @@ void ClusterEvents::RemoteCheckThreadProc()
 
 void ClusterEvents::EnqueueCheck(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params)
 {
+       static boost::once_flag once = BOOST_ONCE_INIT;
+
+       boost::call_once(once, []() {
+               m_LogTimer = new Timer();
+               m_LogTimer->SetInterval(10);
+               m_LogTimer->OnTimerExpired.connect(std::bind(ClusterEvents::LogRemoteCheckQueueInformation));
+               m_LogTimer->Start();
+       });
+
        boost::mutex::scoped_lock lock(m_Mutex);
 
        if (m_CheckRequestQueue.size() >= 25000) {
-               Log(LogCritical, "ClusterEvents", "Remote check queue ran out of slots. Discarding remote check request.");
+               m_ChecksDroppedDuringInterval++;
                return;
        }
 
@@ -184,3 +198,28 @@ void ClusterEvents::ExecuteCheckFromQueue(const MessageOrigin::Ptr& origin, cons
        }
 }
 
+int ClusterEvents::GetCheckRequestQueueSize() // Returns the current number of entries in m_CheckRequestQueue.
+{
+       return m_CheckRequestQueue.size(); // NOTE(review): no m_Mutex lock here, although EnqueueCheck() locks it before touching the queue - confirm this is safe; size_t is narrowed to int.
+}
+
+void ClusterEvents::LogRemoteCheckQueueInformation() { // Logs, then resets, the dropped/executed counters; connected to m_LogTimer's OnTimerExpired in EnqueueCheck().
+       if (m_ChecksDroppedDuringInterval > 0) { // Only emit the critical message when something was dropped since the last reset.
+               Log(LogCritical, "ClusterEvents")
+                       << "Remote check queue ran out of slots. "
+                       << m_ChecksDroppedDuringInterval << " checks dropped.";
+               m_ChecksDroppedDuringInterval = 0; // NOTE(review): counters are incremented under m_Mutex elsewhere but read/reset here without it - confirm.
+       }
+
+       if (m_ChecksExecutedDuringInterval == 0) // Nothing executed since the last reset: skip the rate line.
+               return;
+
+       Log(LogInformation, "RemoteCheckQueue")
+               << "items: " << m_CheckRequestQueue.size() // NOTE(review): queue size is read without m_Mutex - confirm.
+               << ", rate: " << m_ChecksExecutedDuringInterval / 10 << "/s " // Integer division (truncates); the 10 presumably mirrors SetInterval(10) - verify units are seconds.
+               << "(" << m_ChecksExecutedDuringInterval * 6 << "/min " // The /min, /5min and /15min figures are extrapolated from this one interval's count, not measured.
+               << m_ChecksExecutedDuringInterval * 6 * 5 << "/5min "
+               << m_ChecksExecutedDuringInterval * 6 * 15 << "/15min" << ");";
+
+       m_ChecksExecutedDuringInterval = 0; // Start a fresh count for the next timer tick.
+}
\ No newline at end of file
index d712b7fb808b87611211716d3dba7d032cb2408e..5b8acf78a848c5ff08859e680c9e50bcfbd047f2 100644 (file)
@@ -75,10 +75,16 @@ public:
                NotificationType notificationType, const CheckResult::Ptr& cr, const String& author, const String& commentText, const MessageOrigin::Ptr& origin);
        static Value NotificationSentToAllUsersAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params);
 
+       static int GetCheckRequestQueueSize();
+       static void LogRemoteCheckQueueInformation();
+
 private:
        static boost::mutex m_Mutex;
        static std::deque<std::function<void ()>> m_CheckRequestQueue;
        static bool m_CheckSchedulerRunning;
+       static int m_ChecksExecutedDuringInterval;
+       static int m_ChecksDroppedDuringInterval;
+       static Timer::Ptr m_LogTimer;
 
        static void RemoteCheckThreadProc();
        static void EnqueueCheck(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params);
index 90e9390bdd2baa4ba3f2d2f1ab7cbba89be3547d..16f58e71d294e08af691ed232b02c4b5750c0140 100644 (file)
@@ -23,6 +23,7 @@
 #include "icinga/checkcommand.hpp"
 #include "icinga/macroprocessor.hpp"
 #include "icinga/icingaapplication.hpp"
+#include "icinga/clusterevents.hpp"
 #include "base/application.hpp"
 #include "base/objectlock.hpp"
 #include "base/utility.hpp"
@@ -84,6 +85,8 @@ void IcingaCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckRes
        perfdata->Add(new PerfdataValue("active_service_checks_15min", CIB::GetActiveServiceChecksStatistics(60 * 15)));
        perfdata->Add(new PerfdataValue("passive_service_checks_15min", CIB::GetPassiveServiceChecksStatistics(60 * 15)));
 
+       perfdata->Add(new PerfdataValue("remote_check_queue", ClusterEvents::GetCheckRequestQueueSize()));
+
        CheckableCheckStatistics scs = CIB::CalculateServiceCheckStats();
 
        perfdata->Add(new PerfdataValue("min_latency", scs.min_latency));