1 /* Icinga 2 | (c) 2012 Icinga GmbH | GPLv2+ */
3 #include "icinga/checkable.hpp"
4 #include "icinga/service.hpp"
5 #include "icinga/host.hpp"
6 #include "icinga/checkcommand.hpp"
7 #include "icinga/icingaapplication.hpp"
8 #include "icinga/cib.hpp"
9 #include "icinga/clusterevents.hpp"
10 #include "remote/messageorigin.hpp"
11 #include "remote/apilistener.hpp"
12 #include "base/objectlock.hpp"
13 #include "base/logger.hpp"
14 #include "base/convert.hpp"
15 #include "base/utility.hpp"
16 #include "base/context.hpp"
18 using namespace icinga;
/* Definitions of Checkable's static signals. OnNewCheckResult, OnStateChange,
 * OnReachabilityChanged and OnNotificationsRequested are emitted from
 * ProcessCheckResult() in this file. */
20 boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, const MessageOrigin::Ptr&)> Checkable::OnNewCheckResult;
21 boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, StateType, const MessageOrigin::Ptr&)> Checkable::OnStateChange;
22 boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, std::set<Checkable::Ptr>, const MessageOrigin::Ptr&)> Checkable::OnReachabilityChanged;
23 boost::signals2::signal<void (const Checkable::Ptr&, NotificationType, const CheckResult::Ptr&, const String&, const String&, const MessageOrigin::Ptr&)> Checkable::OnNotificationsRequested;
24 boost::signals2::signal<void (const Checkable::Ptr&)> Checkable::OnNextCheckUpdated;
/* Counter initialised to 0; presumably tracks checks currently executing - TODO confirm against its users. */
26 Atomic<uint_fast64_t> Checkable::CurrentConcurrentChecks (0);
/* m_StatsMutex is locked by the pending-check accessors below; m_PendingChecksCV is
 * what AquirePendingCheckSlot() waits on and DecreasePendingChecks() notifies. */
28 boost::mutex Checkable::m_StatsMutex;
29 int Checkable::m_PendingChecks = 0;
30 boost::condition_variable Checkable::m_PendingChecksCV;
/**
 * Returns the object obtained from NavigateCheckCommandRaw(), cast to
 * CheckCommand with dynamic_pointer_cast.
 *
 * @returns The check command pointer produced by the cast.
 */
32 CheckCommand::Ptr Checkable::GetCheckCommand() const
34 return dynamic_pointer_cast<CheckCommand>(NavigateCheckCommandRaw());
/**
 * Looks up the time period whose name is returned by GetCheckPeriodRaw().
 *
 * @returns The result of TimePeriod::GetByName() for that name.
 */
37 TimePeriod::Ptr Checkable::GetCheckPeriod() const
39 return TimePeriod::GetByName(GetCheckPeriodRaw());
/**
 * Stores the scheduling offset that UpdateNextCheck() feeds into its
 * next-check calculation.
 *
 * @param offset The new value for m_SchedulingOffset.
 */
42 void Checkable::SetSchedulingOffset(long offset)
44 m_SchedulingOffset = offset;
/**
 * Returns the scheduling offset stored by SetSchedulingOffset().
 *
 * @returns The current value of m_SchedulingOffset.
 */
47 long Checkable::GetSchedulingOffset()
49 return m_SchedulingOffset;
/**
 * Computes the next check time as `now - adj + interval` and stores it via
 * SetNextCheck(nextCheck, false, origin).
 *
 * NOTE(review): the embedded line numbers in this excerpt skip values, so the
 * declarations of `interval`/`adj` and the branch keywords around them are
 * presumably not shown here.
 * NOTE(review): the debug message below always prints GetCheckInterval(), even
 * when the retry interval may have been selected - confirm this is intended.
 *
 * @param origin Passed through unchanged to SetNextCheck().
 */
52 void Checkable::UpdateNextCheck(const MessageOrigin::Ptr& origin)
/* A SOFT state with an existing check result selects the retry interval. */
56 if (GetStateType() == StateTypeSoft && GetLastCheckResult() != nullptr)
57 interval = GetRetryInterval();
59 interval = GetCheckInterval();
61 double now = Utility::GetTime();
/* Position of (now + scheduling offset) within the interval, computed on values scaled by 100. */
65 adj = fmod(now * 100 + GetSchedulingOffset(), interval * 100) / 100.0;
/* Cap adj at 0.5 plus an offset-derived term. */
68 adj = std::min(0.5 + fmod(GetSchedulingOffset(), interval * 5) / 100.0, adj);
70 double nextCheck = now - adj + interval;
71 double lastCheck = GetLastCheck();
/* A negative last check time is formatted as 0 in the message. */
73 Log(LogDebug, "Checkable")
74 << "Update checkable '" << GetName() << "' with check interval '" << GetCheckInterval()
75 << "' from last check time at " << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", (lastCheck < 0 ? 0 : lastCheck))
76 << " (" << GetLastCheck() << ") to next check time at " << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", nextCheck) << " (" << nextCheck << ").";
78 SetNextCheck(nextCheck, false, origin);
/**
 * Tells whether a check result has been stored for this checkable.
 *
 * @returns true if GetLastCheckResult() is not null.
 */
81 bool Checkable::HasBeenChecked() const
83 return GetLastCheckResult() != nullptr;
/**
 * Reads the schedule end time of the last check result.
 *
 * `schedule_end` starts at -1 and is overwritten with cr->GetScheduleEnd().
 * NOTE(review): the guard around that assignment and the return statement are
 * not visible in this excerpt; presumably -1 is returned when there is no
 * last check result (UpdateNextCheck() treats negative values specially) - confirm.
 */
86 double Checkable::GetLastCheck() const
88 CheckResult::Ptr cr = GetLastCheckResult();
89 double schedule_end = -1;
92 schedule_end = cr->GetScheduleEnd();
/**
 * Processes a new check result for this checkable.
 *
 * Steps visible in this function, in order: clears m_CheckRunning; fills in
 * schedule/execution timestamps that are still 0; sets the check source;
 * sends agent check results to the command endpoint through the ApiListener;
 * compares the execution start of the new result against the stored one;
 * updates state, state type and attempt counter; handles acknowledgements and
 * reschedules parents; stores the result; updates the flapping status and the
 * next check time; emits OnNewCheckResult, OnStateChanged and OnStateChange;
 * runs the event handler; and finally requests notifications or stashes the
 * suppressed notification types.
 *
 * NOTE(review): the embedded line numbers in this excerpt skip values, so
 * braces and several conditions/statements are presumably not shown. The
 * comments below only describe what is visible.
 *
 * @param cr The check result to process.
 * @param origin Origin of the result; a null or local origin makes this node the check source.
 */
97 void Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrigin::Ptr& origin)
100 ObjectLock olock(this);
101 m_CheckRunning = false;
/* Any timestamp that is still 0 is replaced with the current time. */
107 double now = Utility::GetTime();
109 if (cr->GetScheduleStart() == 0)
110 cr->SetScheduleStart(now);
112 if (cr->GetScheduleEnd() == 0)
113 cr->SetScheduleEnd(now);
115 if (cr->GetExecutionStart() == 0)
116 cr->SetExecutionStart(now);
118 if (cr->GetExecutionEnd() == 0)
119 cr->SetExecutionEnd(now);
/* Results without an origin, or with a local origin, are attributed to this node. */
121 if (!origin || origin->IsLocal())
122 cr->SetCheckSource(IcingaApplication::GetInstance()->GetNodeName());
124 Endpoint::Ptr command_endpoint = GetCommandEndpoint();
126 /* override check source if command_endpoint was defined */
127 if (command_endpoint && !GetExtension("agent_check"))
128 cr->SetCheckSource(command_endpoint->GetName());
130 /* agent checks go through the api */
131 if (command_endpoint && GetExtension("agent_check")) {
132 ApiListener::Ptr listener = ApiListener::GetInstance();
135 /* send message back to its origin */
136 Dictionary::Ptr message = ClusterEvents::MakeCheckResultMessage(this, cr);
137 listener->SyncSendMessage(command_endpoint, message);
/* Reachability (general and for notifications) is evaluated before the object lock below is taken. */
147 bool reachable = IsReachable();
148 bool notification_reachable = IsReachable(DependencyNotification);
150 ObjectLock olock(this);
/* Snapshot of the previous result and state for comparison with the new result. */
152 CheckResult::Ptr old_cr = GetLastCheckResult();
153 ServiceState old_state = GetStateRaw();
154 StateType old_stateType = GetStateType();
155 long old_attempt = GetCheckAttempt();
156 bool recovery = false;
158 /* When we have a check result already (not after fresh start),
159 * prevent to accept old check results and allow overrides for
160 * CRs happened in the future.
163 double currentCRTimestamp = old_cr->GetExecutionStart();
164 double newCRTimestamp = cr->GetExecutionStart();
166 /* Our current timestamp may be from the future (wrong server time adjusted again). Allow overrides here. */
167 if (currentCRTimestamp > now) {
168 /* our current CR is from the future, let the new CR override it. */
169 Log(LogDebug, "Checkable")
170 << std::fixed << std::setprecision(6) << "Processing check result for checkable '" << GetName() << "' from "
171 << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", newCRTimestamp) << " (" << newCRTimestamp
172 << "). Overriding since ours is from the future at "
173 << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", currentCRTimestamp) << " (" << currentCRTimestamp << ").";
175 /* Current timestamp is from the past, but the new timestamp is even more in the past. Skip it. */
176 if (newCRTimestamp < currentCRTimestamp) {
177 Log(LogDebug, "Checkable")
178 << std::fixed << std::setprecision(6) << "Skipping check result for checkable '" << GetName() << "' from "
179 << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", newCRTimestamp) << " (" << newCRTimestamp
180 << "). It is in the past compared to ours at "
181 << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", currentCRTimestamp) << " (" << currentCRTimestamp << ").";
187 /* The ExecuteCheck function already sets the old state, but we need to do it again
188 * in case this was a passive check result. */
189 SetLastStateRaw(old_state);
190 SetLastStateType(old_stateType);
191 SetLastReachable(reachable);
194 Service::Ptr service;
195 tie(host, service) = GetHostService(this);
/* Defaults to host; line 199 switches to service (its condition is not visible in this excerpt). */
197 CheckableType checkableType = CheckableHost;
199 checkableType = CheckableService;
203 std::set<Checkable::Ptr> children = GetChildren();
205 if (IsStateOK(cr->GetState())) {
206 SetStateType(StateTypeHard); // NOT-OK -> HARD OK
208 if (!IsStateOK(old_state))
211 ResetNotificationNumbers();
212 SaveLastState(ServiceOK, Utility::GetTime());
214 /* update reachability for child objects in OK state */
215 if (!children.empty())
216 OnReachabilityChanged(this, cr, children, origin);
218 /* OK -> NOT-OK change, first SOFT state. Reset attempt counter. */
219 if (IsStateOK(old_state)) {
220 SetStateType(StateTypeSoft);
224 /* SOFT state change, increase attempt counter. */
225 if (old_stateType == StateTypeSoft && !IsStateOK(old_state)) {
226 SetStateType(StateTypeSoft);
227 attempt = old_attempt + 1;
230 /* HARD state change (e.g. previously 2/3 and this next attempt). Reset attempt counter. */
231 if (attempt >= GetMaxCheckAttempts()) {
232 SetStateType(StateTypeHard);
236 if (!IsStateOK(cr->GetState())) {
237 SaveLastState(cr->GetState(), Utility::GetTime());
240 /* update reachability for child objects in NOT-OK state */
241 if (!children.empty())
242 OnReachabilityChanged(this, cr, children, origin);
246 SetLastStateUnreachable(Utility::GetTime());
248 SetCheckAttempt(attempt);
250 ServiceState new_state = cr->GetState();
251 SetStateRaw(new_state);
255 /* Exception on state change calculation for hosts. */
256 if (checkableType == CheckableService)
257 stateChange = (old_state != new_state);
259 stateChange = (Host::CalculateState(old_state) != Host::CalculateState(new_state));
261 /* Store the current last state change for the next iteration. */
262 SetPreviousStateChange(GetLastStateChange());
265 SetLastStateChange(now);
267 /* remove acknowledgements */
268 if (GetAcknowledgement() == AcknowledgementNormal ||
269 (GetAcknowledgement() == AcknowledgementSticky && IsStateOK(new_state))) {
270 ClearAcknowledgement();
273 /* reschedule direct parents */
274 for (const Checkable::Ptr& parent : GetParents()) {
275 if (parent.get() == this)
278 if (!parent->GetEnableActiveChecks())
/* Pull the parent's next check forward to now when it is at least one retry interval away. */
281 if (parent->GetNextCheck() >= now + parent->GetRetryInterval()) {
282 ObjectLock olock(parent);
283 parent->SetNextCheck(now);
288 bool remove_acknowledgement_comments = false;
290 if (GetAcknowledgement() == AcknowledgementNone)
291 remove_acknowledgement_comments = true;
/* hardChange: the state type went from SOFT to HARD with this result. */
293 bool hardChange = (GetStateType() == StateTypeHard && old_stateType == StateTypeSoft);
295 if (stateChange && old_stateType == StateTypeHard && GetStateType() == StateTypeHard)
298 bool is_volatile = GetVolatile();
300 if (hardChange || is_volatile) {
301 SetLastHardStateRaw(new_state);
302 SetLastHardStateChange(now);
305 if (!IsStateOK(new_state))
308 /* statistics for external tools */
309 Checkable::UpdateStatistics(cr, checkableType);
311 bool in_downtime = IsInDowntime();
313 bool send_notification = false;
/* Suppress when unreachable for notifications, in a downtime, or acknowledged. */
314 bool suppress_notification = !notification_reachable || in_downtime || IsAcknowledged();
316 /* Send notifications whether when a hard state change occurred. */
317 if (hardChange && !(old_stateType == StateTypeSoft && IsStateOK(new_state)))
318 send_notification = true;
319 /* Or if the checkable is volatile and in a HARD state. */
320 else if (is_volatile && GetStateType() == StateTypeHard)
321 send_notification = true;
323 if (IsStateOK(old_state) && old_stateType == StateTypeSoft)
324 send_notification = false; /* Don't send notifications for SOFT-OK -> HARD-OK. */
326 if (is_volatile && IsStateOK(old_state) && IsStateOK(new_state))
327 send_notification = false; /* Don't send notifications for volatile OK -> OK changes. */
331 if (remove_acknowledgement_comments)
332 RemoveCommentsByType(CommentAcknowledgement);
/* Record the resulting state, state type, attempt and reachability on the check result. */
334 Dictionary::Ptr vars_after = new Dictionary({
335 { "state", new_state },
336 { "state_type", GetStateType() },
337 { "attempt", GetCheckAttempt() },
338 { "reachable", reachable }
342 cr->SetVarsBefore(old_cr->GetVarsAfter());
344 cr->SetVarsAfter(vars_after);
347 SetLastCheckResult(cr);
/* Sample the flapping status before and after the update so start/stop transitions can be detected below. */
349 bool was_flapping = IsFlapping();
351 UpdateFlappingStatus(old_state != cr->GetState());
353 bool is_flapping = IsFlapping();
355 if (cr->GetActive()) {
356 UpdateNextCheck(origin);
358 /* Reschedule the next check for external passive check results. The side effect of
359 * this is that for as long as we receive results for a service we
360 * won't execute any active checks. */
362 double ttl = cr->GetTtl();
367 offset = GetCheckInterval();
369 SetNextCheck(Utility::GetTime() + offset, false, origin);
374 #ifdef I2_DEBUG /* I2_DEBUG */
375 Log(LogDebug, "Checkable")
376 << "Flapping: Checkable " << GetName()
377 << " was: " << was_flapping
378 << " is: " << is_flapping
379 << " threshold low: " << GetFlappingThresholdLow()
380 << " threshold high: " << GetFlappingThresholdHigh()
381 << "% current: " << GetFlappingCurrent() << "%.";
382 #endif /* I2_DEBUG */
384 OnNewCheckResult(this, cr, origin);
386 /* signal status updates to for example db_ido */
387 OnStateChanged(this);
/* Hosts are logged with the host state derived through Host::CalculateState(). */
389 String old_state_str = (service ? Service::StateToString(old_state) : Host::StateToString(Host::CalculateState(old_state)));
390 String new_state_str = (service ? Service::StateToString(new_state) : Host::StateToString(Host::CalculateState(new_state)));
392 /* Whether a hard state change or a volatile state change except OK -> OK happened. */
393 if (hardChange || (is_volatile && !(IsStateOK(old_state) && IsStateOK(new_state)))) {
394 OnStateChange(this, cr, StateTypeHard, origin);
395 Log(LogNotice, "Checkable")
396 << "State Change: Checkable '" << GetName() << "' hard state change from " << old_state_str << " to " << new_state_str << " detected." << (is_volatile ? " Checkable is volatile." : "");
398 /* Whether a state change happened or the state type is SOFT (must be logged too). */
399 else if (stateChange || GetStateType() == StateTypeSoft) {
400 OnStateChange(this, cr, StateTypeSoft, origin);
401 Log(LogNotice, "Checkable")
402 << "State Change: Checkable '" << GetName() << "' soft state change from " << old_state_str << " to " << new_state_str << " detected.";
405 if (GetStateType() == StateTypeSoft || hardChange || recovery ||
406 (is_volatile && !(IsStateOK(old_state) && IsStateOK(new_state))))
407 ExecuteEventHandler();
/* Bitmask collecting the notification types that were suppressed for this result. */
409 int suppressed_types = 0;
411 /* Flapping start/end notifications */
412 if (!was_flapping && is_flapping) {
413 /* FlappingStart notifications happen on state changes, not in downtimes */
416 suppressed_types |= NotificationFlappingStart;
418 OnNotificationsRequested(this, NotificationFlappingStart, cr, "", "", nullptr);
422 Log(LogNotice, "Checkable")
423 << "Flapping Start: Checkable '" << GetName() << "' started flapping (Current flapping value "
424 << GetFlappingCurrent() << "% > high threshold " << GetFlappingThresholdHigh() << "%).";
426 NotifyFlapping(origin);
427 } else if (was_flapping && !is_flapping) {
428 /* FlappingEnd notifications are independent from state changes, must not happen in downtime */
431 suppressed_types |= NotificationFlappingEnd;
433 OnNotificationsRequested(this, NotificationFlappingEnd, cr, "", "", nullptr);
437 Log(LogNotice, "Checkable")
438 << "Flapping Stop: Checkable '" << GetName() << "' stopped flapping (Current flapping value "
439 << GetFlappingCurrent() << "% < low threshold " << GetFlappingThresholdLow() << "%).";
441 NotifyFlapping(origin);
/* Problem/recovery notifications are only considered while the checkable is not flapping. */
444 if (send_notification && !is_flapping) {
446 if (suppress_notification) {
447 suppressed_types |= (recovery ? NotificationRecovery : NotificationProblem);
449 OnNotificationsRequested(this, recovery ? NotificationRecovery : NotificationProblem, cr, "", "", nullptr);
454 if (suppressed_types) {
455 /* If some notifications were suppressed, but just because of e.g. a downtime,
456 * stash them into a notification types bitmask for maybe re-sending later.
459 ObjectLock olock (this);
460 int suppressed_types_before (GetSuppressedNotifications());
461 int suppressed_types_after (suppressed_types_before | suppressed_types);
463 for (int conflict : {NotificationProblem | NotificationRecovery, NotificationFlappingStart | NotificationFlappingEnd}) {
464 /* E.g. problem and recovery notifications neutralize each other. */
/* When both bits of a pair are set, clear both. */
466 if ((suppressed_types_after & conflict) == conflict) {
467 suppressed_types_after &= ~conflict;
/* Write the bitmask back only when it actually changed. */
471 if (suppressed_types_after != suppressed_types_before) {
472 SetSuppressedNotifications(suppressed_types_after);
/**
 * Runs the check command for this checkable with the given macros.
 *
 * Creates a new CheckResult, records the scheduled start (GetNextCheck()) and
 * the execution start (current time) on it, and hands it to the check
 * command's Execute() with `true` as the last argument (ExecuteCheck() passes
 * `false` there).
 *
 * @param resolvedMacros Macro dictionary forwarded to CheckCommand::Execute().
 */
477 void Checkable::ExecuteRemoteCheck(const Dictionary::Ptr& resolvedMacros)
479 CONTEXT("Executing remote check for object '" + GetName() + "'");
481 double scheduled_start = GetNextCheck();
482 double before_check = Utility::GetTime();
484 CheckResult::Ptr cr = new CheckResult();
485 cr->SetScheduleStart(scheduled_start);
486 cr->SetExecutionStart(before_check);
488 GetCheckCommand()->Execute(this, cr, resolvedMacros, true);
/**
 * Executes the check for this checkable.
 *
 * Visible steps: records the scheduled start and execution start on a new
 * CheckResult; marks the check as running under the object lock; then either
 * runs the check command directly or, for a command endpoint other than the
 * local one, builds an "event::ExecuteCommand" message and sends it through
 * the ApiListener. If the endpoint is not connected, not syncing, and the
 * application started more than 300 seconds ago, an UNKNOWN result describing
 * the missing connection is processed instead.
 *
 * NOTE(review): the embedded line numbers in this excerpt skip values, so
 * braces and several conditions (e.g. the guard before "m_CheckRunning = true")
 * are presumably not shown.
 */
491 void Checkable::ExecuteCheck()
493 CONTEXT("Executing check for object '" + GetName() + "'");
495 /* keep track of scheduling info in case the check type doesn't provide its own information */
496 double scheduled_start = GetNextCheck();
497 double before_check = Utility::GetTime();
499 /* This calls SetNextCheck() which updates the CheckerComponent's idle/pending
500 * queues and ensures that checks are not fired multiple times. ProcessCheckResult()
501 * is called too late. See #6421.
505 bool reachable = IsReachable();
508 ObjectLock olock(this);
510 /* don't run another check if there is one pending */
514 m_CheckRunning = true;
516 SetLastStateRaw(GetStateRaw());
/* NOTE(review): the next line writes the last state type back to itself;
 * GetStateType() may have been intended, mirroring SetLastStateRaw(GetStateRaw())
 * above - confirm. ProcessCheckResult() sets the last state type again later. */
517 SetLastStateType(GetLastStateType());
518 SetLastReachable(reachable);
521 CheckResult::Ptr cr = new CheckResult();
523 cr->SetScheduleStart(scheduled_start);
524 cr->SetExecutionStart(before_check);
526 Endpoint::Ptr endpoint = GetCommandEndpoint();
/* The check is local when no command endpoint is set or the endpoint is this node. */
527 bool local = !endpoint || endpoint == Endpoint::GetLocalEndpoint();
530 GetCheckCommand()->Execute(this, cr, nullptr, false);
/* For the remote case, Execute() is called with a fresh dictionary that is afterwards sent as "macros". */
532 Dictionary::Ptr macros = new Dictionary();
533 GetCheckCommand()->Execute(this, cr, macros, false);
535 if (endpoint->GetConnected()) {
536 /* perform check on remote endpoint */
537 Dictionary::Ptr message = new Dictionary();
538 message->Set("jsonrpc", "2.0");
539 message->Set("method", "event::ExecuteCommand");
542 Service::Ptr service;
543 tie(host, service) = GetHostService(this);
545 Dictionary::Ptr params = new Dictionary();
546 message->Set("params", params);
547 params->Set("command_type", "check_command");
548 params->Set("command", GetCheckCommand()->GetName());
549 params->Set("host", host->GetName());
552 params->Set("service", service->GetShortName());
554 params->Set("macros", macros);
556 ApiListener::Ptr listener = ApiListener::GetInstance();
559 listener->SyncSendMessage(endpoint, message);
561 /* Re-schedule the check so we don't run it again until after we've received
562 * a check result from the remote instance. The check will be re-scheduled
563 * using the proper check interval once we've received a check result.
565 SetNextCheck(Utility::GetTime() + GetCheckCommand()->GetTimeout() + 30);
566 } else if (!endpoint->GetSyncing() && Application::GetInstance()->GetStartTime() < Utility::GetTime() - 300) {
567 /* fail to perform check on unconnected endpoint */
568 cr->SetState(ServiceUnknown);
570 String output = "Remote Icinga instance '" + endpoint->GetName() + "' is not connected to ";
572 Endpoint::Ptr localEndpoint = Endpoint::GetLocalEndpoint();
575 output += "'" + localEndpoint->GetName() + "'";
577 output += "this instance";
579 cr->SetOutput(output);
581 ProcessCheckResult(cr);
/* Clear the running flag again under the object lock. */
585 ObjectLock olock(this);
586 m_CheckRunning = false;
/**
 * Updates the CIB check statistics for one processed check result.
 *
 * Uses the result's schedule end (converted to time_t) as timestamp and adds 1
 * to the active or passive counter for hosts or services; any other type only
 * logs a warning.
 * NOTE(review): the conditions choosing between the active and passive calls
 * are not visible in this excerpt; presumably they test cr->GetActive() - confirm.
 *
 * @param cr The check result supplying the timestamp.
 * @param type Selects the host or service counters.
 */
591 void Checkable::UpdateStatistics(const CheckResult::Ptr& cr, CheckableType type)
593 time_t ts = cr->GetScheduleEnd();
595 if (type == CheckableHost) {
597 CIB::UpdateActiveHostChecksStatistics(ts, 1);
599 CIB::UpdatePassiveHostChecksStatistics(ts, 1);
600 } else if (type == CheckableService) {
602 CIB::UpdateActiveServiceChecksStatistics(ts, 1);
604 CIB::UpdatePassiveServiceChecksStatistics(ts, 1);
606 Log(LogWarning, "Checkable", "Unknown checkable type for statistic update.");
/**
 * Takes m_StatsMutex for the duration of the function.
 * NOTE(review): the statement that changes the counter is not visible in this
 * excerpt; presumably it increments m_PendingChecks - confirm.
 */
610 void Checkable::IncreasePendingChecks()
612 boost::mutex::scoped_lock lock(m_StatsMutex);
/**
 * Takes m_StatsMutex and wakes one thread waiting on m_PendingChecksCV
 * (see AquirePendingCheckSlot()).
 * NOTE(review): the statement that changes the counter is not visible in this
 * excerpt; presumably it decrements m_PendingChecks before notifying - confirm.
 */
616 void Checkable::DecreasePendingChecks()
618 boost::mutex::scoped_lock lock(m_StatsMutex);
620 m_PendingChecksCV.notify_one();
/**
 * Reads the pending check counter while holding m_StatsMutex.
 *
 * @returns The current value of m_PendingChecks.
 */
623 int Checkable::GetPendingChecks()
625 boost::mutex::scoped_lock lock(m_StatsMutex);
626 return m_PendingChecks;
/**
 * Blocks until the number of pending checks is below the given limit.
 *
 * Holds m_StatsMutex and waits on m_PendingChecksCV for as long as
 * m_PendingChecks >= maxPendingChecks; the condition is re-checked after
 * every wakeup.
 * NOTE(review): anything following the loop is not visible in this excerpt.
 *
 * @param maxPendingChecks Limit that m_PendingChecks is compared against.
 */
629 void Checkable::AquirePendingCheckSlot(int maxPendingChecks)
631 boost::mutex::scoped_lock lock(m_StatsMutex);
632 while (m_PendingChecks >= maxPendingChecks)
633 m_PendingChecksCV.wait(lock);