1 /* Icinga 2 | (c) 2012 Icinga GmbH | GPLv2+ */
3 #include "icinga/checkable.hpp"
4 #include "icinga/service.hpp"
5 #include "icinga/host.hpp"
6 #include "icinga/checkcommand.hpp"
7 #include "icinga/icingaapplication.hpp"
8 #include "icinga/cib.hpp"
9 #include "icinga/clusterevents.hpp"
10 #include "remote/messageorigin.hpp"
11 #include "remote/apilistener.hpp"
12 #include "base/objectlock.hpp"
13 #include "base/logger.hpp"
14 #include "base/convert.hpp"
15 #include "base/utility.hpp"
16 #include "base/context.hpp"
18 using namespace icinga;
/* Raised from ProcessCheckResult() once a check result has been stored as the last check result. */
20 boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, const MessageOrigin::Ptr&)> Checkable::OnNewCheckResult;
/* Raised from ProcessCheckResult() with StateTypeHard or StateTypeSoft when a state change is reported. */
21 boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, StateType, const MessageOrigin::Ptr&)> Checkable::OnStateChange;
/* Raised from ProcessCheckResult() with the set returned by GetChildren(), but only when that set is not empty. */
22 boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, std::set<Checkable::Ptr>, const MessageOrigin::Ptr&)> Checkable::OnReachabilityChanged;
/* Raised from ProcessCheckResult() to request problem, recovery and flapping start/end notifications. */
23 boost::signals2::signal<void (const Checkable::Ptr&, NotificationType, const CheckResult::Ptr&, const String&, const String&, const MessageOrigin::Ptr&)> Checkable::OnNotificationsRequested;
/* NOTE(review): not emitted in this part of the file; presumably raised when the next check time changes - confirm. */
24 boost::signals2::signal<void (const Checkable::Ptr&)> Checkable::OnNextCheckUpdated;
/* m_StatsMutex is locked by every pending-check helper below before m_PendingChecks is used.
 * m_PendingChecksCV is waited on in AquirePendingCheckSlot() and notified in DecreasePendingChecks(). */
26 boost::mutex Checkable::m_StatsMutex;
27 int Checkable::m_PendingChecks = 0;
28 boost::condition_variable Checkable::m_PendingChecksCV;
/**
 * Returns the check command of this checkable.
 *
 * @returns The object returned by NavigateCheckCommandRaw(), converted with
 *          dynamic_pointer_cast<CheckCommand>.
 */
30 CheckCommand::Ptr Checkable::GetCheckCommand() const
32 return dynamic_pointer_cast<CheckCommand>(NavigateCheckCommandRaw());
/**
 * Returns the check period of this checkable.
 *
 * @returns The result of TimePeriod::GetByName() for the name stored in the
 *          raw check period attribute.
 */
35 TimePeriod::Ptr Checkable::GetCheckPeriod() const
37 return TimePeriod::GetByName(GetCheckPeriodRaw());
/**
 * Stores the scheduling offset that UpdateNextCheck() mixes into its
 * next-check calculation.
 *
 * @param offset The new offset; stored as-is in m_SchedulingOffset.
 */
40 void Checkable::SetSchedulingOffset(long offset)
42 m_SchedulingOffset = offset;
/**
 * Returns the scheduling offset.
 *
 * @returns The current value of m_SchedulingOffset.
 */
45 long Checkable::GetSchedulingOffset()
47 return m_SchedulingOffset;
/**
 * Computes the next check time from the current time, an interval and the
 * scheduling offset, logs it at debug level and applies it via SetNextCheck().
 *
 * The retry interval is assigned when the checkable is in a SOFT state and
 * already has a check result; GetCheckInterval() is the other visible source
 * of the interval.
 *
 * NOTE(review): the embedded source numbering skips several short lines in
 * this function (declarations, conditions, braces), so the exact branch
 * structure cannot be confirmed from this extract.
 *
 * @param origin Message origin, forwarded unchanged to SetNextCheck().
 */
50 void Checkable::UpdateNextCheck(const MessageOrigin::Ptr& origin)
54 if (GetStateType() == StateTypeSoft && GetLastCheckResult() != nullptr)
55 interval = GetRetryInterval();
57 interval = GetCheckInterval();
59 double now = Utility::GetTime();
/* Position of (now + offset) inside the interval; the arithmetic is done on values scaled by 100. */
63 adj = fmod(now * 100 + GetSchedulingOffset(), interval * 100) / 100.0;
/* Limit the adjustment to at most 0.5 plus fmod(offset, interval * 5) / 100. */
66 adj = std::min(0.5 + fmod(GetSchedulingOffset(), interval * 5) / 100.0, adj);
68 double nextCheck = now - adj + interval;
69 double lastCheck = GetLastCheck();
/* NOTE(review): the message always prints GetCheckInterval(), even when the retry interval was assigned above.
 * A negative last check time is formatted as 0. */
71 Log(LogDebug, "Checkable")
72 << "Update checkable '" << GetName() << "' with check interval '" << GetCheckInterval()
73 << "' from last check time at " << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", (lastCheck < 0 ? 0 : lastCheck))
74 << " (" << GetLastCheck() << ") to next check time at " << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", nextCheck) << " (" << nextCheck << ").";
76 SetNextCheck(nextCheck, false, origin);
/**
 * Tells whether this checkable already has a check result.
 *
 * @returns true if GetLastCheckResult() is not null, false otherwise.
 */
79 bool Checkable::HasBeenChecked() const
81 return GetLastCheckResult() != nullptr;
/**
 * Returns the time of the last check, taken from the schedule end of the last
 * check result.
 *
 * The value starts out as -1 and is overwritten with the last check result's
 * schedule end.
 *
 * NOTE(review): the guard around the assignment and the return statement are
 * not visible in this extract; presumably -1 is returned when there is no
 * check result yet (UpdateNextCheck() treats a negative value specially) -
 * confirm.
 */
84 double Checkable::GetLastCheck() const
86 CheckResult::Ptr cr = GetLastCheckResult();
87 double schedule_end = -1;
90 schedule_end = cr->GetScheduleEnd();
/**
 * Processes a new check result for this checkable: completes missing
 * timestamps, updates state, state type and attempt counter, refreshes
 * acknowledgement, flapping and next-check bookkeeping, and raises the
 * related signals and notification requests.
 *
 * NOTE(review): the embedded source numbering skips many short lines in this
 * function (braces, else branches, guards, early returns), so the added
 * comments describe only the statements that are visible in this extract.
 *
 * @param cr The check result to process; it is modified in place (timestamps,
 *           check source, vars before/after).
 * @param origin Origin of the result; forwarded to the state, reachability and
 *               new-check-result signals and to the next-check updates.
 */
95 void Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrigin::Ptr& origin)
98 ObjectLock olock(this);
99 m_CheckRunning = false;
/* Every timestamp that is still 0 is filled in with the current time. */
105 double now = Utility::GetTime();
107 if (cr->GetScheduleStart() == 0)
108 cr->SetScheduleStart(now);
110 if (cr->GetScheduleEnd() == 0)
111 cr->SetScheduleEnd(now);
113 if (cr->GetExecutionStart() == 0)
114 cr->SetExecutionStart(now);
116 if (cr->GetExecutionEnd() == 0)
117 cr->SetExecutionEnd(now);
/* Results without an origin, or with a local origin, are attributed to this node. */
119 if (!origin || origin->IsLocal())
120 cr->SetCheckSource(IcingaApplication::GetInstance()->GetNodeName());
122 Endpoint::Ptr command_endpoint = GetCommandEndpoint();
124 /* override check source if command_endpoint was defined */
125 if (command_endpoint && !GetExtension("agent_check"))
126 cr->SetCheckSource(command_endpoint->GetName());
128 /* agent checks go through the api */
129 if (command_endpoint && GetExtension("agent_check")) {
130 ApiListener::Ptr listener = ApiListener::GetInstance();
133 /* send message back to its origin */
134 Dictionary::Ptr message = ClusterEvents::MakeCheckResultMessage(this, cr);
135 listener->SyncSendMessage(command_endpoint, message);
145 bool reachable = IsReachable();
146 bool notification_reachable = IsReachable(DependencyNotification);
148 ObjectLock olock(this);
/* Values from before this result; the transition logic below compares against them. */
150 CheckResult::Ptr old_cr = GetLastCheckResult();
151 ServiceState old_state = GetStateRaw();
152 StateType old_stateType = GetStateType();
153 long old_attempt = GetCheckAttempt();
154 bool recovery = false;
156 /* When we have an check result already (not after fresh start),
157 * prevent to accept old check results and allow overrides for
158 * CRs happened in the future.
161 double currentCRTimestamp = old_cr->GetExecutionStart();
162 double newCRTimestamp = cr->GetExecutionStart();
164 /* Our current timestamp may be from the future (wrong server time adjusted again). Allow overrides here. */
165 if (currentCRTimestamp > now) {
166 /* our current CR is from the future, let the new CR override it. */
167 Log(LogDebug, "Checkable")
168 << std::fixed << std::setprecision(6) << "Processing check result for checkable '" << GetName() << "' from "
169 << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", newCRTimestamp) << " (" << newCRTimestamp
170 << "). Overriding since ours is from the future at "
171 << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", currentCRTimestamp) << " (" << currentCRTimestamp << ").";
173 /* Current timestamp is from the past, but the new timestamp is even more in the past. Skip it. */
174 if (newCRTimestamp < currentCRTimestamp) {
175 Log(LogDebug, "Checkable")
176 << std::fixed << std::setprecision(6) << "Skipping check result for checkable '" << GetName() << "' from "
177 << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", newCRTimestamp) << " (" << newCRTimestamp
178 << "). It is in the past compared to ours at "
179 << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", currentCRTimestamp) << " (" << currentCRTimestamp << ").";
185 /* The ExecuteCheck function already sets the old state, but we need to do it again
186 * in case this was a passive check result. */
187 SetLastStateRaw(old_state);
188 SetLastStateType(old_stateType);
189 SetLastReachable(reachable);
/* Split into host and service to find out which kind of checkable this is. */
192 Service::Ptr service;
193 tie(host, service) = GetHostService(this);
195 CheckableType checkableType = CheckableHost;
197 checkableType = CheckableService;
201 std::set<Checkable::Ptr> children = GetChildren();
/* OK result: the state type becomes HARD right away. */
203 if (IsStateOK(cr->GetState())) {
204 SetStateType(StateTypeHard); // NOT-OK -> HARD OK
206 if (!IsStateOK(old_state))
209 ResetNotificationNumbers();
210 SaveLastState(ServiceOK, Utility::GetTime());
212 /* update reachability for child objects in OK state */
213 if (!children.empty())
214 OnReachabilityChanged(this, cr, children, origin);
216 /* OK -> NOT-OK change, first SOFT state. Reset attempt counter. */
217 if (IsStateOK(old_state)) {
218 SetStateType(StateTypeSoft);
222 /* SOFT state change, increase attempt counter. */
223 if (old_stateType == StateTypeSoft && !IsStateOK(old_state)) {
224 SetStateType(StateTypeSoft);
225 attempt = old_attempt + 1;
228 /* HARD state change (e.g. previously 2/3 and this next attempt). Reset attempt counter. */
229 if (attempt >= GetMaxCheckAttempts()) {
230 SetStateType(StateTypeHard);
234 if (!IsStateOK(cr->GetState())) {
235 SaveLastState(cr->GetState(), Utility::GetTime());
238 /* update reachability for child objects in NOT-OK state */
239 if (!children.empty())
240 OnReachabilityChanged(this, cr, children, origin);
244 SetLastStateUnreachable(Utility::GetTime());
246 SetCheckAttempt(attempt);
248 ServiceState new_state = cr->GetState();
249 SetStateRaw(new_state);
253 /* Exception on state change calculation for hosts. */
254 if (checkableType == CheckableService)
255 stateChange = (old_state != new_state);
257 stateChange = (Host::CalculateState(old_state) != Host::CalculateState(new_state));
259 /* Store the current last state change for the next iteration. */
260 SetPreviousStateChange(GetLastStateChange());
263 SetLastStateChange(now);
265 /* remove acknowledgements */
266 if (GetAcknowledgement() == AcknowledgementNormal ||
267 (GetAcknowledgement() == AcknowledgementSticky && IsStateOK(new_state))) {
268 ClearAcknowledgement();
271 /* reschedule direct parents */
272 for (const Checkable::Ptr& parent : GetParents()) {
273 if (parent.get() == this)
276 if (!parent->GetEnableActiveChecks())
279 if (parent->GetNextCheck() >= now + parent->GetRetryInterval()) {
280 ObjectLock olock(parent);
281 parent->SetNextCheck(now);
/* Acknowledgement comments are removed further down if no acknowledgement is set at this point. */
286 bool remove_acknowledgement_comments = false;
288 if (GetAcknowledgement() == AcknowledgementNone)
289 remove_acknowledgement_comments = true;
/* hardChange starts out as "state type went from SOFT to HARD with this result". */
291 bool hardChange = (GetStateType() == StateTypeHard && old_stateType == StateTypeSoft);
293 if (stateChange && old_stateType == StateTypeHard && GetStateType() == StateTypeHard)
296 bool is_volatile = GetVolatile();
298 if (hardChange || is_volatile) {
299 SetLastHardStateRaw(new_state);
300 SetLastHardStateChange(now);
303 if (!IsStateOK(new_state))
306 /* statistics for external tools */
307 Checkable::UpdateStatistics(cr, checkableType);
309 bool in_downtime = IsInDowntime();
/* Notifications are suppressed while not reachable for notifications, in a downtime or acknowledged. */
311 bool send_notification = false;
312 bool suppress_notification = !notification_reachable || in_downtime || IsAcknowledged();
314 /* Send notifications whether when a hard state change occurred. */
315 if (hardChange && !(old_stateType == StateTypeSoft && IsStateOK(new_state)))
316 send_notification = true;
317 /* Or if the checkable is volatile and in a HARD state. */
318 else if (is_volatile && GetStateType() == StateTypeHard)
319 send_notification = true;
321 if (IsStateOK(old_state) && old_stateType == StateTypeSoft)
322 send_notification = false; /* Don't send notifications for SOFT-OK -> HARD-OK. */
324 if (is_volatile && IsStateOK(old_state) && IsStateOK(new_state))
325 send_notification = false; /* Don't send notifications for volatile OK -> OK changes. */
329 if (remove_acknowledgement_comments)
330 RemoveCommentsByType(CommentAcknowledgement);
/* Record the resulting state, state type, attempt and reachability on the check result;
 * its "vars before" are taken from the previous result's "vars after". */
332 Dictionary::Ptr vars_after = new Dictionary({
333 { "state", new_state },
334 { "state_type", GetStateType() },
335 { "attempt", GetCheckAttempt() },
336 { "reachable", reachable }
340 cr->SetVarsBefore(old_cr->GetVarsAfter());
342 cr->SetVarsAfter(vars_after);
345 SetLastCheckResult(cr);
/* Flapping is sampled before and after the update so that start/stop transitions can be detected below. */
347 bool was_flapping = IsFlapping();
349 UpdateFlappingStatus(old_state != cr->GetState());
351 bool is_flapping = IsFlapping();
353 if (cr->GetActive()) {
354 UpdateNextCheck(origin);
356 /* Reschedule the next check for external passive check results. The side effect of
357 * this is that for as long as we receive results for a service we
358 * won't execute any active checks. */
360 double ttl = cr->GetTtl();
365 offset = GetCheckInterval();
367 SetNextCheck(Utility::GetTime() + offset, false, origin);
372 #ifdef I2_DEBUG /* I2_DEBUG */
373 Log(LogDebug, "Checkable")
374 << "Flapping: Checkable " << GetName()
375 << " was: " << was_flapping
376 << " is: " << is_flapping
377 << " threshold low: " << GetFlappingThresholdLow()
378 << " threshold high: " << GetFlappingThresholdHigh()
379 << "% current: " << GetFlappingCurrent() << "%.";
380 #endif /* I2_DEBUG */
382 OnNewCheckResult(this, cr, origin);
384 /* signal status updates to for example db_ido */
385 OnStateChanged(this);
/* State names for the log messages; hosts use the host state calculated from the raw state. */
387 String old_state_str = (service ? Service::StateToString(old_state) : Host::StateToString(Host::CalculateState(old_state)));
388 String new_state_str = (service ? Service::StateToString(new_state) : Host::StateToString(Host::CalculateState(new_state)));
390 /* Whether a hard state change or a volatile state change except OK -> OK happened. */
391 if (hardChange || (is_volatile && !(IsStateOK(old_state) && IsStateOK(new_state)))) {
392 OnStateChange(this, cr, StateTypeHard, origin);
393 Log(LogNotice, "Checkable")
394 << "State Change: Checkable '" << GetName() << "' hard state change from " << old_state_str << " to " << new_state_str << " detected." << (is_volatile ? " Checkable is volatile." : "");
396 /* Whether a state change happened or the state type is SOFT (must be logged too). */
397 else if (stateChange || GetStateType() == StateTypeSoft) {
398 OnStateChange(this, cr, StateTypeSoft, origin);
399 Log(LogNotice, "Checkable")
400 << "State Change: Checkable '" << GetName() << "' soft state change from " << old_state_str << " to " << new_state_str << " detected.";
403 if (GetStateType() == StateTypeSoft || hardChange || recovery ||
404 (is_volatile && !(IsStateOK(old_state) && IsStateOK(new_state))))
405 ExecuteEventHandler();
/* Bitmask collecting the notification types that were due in this run but suppressed. */
407 int suppressed_types = 0;
409 /* Flapping start/end notifications */
410 if (!was_flapping && is_flapping) {
411 /* FlappingStart notifications happen on state changes, not in downtimes */
414 suppressed_types |= NotificationFlappingStart;
416 OnNotificationsRequested(this, NotificationFlappingStart, cr, "", "", nullptr);
420 Log(LogNotice, "Checkable")
421 << "Flapping Start: Checkable '" << GetName() << "' started flapping (Current flapping value "
422 << GetFlappingCurrent() << "% > high threshold " << GetFlappingThresholdHigh() << "%).";
424 NotifyFlapping(origin);
425 } else if (was_flapping && !is_flapping) {
426 /* FlappingEnd notifications are independent from state changes, must not happen in downtine */
429 suppressed_types |= NotificationFlappingEnd;
431 OnNotificationsRequested(this, NotificationFlappingEnd, cr, "", "", nullptr);
435 Log(LogNotice, "Checkable")
436 << "Flapping Stop: Checkable '" << GetName() << "' stopped flapping (Current flapping value "
437 << GetFlappingCurrent() << "% < low threshold " << GetFlappingThresholdLow() << "%).";
439 NotifyFlapping(origin);
/* Problem or recovery notification, considered only while the checkable is not flapping. */
442 if (send_notification && !is_flapping) {
444 if (suppress_notification) {
445 suppressed_types |= (recovery ? NotificationRecovery : NotificationProblem);
447 OnNotificationsRequested(this, recovery ? NotificationRecovery : NotificationProblem, cr, "", "", nullptr);
452 if (suppressed_types) {
453 /* If some notifications were suppressed, but just because of e.g. a downtime,
454 * stash them into a notification types bitmask for maybe re-sending later.
457 ObjectLock olock (this);
458 int suppressed_types_before (GetSuppressedNotifications());
459 int suppressed_types_after (suppressed_types_before | suppressed_types);
461 for (int conflict : {NotificationProblem | NotificationRecovery, NotificationFlappingStart | NotificationFlappingEnd}) {
462 /* E.g. problem and recovery notifications neutralize each other. */
464 if ((suppressed_types_after & conflict) == conflict) {
465 suppressed_types_after &= ~conflict;
469 if (suppressed_types_after != suppressed_types_before) {
470 SetSuppressedNotifications(suppressed_types_after);
/**
 * Executes the check command for this checkable with macros supplied by the
 * caller.
 *
 * A fresh check result is created and pre-populated with the scheduled start
 * (the current next-check time) and the execution start (the current time)
 * before it is handed to the check command.
 *
 * @param resolvedMacros Dictionary passed through to CheckCommand::Execute().
 *        NOTE(review): the final argument is `true` here while ExecuteCheck()
 *        passes `false`; presumably it tells the command to use the supplied
 *        macros as already resolved - confirm against CheckCommand::Execute().
 */
475 void Checkable::ExecuteRemoteCheck(const Dictionary::Ptr& resolvedMacros)
477 CONTEXT("Executing remote check for object '" + GetName() + "'");
479 double scheduled_start = GetNextCheck();
480 double before_check = Utility::GetTime();
482 CheckResult::Ptr cr = new CheckResult();
483 cr->SetScheduleStart(scheduled_start);
484 cr->SetExecutionStart(before_check);
486 GetCheckCommand()->Execute(this, cr, resolvedMacros, true);
/**
 * Runs the check command for this checkable, either directly or by sending an
 * event::ExecuteCommand message to the configured command endpoint.
 *
 * When the command endpoint is not connected (and the other visible
 * conditions hold), an UNKNOWN check result describing the missing connection
 * is processed instead.
 *
 * NOTE(review): the embedded source numbering skips several short lines in
 * this function (braces, guards, else branches, an early return), so the
 * added comments describe only the statements visible in this extract.
 */
489 void Checkable::ExecuteCheck()
491 CONTEXT("Executing check for object '" + GetName() + "'");
493 /* keep track of scheduling info in case the check type doesn't provide its own information */
494 double scheduled_start = GetNextCheck();
495 double before_check = Utility::GetTime();
497 /* This calls SetNextCheck() which updates the CheckerComponent's idle/pending
498 * queues and ensures that checks are not fired multiple times. ProcessCheckResult()
499 * is called too late. See #6421.
503 bool reachable = IsReachable();
506 ObjectLock olock(this);
508 /* don't run another check if there is one pending */
512 m_CheckRunning = true;
514 SetLastStateRaw(GetStateRaw());
/* NOTE(review): this assigns the last state type to itself. The line above copies the
 * current raw state, so GetStateType() may have been intended here - confirm. */
515 SetLastStateType(GetLastStateType());
516 SetLastReachable(reachable);
/* Pre-populate the result with the scheduling information captured above. */
519 CheckResult::Ptr cr = new CheckResult();
521 cr->SetScheduleStart(scheduled_start);
522 cr->SetExecutionStart(before_check);
/* The check counts as local when no command endpoint is set or it is the local endpoint. */
524 Endpoint::Ptr endpoint = GetCommandEndpoint();
525 bool local = !endpoint || endpoint == Endpoint::GetLocalEndpoint();
528 GetCheckCommand()->Execute(this, cr, nullptr, false);
/* Here Execute() receives a macro dictionary, which is shipped as "macros" in the message below. */
530 Dictionary::Ptr macros = new Dictionary();
531 GetCheckCommand()->Execute(this, cr, macros, false);
533 if (endpoint->GetConnected()) {
534 /* perform check on remote endpoint */
535 Dictionary::Ptr message = new Dictionary();
536 message->Set("jsonrpc", "2.0");
537 message->Set("method", "event::ExecuteCommand");
540 Service::Ptr service;
541 tie(host, service) = GetHostService(this);
543 Dictionary::Ptr params = new Dictionary();
544 message->Set("params", params);
545 params->Set("command_type", "check_command");
546 params->Set("command", GetCheckCommand()->GetName());
547 params->Set("host", host->GetName());
550 params->Set("service", service->GetShortName());
552 params->Set("macros", macros);
554 ApiListener::Ptr listener = ApiListener::GetInstance();
557 listener->SyncSendMessage(endpoint, message);
559 /* Re-schedule the check so we don't run it again until after we've received
560 * a check result from the remote instance. The check will be re-scheduled
561 * using the proper check interval once we've received a check result.
563 SetNextCheck(Utility::GetTime() + GetCheckCommand()->GetTimeout() + 30);
564 } else if (!endpoint->GetSyncing() && Application::GetInstance()->GetStartTime() < Utility::GetTime() - 300) {
565 /* fail to perform check on unconnected endpoint */
566 cr->SetState(ServiceUnknown);
568 String output = "Remote Icinga instance '" + endpoint->GetName() + "' is not connected to ";
570 Endpoint::Ptr localEndpoint = Endpoint::GetLocalEndpoint();
573 output += "'" + localEndpoint->GetName() + "'";
575 output += "this instance";
577 cr->SetOutput(output);
579 ProcessCheckResult(cr);
/* Clear the running flag again under the object lock. */
583 ObjectLock olock(this);
584 m_CheckRunning = false;
/**
 * Reports one check to the CIB statistics, using the host counters for
 * CheckableHost and the service counters for CheckableService. A warning
 * is logged for the "Unknown checkable type" case.
 *
 * NOTE(review): the condition that picks the active versus the passive
 * counter is not visible in this extract; presumably it is cr->GetActive() -
 * confirm.
 *
 * @param cr Check result whose schedule end is used as the timestamp
 *           (converted to time_t).
 * @param type Kind of checkable the result belongs to.
 */
589 void Checkable::UpdateStatistics(const CheckResult::Ptr& cr, CheckableType type)
591 time_t ts = cr->GetScheduleEnd();
593 if (type == CheckableHost) {
595 CIB::UpdateActiveHostChecksStatistics(ts, 1);
597 CIB::UpdatePassiveHostChecksStatistics(ts, 1);
598 } else if (type == CheckableService) {
600 CIB::UpdateActiveServiceChecksStatistics(ts, 1);
602 CIB::UpdatePassiveServiceChecksStatistics(ts, 1);
604 Log(LogWarning, "Checkable", "Unknown checkable type for statistic update.");
/**
 * Pending-check bookkeeping, performed with m_StatsMutex held.
 *
 * NOTE(review): only the lock is visible in this extract; presumably the
 * missing statement increments m_PendingChecks - confirm.
 */
608 void Checkable::IncreasePendingChecks()
610 boost::mutex::scoped_lock lock(m_StatsMutex);
/**
 * Pending-check bookkeeping, performed with m_StatsMutex held; afterwards one
 * thread waiting on m_PendingChecksCV (see AquirePendingCheckSlot()) is woken.
 *
 * NOTE(review): the statement between the lock and the notification is not
 * visible in this extract; presumably it decrements m_PendingChecks - confirm.
 */
614 void Checkable::DecreasePendingChecks()
616 boost::mutex::scoped_lock lock(m_StatsMutex);
618 m_PendingChecksCV.notify_one();
/**
 * Returns the number of pending checks.
 *
 * @returns The value of m_PendingChecks, read while holding m_StatsMutex.
 */
621 int Checkable::GetPendingChecks()
623 boost::mutex::scoped_lock lock(m_StatsMutex);
624 return m_PendingChecks;
/**
 * Blocks the calling thread for as long as the number of pending checks is at
 * or above the given limit.
 *
 * The wait happens on m_PendingChecksCV with m_StatsMutex held, and the
 * condition is re-checked after every wake-up. The visible code does not
 * change m_PendingChecks itself.
 *
 * @param maxPendingChecks Limit that m_PendingChecks must be below before the
 *        function returns.
 */
627 void Checkable::AquirePendingCheckSlot(int maxPendingChecks)
629 boost::mutex::scoped_lock lock(m_StatsMutex);
630 while (m_PendingChecks >= maxPendingChecks)
631 m_PendingChecksCV.wait(lock);