]> granicus.if.org Git - icinga2/blob - lib/icinga/checkable-check.cpp
Re-send suppressed notifications
[icinga2] / lib / icinga / checkable-check.cpp
1 /* Icinga 2 | (c) 2012 Icinga GmbH | GPLv2+ */
2
3 #include "icinga/checkable.hpp"
4 #include "icinga/service.hpp"
5 #include "icinga/host.hpp"
6 #include "icinga/checkcommand.hpp"
7 #include "icinga/icingaapplication.hpp"
8 #include "icinga/cib.hpp"
9 #include "icinga/clusterevents.hpp"
10 #include "remote/messageorigin.hpp"
11 #include "remote/apilistener.hpp"
12 #include "base/objectlock.hpp"
13 #include "base/logger.hpp"
14 #include "base/convert.hpp"
15 #include "base/utility.hpp"
16 #include "base/context.hpp"
17
18 using namespace icinga;
19
20 boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, const MessageOrigin::Ptr&)> Checkable::OnNewCheckResult;
21 boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, StateType, const MessageOrigin::Ptr&)> Checkable::OnStateChange;
22 boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, std::set<Checkable::Ptr>, const MessageOrigin::Ptr&)> Checkable::OnReachabilityChanged;
23 boost::signals2::signal<void (const Checkable::Ptr&, NotificationType, const CheckResult::Ptr&, const String&, const String&, const MessageOrigin::Ptr&)> Checkable::OnNotificationsRequested;
24 boost::signals2::signal<void (const Checkable::Ptr&)> Checkable::OnNextCheckUpdated;
25
26 boost::mutex Checkable::m_StatsMutex;
27 int Checkable::m_PendingChecks = 0;
28 boost::condition_variable Checkable::m_PendingChecksCV;
29
30 CheckCommand::Ptr Checkable::GetCheckCommand() const
31 {
32         return dynamic_pointer_cast<CheckCommand>(NavigateCheckCommandRaw());
33 }
34
35 TimePeriod::Ptr Checkable::GetCheckPeriod() const
36 {
37         return TimePeriod::GetByName(GetCheckPeriodRaw());
38 }
39
40 void Checkable::SetSchedulingOffset(long offset)
41 {
42         m_SchedulingOffset = offset;
43 }
44
45 long Checkable::GetSchedulingOffset()
46 {
47         return m_SchedulingOffset;
48 }
49
50 void Checkable::UpdateNextCheck(const MessageOrigin::Ptr& origin)
51 {
52         double interval;
53
54         if (GetStateType() == StateTypeSoft && GetLastCheckResult() != nullptr)
55                 interval = GetRetryInterval();
56         else
57                 interval = GetCheckInterval();
58
59         double now = Utility::GetTime();
60         double adj = 0;
61
62         if (interval > 1)
63                 adj = fmod(now * 100 + GetSchedulingOffset(), interval * 100) / 100.0;
64
65         if (adj != 0.0)
66                 adj = std::min(0.5 + fmod(GetSchedulingOffset(), interval * 5) / 100.0, adj);
67
68         double nextCheck = now - adj + interval;
69         double lastCheck = GetLastCheck();
70
71         Log(LogDebug, "Checkable")
72                 << "Update checkable '" << GetName() << "' with check interval '" << GetCheckInterval()
73                 << "' from last check time at " << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", (lastCheck < 0 ? 0 : lastCheck))
74                 << " (" << GetLastCheck() << ") to next check time at " << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", nextCheck) << " (" << nextCheck << ").";
75
76         SetNextCheck(nextCheck, false, origin);
77 }
78
79 bool Checkable::HasBeenChecked() const
80 {
81         return GetLastCheckResult() != nullptr;
82 }
83
84 double Checkable::GetLastCheck() const
85 {
86         CheckResult::Ptr cr = GetLastCheckResult();
87         double schedule_end = -1;
88
89         if (cr)
90                 schedule_end = cr->GetScheduleEnd();
91
92         return schedule_end;
93 }
94
95 void Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrigin::Ptr& origin)
96 {
97         {
98                 ObjectLock olock(this);
99                 m_CheckRunning = false;
100         }
101
102         if (!cr)
103                 return;
104
105         double now = Utility::GetTime();
106
107         if (cr->GetScheduleStart() == 0)
108                 cr->SetScheduleStart(now);
109
110         if (cr->GetScheduleEnd() == 0)
111                 cr->SetScheduleEnd(now);
112
113         if (cr->GetExecutionStart() == 0)
114                 cr->SetExecutionStart(now);
115
116         if (cr->GetExecutionEnd() == 0)
117                 cr->SetExecutionEnd(now);
118
119         if (!origin || origin->IsLocal())
120                 cr->SetCheckSource(IcingaApplication::GetInstance()->GetNodeName());
121
122         Endpoint::Ptr command_endpoint = GetCommandEndpoint();
123
124         /* override check source if command_endpoint was defined */
125         if (command_endpoint && !GetExtension("agent_check"))
126                 cr->SetCheckSource(command_endpoint->GetName());
127
128         /* agent checks go through the api */
129         if (command_endpoint && GetExtension("agent_check")) {
130                 ApiListener::Ptr listener = ApiListener::GetInstance();
131
132                 if (listener) {
133                         /* send message back to its origin */
134                         Dictionary::Ptr message = ClusterEvents::MakeCheckResultMessage(this, cr);
135                         listener->SyncSendMessage(command_endpoint, message);
136                 }
137
138                 return;
139
140         }
141
142         if (!IsActive())
143                 return;
144
145         bool reachable = IsReachable();
146         bool notification_reachable = IsReachable(DependencyNotification);
147
148         ObjectLock olock(this);
149
150         CheckResult::Ptr old_cr = GetLastCheckResult();
151         ServiceState old_state = GetStateRaw();
152         StateType old_stateType = GetStateType();
153         long old_attempt = GetCheckAttempt();
154         bool recovery = false;
155
156         /* When we have an check result already (not after fresh start),
157          * prevent to accept old check results and allow overrides for
158          * CRs happened in the future.
159          */
160         if (old_cr) {
161                 double currentCRTimestamp = old_cr->GetExecutionStart();
162                 double newCRTimestamp = cr->GetExecutionStart();
163
164                 /* Our current timestamp may be from the future (wrong server time adjusted again). Allow overrides here. */
165                 if (currentCRTimestamp > now) {
166                         /* our current CR is from the future, let the new CR override it. */
167                         Log(LogDebug, "Checkable")
168                                 << std::fixed << std::setprecision(6) << "Processing check result for checkable '" << GetName() << "' from "
169                                 << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", newCRTimestamp) << " (" << newCRTimestamp
170                                 << "). Overriding since ours is from the future at "
171                                 << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", currentCRTimestamp) << " (" << currentCRTimestamp << ").";
172                 } else {
173                         /* Current timestamp is from the past, but the new timestamp is even more in the past. Skip it. */
174                         if (newCRTimestamp < currentCRTimestamp) {
175                                 Log(LogDebug, "Checkable")
176                                         << std::fixed << std::setprecision(6) << "Skipping check result for checkable '" << GetName() << "' from "
177                                         << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", newCRTimestamp) << " (" << newCRTimestamp
178                                         << "). It is in the past compared to ours at "
179                                         << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", currentCRTimestamp) << " (" << currentCRTimestamp << ").";
180                                 return;
181                         }
182                 }
183         }
184
185         /* The ExecuteCheck function already sets the old state, but we need to do it again
186          * in case this was a passive check result. */
187         SetLastStateRaw(old_state);
188         SetLastStateType(old_stateType);
189         SetLastReachable(reachable);
190
191         Host::Ptr host;
192         Service::Ptr service;
193         tie(host, service) = GetHostService(this);
194
195         CheckableType checkableType = CheckableHost;
196         if (service)
197                 checkableType = CheckableService;
198
199         long attempt = 1;
200
201         std::set<Checkable::Ptr> children = GetChildren();
202
203         if (IsStateOK(cr->GetState())) {
204                 SetStateType(StateTypeHard); // NOT-OK -> HARD OK
205
206                 if (!IsStateOK(old_state))
207                         recovery = true;
208
209                 ResetNotificationNumbers();
210                 SaveLastState(ServiceOK, Utility::GetTime());
211
212                 /* update reachability for child objects in OK state */
213                 if (!children.empty())
214                         OnReachabilityChanged(this, cr, children, origin);
215         } else {
216                 /* OK -> NOT-OK change, first SOFT state. Reset attempt counter. */
217                 if (IsStateOK(old_state)) {
218                         SetStateType(StateTypeSoft);
219                         attempt = 1;
220                 }
221
222                 /* SOFT state change, increase attempt counter. */
223                 if (old_stateType == StateTypeSoft && !IsStateOK(old_state)) {
224                         SetStateType(StateTypeSoft);
225                         attempt = old_attempt + 1;
226                 }
227
228                 /* HARD state change (e.g. previously 2/3 and this next attempt). Reset attempt counter. */
229                 if (attempt >= GetMaxCheckAttempts()) {
230                         SetStateType(StateTypeHard);
231                         attempt = 1;
232                 }
233
234                 if (!IsStateOK(cr->GetState())) {
235                         SaveLastState(cr->GetState(), Utility::GetTime());
236                 }
237
238                 /* update reachability for child objects in NOT-OK state */
239                 if (!children.empty())
240                         OnReachabilityChanged(this, cr, children, origin);
241         }
242
243         if (!reachable)
244                 SetLastStateUnreachable(Utility::GetTime());
245
246         SetCheckAttempt(attempt);
247
248         ServiceState new_state = cr->GetState();
249         SetStateRaw(new_state);
250
251         bool stateChange;
252
253         /* Exception on state change calculation for hosts. */
254         if (checkableType == CheckableService)
255                 stateChange = (old_state != new_state);
256         else
257                 stateChange = (Host::CalculateState(old_state) != Host::CalculateState(new_state));
258
259         /* Store the current last state change for the next iteration. */
260         SetPreviousStateChange(GetLastStateChange());
261
262         if (stateChange) {
263                 SetLastStateChange(now);
264
265                 /* remove acknowledgements */
266                 if (GetAcknowledgement() == AcknowledgementNormal ||
267                         (GetAcknowledgement() == AcknowledgementSticky && IsStateOK(new_state))) {
268                         ClearAcknowledgement();
269                 }
270
271                 /* reschedule direct parents */
272                 for (const Checkable::Ptr& parent : GetParents()) {
273                         if (parent.get() == this)
274                                 continue;
275
276                         if (!parent->GetEnableActiveChecks())
277                                 continue;
278
279                         if (parent->GetNextCheck() >= now + parent->GetRetryInterval()) {
280                                 ObjectLock olock(parent);
281                                 parent->SetNextCheck(now);
282                         }
283                 }
284         }
285
286         bool remove_acknowledgement_comments = false;
287
288         if (GetAcknowledgement() == AcknowledgementNone)
289                 remove_acknowledgement_comments = true;
290
291         bool hardChange = (GetStateType() == StateTypeHard && old_stateType == StateTypeSoft);
292
293         if (stateChange && old_stateType == StateTypeHard && GetStateType() == StateTypeHard)
294                 hardChange = true;
295
296         bool is_volatile = GetVolatile();
297
298         if (hardChange || is_volatile) {
299                 SetLastHardStateRaw(new_state);
300                 SetLastHardStateChange(now);
301         }
302
303         if (!IsStateOK(new_state))
304                 TriggerDowntimes();
305
306         /* statistics for external tools */
307         Checkable::UpdateStatistics(cr, checkableType);
308
309         bool in_downtime = IsInDowntime();
310
311         bool send_notification = false;
312         bool suppress_notification = !notification_reachable || in_downtime || IsAcknowledged();
313
314         /* Send notifications whether when a hard state change occurred. */
315         if (hardChange && !(old_stateType == StateTypeSoft && IsStateOK(new_state)))
316                 send_notification = true;
317         /* Or if the checkable is volatile and in a HARD state. */
318         else if (is_volatile && GetStateType() == StateTypeHard)
319                 send_notification = true;
320
321         if (IsStateOK(old_state) && old_stateType == StateTypeSoft)
322                 send_notification = false; /* Don't send notifications for SOFT-OK -> HARD-OK. */
323
324         if (is_volatile && IsStateOK(old_state) && IsStateOK(new_state))
325                 send_notification = false; /* Don't send notifications for volatile OK -> OK changes. */
326
327         olock.Unlock();
328
329         if (remove_acknowledgement_comments)
330                 RemoveCommentsByType(CommentAcknowledgement);
331
332         Dictionary::Ptr vars_after = new Dictionary({
333                 { "state", new_state },
334                 { "state_type", GetStateType() },
335                 { "attempt", GetCheckAttempt() },
336                 { "reachable", reachable }
337         });
338
339         if (old_cr)
340                 cr->SetVarsBefore(old_cr->GetVarsAfter());
341
342         cr->SetVarsAfter(vars_after);
343
344         olock.Lock();
345         SetLastCheckResult(cr);
346
347         bool was_flapping = IsFlapping();
348
349         UpdateFlappingStatus(old_state != cr->GetState());
350
351         bool is_flapping = IsFlapping();
352
353         if (cr->GetActive()) {
354                 UpdateNextCheck(origin);
355         } else {
356                 /* Reschedule the next check for external passive check results. The side effect of
357                  * this is that for as long as we receive results for a service we
358                  * won't execute any active checks. */
359                 double offset;
360                 double ttl = cr->GetTtl();
361
362                 if (ttl > 0)
363                         offset = ttl;
364                 else
365                         offset = GetCheckInterval();
366
367                 SetNextCheck(Utility::GetTime() + offset, false, origin);
368         }
369
370         olock.Unlock();
371
372 #ifdef I2_DEBUG /* I2_DEBUG */
373         Log(LogDebug, "Checkable")
374                 << "Flapping: Checkable " << GetName()
375                 << " was: " << was_flapping
376                 << " is: " << is_flapping
377                 << " threshold low: " << GetFlappingThresholdLow()
378                 << " threshold high: " << GetFlappingThresholdHigh()
379                 << "% current: " << GetFlappingCurrent() << "%.";
380 #endif /* I2_DEBUG */
381
382         OnNewCheckResult(this, cr, origin);
383
384         /* signal status updates to for example db_ido */
385         OnStateChanged(this);
386
387         String old_state_str = (service ? Service::StateToString(old_state) : Host::StateToString(Host::CalculateState(old_state)));
388         String new_state_str = (service ? Service::StateToString(new_state) : Host::StateToString(Host::CalculateState(new_state)));
389
390         /* Whether a hard state change or a volatile state change except OK -> OK happened. */
391         if (hardChange || (is_volatile && !(IsStateOK(old_state) && IsStateOK(new_state)))) {
392                 OnStateChange(this, cr, StateTypeHard, origin);
393                 Log(LogNotice, "Checkable")
394                         << "State Change: Checkable '" << GetName() << "' hard state change from " << old_state_str << " to " << new_state_str << " detected." << (is_volatile ? " Checkable is volatile." : "");
395         }
396         /* Whether a state change happened or the state type is SOFT (must be logged too). */
397         else if (stateChange || GetStateType() == StateTypeSoft) {
398                 OnStateChange(this, cr, StateTypeSoft, origin);
399                 Log(LogNotice, "Checkable")
400                         << "State Change: Checkable '" << GetName() << "' soft state change from " << old_state_str << " to " << new_state_str << " detected.";
401         }
402
403         if (GetStateType() == StateTypeSoft || hardChange || recovery ||
404                 (is_volatile && !(IsStateOK(old_state) && IsStateOK(new_state))))
405                 ExecuteEventHandler();
406
407         int suppressed_types = 0;
408
409         /* Flapping start/end notifications */
410         if (!was_flapping && is_flapping) {
411                 /* FlappingStart notifications happen on state changes, not in downtimes */
412                 if (!IsPaused()) {
413                         if (in_downtime) {
414                                 suppressed_types |= NotificationFlappingStart;
415                         } else {
416                                 OnNotificationsRequested(this, NotificationFlappingStart, cr, "", "", nullptr);
417                         }
418                 }
419
420                 Log(LogNotice, "Checkable")
421                         << "Flapping Start: Checkable '" << GetName() << "' started flapping (Current flapping value "
422                         << GetFlappingCurrent() << "% > high threshold " << GetFlappingThresholdHigh() << "%).";
423
424                 NotifyFlapping(origin);
425         } else if (was_flapping && !is_flapping) {
426                 /* FlappingEnd notifications are independent from state changes, must not happen in downtine */
427                 if (!IsPaused()) {
428                         if (in_downtime) {
429                                 suppressed_types |= NotificationFlappingEnd;
430                         } else {
431                                 OnNotificationsRequested(this, NotificationFlappingEnd, cr, "", "", nullptr);
432                         }
433                 }
434
435                 Log(LogNotice, "Checkable")
436                         << "Flapping Stop: Checkable '" << GetName() << "' stopped flapping (Current flapping value "
437                         << GetFlappingCurrent() << "% < low threshold " << GetFlappingThresholdLow() << "%).";
438
439                 NotifyFlapping(origin);
440         }
441
442         if (send_notification && !is_flapping) {
443                 if (!IsPaused()) {
444                         if (suppress_notification) {
445                                 suppressed_types |= (recovery ? NotificationRecovery : NotificationProblem);
446                         } else {
447                                 OnNotificationsRequested(this, recovery ? NotificationRecovery : NotificationProblem, cr, "", "", nullptr);
448                         }
449                 }
450         }
451
452         if (suppressed_types) {
453                 /* If some notifications were suppressed, but just because of e.g. a downtime,
454                  * stash them into a notification types bitmask for maybe re-sending later.
455                  */
456
457                 ObjectLock olock (this);
458                 int suppressed_types_before (GetSuppressedNotifications());
459                 int suppressed_types_after (suppressed_types_before | suppressed_types);
460
461                 for (int conflict : {NotificationProblem | NotificationRecovery, NotificationFlappingStart | NotificationFlappingEnd}) {
462                         /* E.g. problem and recovery notifications neutralize each other. */
463
464                         if (suppressed_types_after & conflict == conflict) {
465                                 suppressed_types_after &= ~conflict;
466                         }
467                 }
468
469                 if (suppressed_types_after != suppressed_types_before) {
470                         SetSuppressedNotifications(suppressed_types_after);
471                 }
472         }
473 }
474
475 void Checkable::ExecuteRemoteCheck(const Dictionary::Ptr& resolvedMacros)
476 {
477         CONTEXT("Executing remote check for object '" + GetName() + "'");
478
479         double scheduled_start = GetNextCheck();
480         double before_check = Utility::GetTime();
481
482         CheckResult::Ptr cr = new CheckResult();
483         cr->SetScheduleStart(scheduled_start);
484         cr->SetExecutionStart(before_check);
485
486         GetCheckCommand()->Execute(this, cr, resolvedMacros, true);
487 }
488
489 void Checkable::ExecuteCheck()
490 {
491         CONTEXT("Executing check for object '" + GetName() + "'");
492
493         /* keep track of scheduling info in case the check type doesn't provide its own information */
494         double scheduled_start = GetNextCheck();
495         double before_check = Utility::GetTime();
496
497         /* This calls SetNextCheck() which updates the CheckerComponent's idle/pending
498          * queues and ensures that checks are not fired multiple times. ProcessCheckResult()
499          * is called too late. See #6421.
500          */
501         UpdateNextCheck();
502
503         bool reachable = IsReachable();
504
505         {
506                 ObjectLock olock(this);
507
508                 /* don't run another check if there is one pending */
509                 if (m_CheckRunning)
510                         return;
511
512                 m_CheckRunning = true;
513
514                 SetLastStateRaw(GetStateRaw());
515                 SetLastStateType(GetLastStateType());
516                 SetLastReachable(reachable);
517         }
518
519         CheckResult::Ptr cr = new CheckResult();
520
521         cr->SetScheduleStart(scheduled_start);
522         cr->SetExecutionStart(before_check);
523
524         Endpoint::Ptr endpoint = GetCommandEndpoint();
525         bool local = !endpoint || endpoint == Endpoint::GetLocalEndpoint();
526
527         if (local) {
528                 GetCheckCommand()->Execute(this, cr, nullptr, false);
529         } else {
530                 Dictionary::Ptr macros = new Dictionary();
531                 GetCheckCommand()->Execute(this, cr, macros, false);
532
533                 if (endpoint->GetConnected()) {
534                         /* perform check on remote endpoint */
535                         Dictionary::Ptr message = new Dictionary();
536                         message->Set("jsonrpc", "2.0");
537                         message->Set("method", "event::ExecuteCommand");
538
539                         Host::Ptr host;
540                         Service::Ptr service;
541                         tie(host, service) = GetHostService(this);
542
543                         Dictionary::Ptr params = new Dictionary();
544                         message->Set("params", params);
545                         params->Set("command_type", "check_command");
546                         params->Set("command", GetCheckCommand()->GetName());
547                         params->Set("host", host->GetName());
548
549                         if (service)
550                                 params->Set("service", service->GetShortName());
551
552                         params->Set("macros", macros);
553
554                         ApiListener::Ptr listener = ApiListener::GetInstance();
555
556                         if (listener)
557                                 listener->SyncSendMessage(endpoint, message);
558
559                         /* Re-schedule the check so we don't run it again until after we've received
560                          * a check result from the remote instance. The check will be re-scheduled
561                          * using the proper check interval once we've received a check result.
562                          */
563                         SetNextCheck(Utility::GetTime() + GetCheckCommand()->GetTimeout() + 30);
564                 } else if (!endpoint->GetSyncing() && Application::GetInstance()->GetStartTime() < Utility::GetTime() - 300) {
565                         /* fail to perform check on unconnected endpoint */
566                         cr->SetState(ServiceUnknown);
567
568                         String output = "Remote Icinga instance '" + endpoint->GetName() + "' is not connected to ";
569
570                         Endpoint::Ptr localEndpoint = Endpoint::GetLocalEndpoint();
571
572                         if (localEndpoint)
573                                 output += "'" + localEndpoint->GetName() + "'";
574                         else
575                                 output += "this instance";
576
577                         cr->SetOutput(output);
578
579                         ProcessCheckResult(cr);
580                 }
581
582                 {
583                         ObjectLock olock(this);
584                         m_CheckRunning = false;
585                 }
586         }
587 }
588
589 void Checkable::UpdateStatistics(const CheckResult::Ptr& cr, CheckableType type)
590 {
591         time_t ts = cr->GetScheduleEnd();
592
593         if (type == CheckableHost) {
594                 if (cr->GetActive())
595                         CIB::UpdateActiveHostChecksStatistics(ts, 1);
596                 else
597                         CIB::UpdatePassiveHostChecksStatistics(ts, 1);
598         } else if (type == CheckableService) {
599                 if (cr->GetActive())
600                         CIB::UpdateActiveServiceChecksStatistics(ts, 1);
601                 else
602                         CIB::UpdatePassiveServiceChecksStatistics(ts, 1);
603         } else {
604                 Log(LogWarning, "Checkable", "Unknown checkable type for statistic update.");
605         }
606 }
607
608 void Checkable::IncreasePendingChecks()
609 {
610         boost::mutex::scoped_lock lock(m_StatsMutex);
611         m_PendingChecks++;
612 }
613
614 void Checkable::DecreasePendingChecks()
615 {
616         boost::mutex::scoped_lock lock(m_StatsMutex);
617         m_PendingChecks--;
618         m_PendingChecksCV.notify_one();
619 }
620
621 int Checkable::GetPendingChecks()
622 {
623         boost::mutex::scoped_lock lock(m_StatsMutex);
624         return m_PendingChecks;
625 }
626
627 void Checkable::AquirePendingCheckSlot(int maxPendingChecks)
628 {
629         boost::mutex::scoped_lock lock(m_StatsMutex);
630         while (m_PendingChecks >= maxPendingChecks)
631                 m_PendingChecksCV.wait(lock);
632
633         m_PendingChecks++;
634 }