]> granicus.if.org Git - icinga2/blob - lib/icinga/checkable-check.cpp
Merge pull request #7527 from Icinga/bugfix/checkable-command-endpoint-zone
[icinga2] / lib / icinga / checkable-check.cpp
1 /* Icinga 2 | (c) 2012 Icinga GmbH | GPLv2+ */
2
3 #include "icinga/checkable.hpp"
4 #include "icinga/service.hpp"
5 #include "icinga/host.hpp"
6 #include "icinga/checkcommand.hpp"
7 #include "icinga/icingaapplication.hpp"
8 #include "icinga/cib.hpp"
9 #include "icinga/clusterevents.hpp"
10 #include "remote/messageorigin.hpp"
11 #include "remote/apilistener.hpp"
12 #include "base/objectlock.hpp"
13 #include "base/logger.hpp"
14 #include "base/convert.hpp"
15 #include "base/utility.hpp"
16 #include "base/context.hpp"
17
18 using namespace icinga;
19
/* Definitions of Checkable's static signals. Within this file OnNewCheckResult,
 * OnStateChange, OnReachabilityChanged and OnNotificationsRequested are emitted by
 * ProcessCheckResult(); OnNextCheckUpdated is not emitted in this file
 * (presumably raised where the next check time is set - TODO confirm). */
20 boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, const MessageOrigin::Ptr&)> Checkable::OnNewCheckResult;
21 boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, StateType, const MessageOrigin::Ptr&)> Checkable::OnStateChange;
22 boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, std::set<Checkable::Ptr>, const MessageOrigin::Ptr&)> Checkable::OnReachabilityChanged;
23 boost::signals2::signal<void (const Checkable::Ptr&, NotificationType, const CheckResult::Ptr&, const String&, const String&, const MessageOrigin::Ptr&)> Checkable::OnNotificationsRequested;
24 boost::signals2::signal<void (const Checkable::Ptr&)> Checkable::OnNextCheckUpdated;
25
/* Counter initialised to zero; it is not modified anywhere in this file. */
26 Atomic<uint_fast64_t> Checkable::CurrentConcurrentChecks (0);
27
/* m_StatsMutex guards m_PendingChecks; m_PendingChecksCV is waited on in
 * AquirePendingCheckSlot() and notified in DecreasePendingChecks(). */
28 boost::mutex Checkable::m_StatsMutex;
29 int Checkable::m_PendingChecks = 0;
30 boost::condition_variable Checkable::m_PendingChecksCV;
31
32 CheckCommand::Ptr Checkable::GetCheckCommand() const
33 {
34         return dynamic_pointer_cast<CheckCommand>(NavigateCheckCommandRaw());
35 }
36
37 TimePeriod::Ptr Checkable::GetCheckPeriod() const
38 {
39         return TimePeriod::GetByName(GetCheckPeriodRaw());
40 }
41
42 void Checkable::SetSchedulingOffset(long offset)
43 {
44         m_SchedulingOffset = offset;
45 }
46
47 long Checkable::GetSchedulingOffset()
48 {
49         return m_SchedulingOffset;
50 }
51
52 void Checkable::UpdateNextCheck(const MessageOrigin::Ptr& origin)
53 {
54         double interval;
55
56         if (GetStateType() == StateTypeSoft && GetLastCheckResult() != nullptr)
57                 interval = GetRetryInterval();
58         else
59                 interval = GetCheckInterval();
60
61         double now = Utility::GetTime();
62         double adj = 0;
63
64         if (interval > 1)
65                 adj = fmod(now * 100 + GetSchedulingOffset(), interval * 100) / 100.0;
66
67         if (adj != 0.0)
68                 adj = std::min(0.5 + fmod(GetSchedulingOffset(), interval * 5) / 100.0, adj);
69
70         double nextCheck = now - adj + interval;
71         double lastCheck = GetLastCheck();
72
73         Log(LogDebug, "Checkable")
74                 << "Update checkable '" << GetName() << "' with check interval '" << GetCheckInterval()
75                 << "' from last check time at " << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", (lastCheck < 0 ? 0 : lastCheck))
76                 << " (" << GetLastCheck() << ") to next check time at " << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", nextCheck) << " (" << nextCheck << ").";
77
78         SetNextCheck(nextCheck, false, origin);
79 }
80
81 bool Checkable::HasBeenChecked() const
82 {
83         return GetLastCheckResult() != nullptr;
84 }
85
86 double Checkable::GetLastCheck() const
87 {
88         CheckResult::Ptr cr = GetLastCheckResult();
89         double schedule_end = -1;
90
91         if (cr)
92                 schedule_end = cr->GetScheduleEnd();
93
94         return schedule_end;
95 }
96
/**
 * Processes a new check result for this checkable.
 *
 * In order, this function:
 *  - clears the "check running" flag (even when cr is null),
 *  - fills in any timestamps on cr that are still 0 and sets the check source,
 *  - for agent checks (a command endpoint plus the "agent_check" extension) sends the
 *    result to the command endpoint via the API listener and returns,
 *  - returns without further processing if IsActive() is false or if the result is
 *    older than the one currently stored,
 *  - updates state, state type, check attempt, acknowledgement, last (hard) state change
 *    and reschedules direct parents on a state change,
 *  - stores the result, updates flapping status and the next check time,
 *  - emits OnNewCheckResult / OnStateChanged / OnStateChange, runs the event handler and
 *    requests or stashes (suppressed) notifications.
 *
 * @param cr The check result to process. A null pointer only clears the running flag.
 * @param origin The origin of the result; passed through to the emitted signals.
 */
97 void Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrigin::Ptr& origin)
98 {
           /* Clear the running flag first so it is reset even on the early returns below. */
99         {
100                 ObjectLock olock(this);
101                 m_CheckRunning = false;
102         }
103
104         if (!cr)
105                 return;
106
107         double now = Utility::GetTime();
108
            /* Any timestamp still at 0 defaults to the current time. */
109         if (cr->GetScheduleStart() == 0)
110                 cr->SetScheduleStart(now);
111
112         if (cr->GetScheduleEnd() == 0)
113                 cr->SetScheduleEnd(now);
114
115         if (cr->GetExecutionStart() == 0)
116                 cr->SetExecutionStart(now);
117
118         if (cr->GetExecutionEnd() == 0)
119                 cr->SetExecutionEnd(now);
120
            /* Results without an origin, or with a local origin, are attributed to this node. */
121         if (!origin || origin->IsLocal())
122                 cr->SetCheckSource(IcingaApplication::GetInstance()->GetNodeName());
123
124         Endpoint::Ptr command_endpoint = GetCommandEndpoint();
125
126         /* override check source if command_endpoint was defined */
127         if (command_endpoint && !GetExtension("agent_check"))
128                 cr->SetCheckSource(command_endpoint->GetName());
129
130         /* agent checks go through the api */
131         if (command_endpoint && GetExtension("agent_check")) {
132                 ApiListener::Ptr listener = ApiListener::GetInstance();
133
134                 if (listener) {
135                         /* send message back to its origin */
136                         Dictionary::Ptr message = ClusterEvents::MakeCheckResultMessage(this, cr);
137                         listener->SyncSendMessage(command_endpoint, message);
138                 }
139
140                 return;
141
142         }
143
            /* No state processing happens when IsActive() reports false. */
144         if (!IsActive())
145                 return;
146
            /* Both reachability values are evaluated before the object lock is taken. */
147         bool reachable = IsReachable();
148         bool notification_reachable = IsReachable(DependencyNotification);
149
150         ObjectLock olock(this);
151
            /* Snapshot of the state before this result is applied. */
152         CheckResult::Ptr old_cr = GetLastCheckResult();
153         ServiceState old_state = GetStateRaw();
154         StateType old_stateType = GetStateType();
155         long old_attempt = GetCheckAttempt();
156         bool recovery = false;
157
158         /* When we already have a check result (not after a fresh start),
159          * refuse to accept older check results, but allow overriding a stored
160          * CR whose timestamp lies in the future.
161          */
162         if (old_cr) {
163                 double currentCRTimestamp = old_cr->GetExecutionStart();
164                 double newCRTimestamp = cr->GetExecutionStart();
165
166                 /* Our current timestamp may be from the future (wrong server time adjusted again). Allow overrides here. */
167                 if (currentCRTimestamp > now) {
168                         /* our current CR is from the future, let the new CR override it. */
169                         Log(LogDebug, "Checkable")
170                                 << std::fixed << std::setprecision(6) << "Processing check result for checkable '" << GetName() << "' from "
171                                 << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", newCRTimestamp) << " (" << newCRTimestamp
172                                 << "). Overriding since ours is from the future at "
173                                 << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", currentCRTimestamp) << " (" << currentCRTimestamp << ").";
174                 } else {
175                         /* Current timestamp is from the past, but the new timestamp is even more in the past. Skip it. */
176                         if (newCRTimestamp < currentCRTimestamp) {
177                                 Log(LogDebug, "Checkable")
178                                         << std::fixed << std::setprecision(6) << "Skipping check result for checkable '" << GetName() << "' from "
179                                         << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", newCRTimestamp) << " (" << newCRTimestamp
180                                         << "). It is in the past compared to ours at "
181                                         << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", currentCRTimestamp) << " (" << currentCRTimestamp << ").";
182                                 return;
183                         }
184                 }
185         }
186
187         /* The ExecuteCheck function already sets the old state, but we need to do it again
188          * in case this was a passive check result. */
189         SetLastStateRaw(old_state);
190         SetLastStateType(old_stateType);
191         SetLastReachable(reachable);
192
193         Host::Ptr host;
194         Service::Ptr service;
195         tie(host, service) = GetHostService(this);
196
            /* The checkable is treated as a service only if GetHostService() yielded one. */
197         CheckableType checkableType = CheckableHost;
198         if (service)
199                 checkableType = CheckableService;
200
            /* New check attempt; stays 1 unless the SOFT-state branch below increments it. */
201         long attempt = 1;
202
203         std::set<Checkable::Ptr> children = GetChildren();
204
205         if (IsStateOK(cr->GetState())) {
206                 SetStateType(StateTypeHard); // NOT-OK -> HARD OK
207
208                 if (!IsStateOK(old_state))
209                         recovery = true;
210
211                 ResetNotificationNumbers();
212                 SaveLastState(ServiceOK, Utility::GetTime());
213
214                 /* update reachability for child objects in OK state */
215                 if (!children.empty())
216                         OnReachabilityChanged(this, cr, children, origin);
217         } else {
218                 /* OK -> NOT-OK change, first SOFT state. Reset attempt counter. */
219                 if (IsStateOK(old_state)) {
220                         SetStateType(StateTypeSoft);
221                         attempt = 1;
222                 }
223
224                 /* SOFT state change, increase attempt counter. */
225                 if (old_stateType == StateTypeSoft && !IsStateOK(old_state)) {
226                         SetStateType(StateTypeSoft);
227                         attempt = old_attempt + 1;
228                 }
229
230                 /* HARD state change (e.g. previously 2/3 and this next attempt). Reset attempt counter. */
231                 if (attempt >= GetMaxCheckAttempts()) {
232                         SetStateType(StateTypeHard);
233                         attempt = 1;
234                 }
235
                    /* NOTE(review): this condition looks redundant - we are already in the
                     * else branch of the same IsStateOK(cr->GetState()) test above. */
236                 if (!IsStateOK(cr->GetState())) {
237                         SaveLastState(cr->GetState(), Utility::GetTime());
238                 }
239
240                 /* update reachability for child objects in NOT-OK state */
241                 if (!children.empty())
242                         OnReachabilityChanged(this, cr, children, origin);
243         }
244
245         if (!reachable)
246                 SetLastStateUnreachable(Utility::GetTime());
247
248         SetCheckAttempt(attempt);
249
250         ServiceState new_state = cr->GetState();
251         SetStateRaw(new_state);
252
253         bool stateChange;
254
255         /* Exception on state change calculation for hosts. */
256         if (checkableType == CheckableService)
257                 stateChange = (old_state != new_state);
258         else
259                 stateChange = (Host::CalculateState(old_state) != Host::CalculateState(new_state));
260
261         /* Store the current last state change for the next iteration. */
262         SetPreviousStateChange(GetLastStateChange());
263
264         if (stateChange) {
265                 SetLastStateChange(now);
266
267                 /* remove acknowledgements */
268                 if (GetAcknowledgement() == AcknowledgementNormal ||
269                         (GetAcknowledgement() == AcknowledgementSticky && IsStateOK(new_state))) {
270                         ClearAcknowledgement();
271                 }
272
273                 /* reschedule direct parents */
274                 for (const Checkable::Ptr& parent : GetParents()) {
275                         if (parent.get() == this)
276                                 continue;
277
278                         if (!parent->GetEnableActiveChecks())
279                                 continue;
280
                            /* Only pull the parent's check forward to now if it is currently scheduled
                             * at or after now + the parent's retry interval. */
281                         if (parent->GetNextCheck() >= now + parent->GetRetryInterval()) {
282                                 ObjectLock olock(parent);
283                                 parent->SetNextCheck(now);
284                         }
285                 }
286         }
287
288         bool remove_acknowledgement_comments = false;
289
290         if (GetAcknowledgement() == AcknowledgementNone)
291                 remove_acknowledgement_comments = true;
292
            /* A hard change is either SOFT -> HARD, or a state change while staying HARD. */
293         bool hardChange = (GetStateType() == StateTypeHard && old_stateType == StateTypeSoft);
294
295         if (stateChange && old_stateType == StateTypeHard && GetStateType() == StateTypeHard)
296                 hardChange = true;
297
298         bool is_volatile = GetVolatile();
299
300         if (hardChange || is_volatile) {
301                 SetLastHardStateRaw(new_state);
302                 SetLastHardStateChange(now);
303         }
304
305         if (!IsStateOK(new_state))
306                 TriggerDowntimes();
307
308         /* statistics for external tools */
309         Checkable::UpdateStatistics(cr, checkableType);
310
311         bool in_downtime = IsInDowntime();
312
313         bool send_notification = false;
314         bool suppress_notification = !notification_reachable || in_downtime || IsAcknowledged();
315
316         /* Send notifications when a hard state change occurred. */
317         if (hardChange && !(old_stateType == StateTypeSoft && IsStateOK(new_state)))
318                 send_notification = true;
319         /* Or if the checkable is volatile and in a HARD state. */
320         else if (is_volatile && GetStateType() == StateTypeHard)
321                 send_notification = true;
322
323         if (IsStateOK(old_state) && old_stateType == StateTypeSoft)
324                 send_notification = false; /* Don't send notifications for SOFT-OK -> HARD-OK. */
325
326         if (is_volatile && IsStateOK(old_state) && IsStateOK(new_state))
327                 send_notification = false; /* Don't send notifications for volatile OK -> OK changes. */
328
            /* The object lock is released here and re-acquired before SetLastCheckResult() below. */
329         olock.Unlock();
330
331         if (remove_acknowledgement_comments)
332                 RemoveCommentsByType(CommentAcknowledgement);
333
334         Dictionary::Ptr vars_after = new Dictionary({
335                 { "state", new_state },
336                 { "state_type", GetStateType() },
337                 { "attempt", GetCheckAttempt() },
338                 { "reachable", reachable }
339         });
340
            /* The previous result's "after" snapshot becomes this result's "before" snapshot. */
341         if (old_cr)
342                 cr->SetVarsBefore(old_cr->GetVarsAfter());
343
344         cr->SetVarsAfter(vars_after);
345
346         olock.Lock();
347         SetLastCheckResult(cr);
348
349         bool was_flapping = IsFlapping();
350
351         UpdateFlappingStatus(old_state != cr->GetState());
352
353         bool is_flapping = IsFlapping();
354
355         if (cr->GetActive()) {
356                 UpdateNextCheck(origin);
357         } else {
358                 /* Reschedule the next check for external passive check results. The side effect of
359                  * this is that for as long as we receive results for a service we
360                  * won't execute any active checks. */
361                 double offset;
362                 double ttl = cr->GetTtl();
363
364                 if (ttl > 0)
365                         offset = ttl;
366                 else
367                         offset = GetCheckInterval();
368
369                 SetNextCheck(Utility::GetTime() + offset, false, origin);
370         }
371
372         olock.Unlock();
373
374 #ifdef I2_DEBUG /* I2_DEBUG */
375         Log(LogDebug, "Checkable")
376                 << "Flapping: Checkable " << GetName()
377                 << " was: " << was_flapping
378                 << " is: " << is_flapping
379                 << " threshold low: " << GetFlappingThresholdLow()
380                 << " threshold high: " << GetFlappingThresholdHigh()
381                 << "% current: " << GetFlappingCurrent() << "%.";
382 #endif /* I2_DEBUG */
383
384         OnNewCheckResult(this, cr, origin);
385
386         /* signal status updates to for example db_ido */
387         OnStateChanged(this);
388
389         String old_state_str = (service ? Service::StateToString(old_state) : Host::StateToString(Host::CalculateState(old_state)));
390         String new_state_str = (service ? Service::StateToString(new_state) : Host::StateToString(Host::CalculateState(new_state)));
391
392         /* Whether a hard state change or a volatile state change except OK -> OK happened. */
393         if (hardChange || (is_volatile && !(IsStateOK(old_state) && IsStateOK(new_state)))) {
394                 OnStateChange(this, cr, StateTypeHard, origin);
395                 Log(LogNotice, "Checkable")
396                         << "State Change: Checkable '" << GetName() << "' hard state change from " << old_state_str << " to " << new_state_str << " detected." << (is_volatile ? " Checkable is volatile." : "");
397         }
398         /* Whether a state change happened or the state type is SOFT (must be logged too). */
399         else if (stateChange || GetStateType() == StateTypeSoft) {
400                 OnStateChange(this, cr, StateTypeSoft, origin);
401                 Log(LogNotice, "Checkable")
402                         << "State Change: Checkable '" << GetName() << "' soft state change from " << old_state_str << " to " << new_state_str << " detected.";
403         }
404
405         if (GetStateType() == StateTypeSoft || hardChange || recovery ||
406                 (is_volatile && !(IsStateOK(old_state) && IsStateOK(new_state))))
407                 ExecuteEventHandler();
408
            /* Bitmask of notification types that were due but withheld in this call. */
409         int suppressed_types = 0;
410
411         /* Flapping start/end notifications */
412         if (!was_flapping && is_flapping) {
413                 /* FlappingStart notifications happen on state changes, not in downtimes */
414                 if (!IsPaused()) {
415                         if (in_downtime) {
416                                 suppressed_types |= NotificationFlappingStart;
417                         } else {
418                                 OnNotificationsRequested(this, NotificationFlappingStart, cr, "", "", nullptr);
419                         }
420                 }
421
422                 Log(LogNotice, "Checkable")
423                         << "Flapping Start: Checkable '" << GetName() << "' started flapping (Current flapping value "
424                         << GetFlappingCurrent() << "% > high threshold " << GetFlappingThresholdHigh() << "%).";
425
426                 NotifyFlapping(origin);
427         } else if (was_flapping && !is_flapping) {
428                 /* FlappingEnd notifications are independent from state changes, must not happen in downtime */
429                 if (!IsPaused()) {
430                         if (in_downtime) {
431                                 suppressed_types |= NotificationFlappingEnd;
432                         } else {
433                                 OnNotificationsRequested(this, NotificationFlappingEnd, cr, "", "", nullptr);
434                         }
435                 }
436
437                 Log(LogNotice, "Checkable")
438                         << "Flapping Stop: Checkable '" << GetName() << "' stopped flapping (Current flapping value "
439                         << GetFlappingCurrent() << "% < low threshold " << GetFlappingThresholdLow() << "%).";
440
441                 NotifyFlapping(origin);
442         }
443
            /* Problem/recovery notifications are not requested while the checkable is flapping. */
444         if (send_notification && !is_flapping) {
445                 if (!IsPaused()) {
446                         if (suppress_notification) {
447                                 suppressed_types |= (recovery ? NotificationRecovery : NotificationProblem);
448                         } else {
449                                 OnNotificationsRequested(this, recovery ? NotificationRecovery : NotificationProblem, cr, "", "", nullptr);
450                         }
451                 }
452         }
453
454         if (suppressed_types) {
455                 /* If some notifications were suppressed, but just because of e.g. a downtime,
456                  * stash them into a notification types bitmask for maybe re-sending later.
457                  */
458
                    /* NOTE: this local olock shadows the function-level olock, which was
                     * unlocked above and is not re-locked before this point. */
459                 ObjectLock olock (this);
460                 int suppressed_types_before (GetSuppressedNotifications());
461                 int suppressed_types_after (suppressed_types_before | suppressed_types);
462
463                 for (int conflict : {NotificationProblem | NotificationRecovery, NotificationFlappingStart | NotificationFlappingEnd}) {
464                         /* E.g. problem and recovery notifications neutralize each other. */
465
466                         if ((suppressed_types_after & conflict) == conflict) {
467                                 suppressed_types_after &= ~conflict;
468                         }
469                 }
470
                    /* Only write the attribute back if the bitmask actually changed. */
471                 if (suppressed_types_after != suppressed_types_before) {
472                         SetSuppressedNotifications(suppressed_types_after);
473                 }
474         }
475 }
476
477 void Checkable::ExecuteRemoteCheck(const Dictionary::Ptr& resolvedMacros)
478 {
479         CONTEXT("Executing remote check for object '" + GetName() + "'");
480
481         double scheduled_start = GetNextCheck();
482         double before_check = Utility::GetTime();
483
484         CheckResult::Ptr cr = new CheckResult();
485         cr->SetScheduleStart(scheduled_start);
486         cr->SetExecutionStart(before_check);
487
488         GetCheckCommand()->Execute(this, cr, resolvedMacros, true);
489 }
490
491 void Checkable::ExecuteCheck()
492 {
493         CONTEXT("Executing check for object '" + GetName() + "'");
494
495         /* keep track of scheduling info in case the check type doesn't provide its own information */
496         double scheduled_start = GetNextCheck();
497         double before_check = Utility::GetTime();
498
499         /* This calls SetNextCheck() which updates the CheckerComponent's idle/pending
500          * queues and ensures that checks are not fired multiple times. ProcessCheckResult()
501          * is called too late. See #6421.
502          */
503         UpdateNextCheck();
504
505         bool reachable = IsReachable();
506
507         {
508                 ObjectLock olock(this);
509
510                 /* don't run another check if there is one pending */
511                 if (m_CheckRunning)
512                         return;
513
514                 m_CheckRunning = true;
515
516                 SetLastStateRaw(GetStateRaw());
517                 SetLastStateType(GetLastStateType());
518                 SetLastReachable(reachable);
519         }
520
521         CheckResult::Ptr cr = new CheckResult();
522
523         cr->SetScheduleStart(scheduled_start);
524         cr->SetExecutionStart(before_check);
525
526         Endpoint::Ptr endpoint = GetCommandEndpoint();
527         bool local = !endpoint || endpoint == Endpoint::GetLocalEndpoint();
528
529         if (local) {
530                 GetCheckCommand()->Execute(this, cr, nullptr, false);
531         } else {
532                 Dictionary::Ptr macros = new Dictionary();
533                 GetCheckCommand()->Execute(this, cr, macros, false);
534
535                 if (endpoint->GetConnected()) {
536                         /* perform check on remote endpoint */
537                         Dictionary::Ptr message = new Dictionary();
538                         message->Set("jsonrpc", "2.0");
539                         message->Set("method", "event::ExecuteCommand");
540
541                         Host::Ptr host;
542                         Service::Ptr service;
543                         tie(host, service) = GetHostService(this);
544
545                         Dictionary::Ptr params = new Dictionary();
546                         message->Set("params", params);
547                         params->Set("command_type", "check_command");
548                         params->Set("command", GetCheckCommand()->GetName());
549                         params->Set("host", host->GetName());
550
551                         if (service)
552                                 params->Set("service", service->GetShortName());
553
554                         params->Set("macros", macros);
555
556                         ApiListener::Ptr listener = ApiListener::GetInstance();
557
558                         if (listener)
559                                 listener->SyncSendMessage(endpoint, message);
560
561                         /* Re-schedule the check so we don't run it again until after we've received
562                          * a check result from the remote instance. The check will be re-scheduled
563                          * using the proper check interval once we've received a check result.
564                          */
565                         SetNextCheck(Utility::GetTime() + GetCheckCommand()->GetTimeout() + 30);
566                 } else if (!endpoint->GetSyncing() && Application::GetInstance()->GetStartTime() < Utility::GetTime() - 300) {
567                         /* fail to perform check on unconnected endpoint */
568                         cr->SetState(ServiceUnknown);
569
570                         String output = "Remote Icinga instance '" + endpoint->GetName() + "' is not connected to ";
571
572                         Endpoint::Ptr localEndpoint = Endpoint::GetLocalEndpoint();
573
574                         if (localEndpoint)
575                                 output += "'" + localEndpoint->GetName() + "'";
576                         else
577                                 output += "this instance";
578
579                         cr->SetOutput(output);
580
581                         ProcessCheckResult(cr);
582                 }
583
584                 {
585                         ObjectLock olock(this);
586                         m_CheckRunning = false;
587                 }
588         }
589 }
590
591 void Checkable::UpdateStatistics(const CheckResult::Ptr& cr, CheckableType type)
592 {
593         time_t ts = cr->GetScheduleEnd();
594
595         if (type == CheckableHost) {
596                 if (cr->GetActive())
597                         CIB::UpdateActiveHostChecksStatistics(ts, 1);
598                 else
599                         CIB::UpdatePassiveHostChecksStatistics(ts, 1);
600         } else if (type == CheckableService) {
601                 if (cr->GetActive())
602                         CIB::UpdateActiveServiceChecksStatistics(ts, 1);
603                 else
604                         CIB::UpdatePassiveServiceChecksStatistics(ts, 1);
605         } else {
606                 Log(LogWarning, "Checkable", "Unknown checkable type for statistic update.");
607         }
608 }
609
610 void Checkable::IncreasePendingChecks()
611 {
612         boost::mutex::scoped_lock lock(m_StatsMutex);
613         m_PendingChecks++;
614 }
615
616 void Checkable::DecreasePendingChecks()
617 {
618         boost::mutex::scoped_lock lock(m_StatsMutex);
619         m_PendingChecks--;
620         m_PendingChecksCV.notify_one();
621 }
622
623 int Checkable::GetPendingChecks()
624 {
625         boost::mutex::scoped_lock lock(m_StatsMutex);
626         return m_PendingChecks;
627 }
628
629 void Checkable::AquirePendingCheckSlot(int maxPendingChecks)
630 {
631         boost::mutex::scoped_lock lock(m_StatsMutex);
632         while (m_PendingChecks >= maxPendingChecks)
633                 m_PendingChecksCV.wait(lock);
634
635         m_PendingChecks++;
636 }