]> granicus.if.org Git - icinga2/blob - lib/perfdata/influxdbwriter.cpp
Merge pull request #7527 from Icinga/bugfix/checkable-command-endpoint-zone
[icinga2] / lib / perfdata / influxdbwriter.cpp
1 /* Icinga 2 | (c) 2012 Icinga GmbH | GPLv2+ */
2
3 #include "perfdata/influxdbwriter.hpp"
4 #include "perfdata/influxdbwriter-ti.cpp"
5 #include "remote/url.hpp"
6 #include "icinga/service.hpp"
7 #include "icinga/macroprocessor.hpp"
8 #include "icinga/icingaapplication.hpp"
9 #include "icinga/checkcommand.hpp"
10 #include "base/application.hpp"
11 #include "base/defer.hpp"
12 #include "base/io-engine.hpp"
13 #include "base/tcpsocket.hpp"
14 #include "base/configtype.hpp"
15 #include "base/objectlock.hpp"
16 #include "base/logger.hpp"
17 #include "base/convert.hpp"
18 #include "base/utility.hpp"
19 #include "base/perfdatavalue.hpp"
20 #include "base/stream.hpp"
21 #include "base/json.hpp"
22 #include "base/networkstream.hpp"
23 #include "base/exception.hpp"
24 #include "base/statsfunction.hpp"
25 #include "base/tlsutility.hpp"
26 #include <boost/algorithm/string.hpp>
27 #include <boost/algorithm/string/replace.hpp>
28 #include <boost/asio/ssl/context.hpp>
29 #include <boost/beast/core/flat_buffer.hpp>
30 #include <boost/beast/http/field.hpp>
31 #include <boost/beast/http/message.hpp>
32 #include <boost/beast/http/parser.hpp>
33 #include <boost/beast/http/read.hpp>
34 #include <boost/beast/http/status.hpp>
35 #include <boost/beast/http/string_body.hpp>
36 #include <boost/beast/http/verb.hpp>
37 #include <boost/beast/http/write.hpp>
38 #include <boost/math/special_functions/fpclassify.hpp>
39 #include <boost/regex.hpp>
40 #include <boost/scoped_array.hpp>
41 #include <memory>
42 #include <string>
43 #include <utility>
44
45 using namespace icinga;
46
47 class InfluxdbInteger final : public Object
48 {
49 public:
50         DECLARE_PTR_TYPEDEFS(InfluxdbInteger);
51
52         InfluxdbInteger(int value)
53                 : m_Value(value)
54         { }
55
56         int GetValue() const
57         {
58                 return m_Value;
59         }
60
61 private:
62         int m_Value;
63 };
64
/* Type registration for InfluxdbWriter (macro is defined outside this file). */
REGISTER_TYPE(InfluxdbWriter);

/* Hooks InfluxdbWriter::StatsFunc up as this type's stats function. */
REGISTER_STATSFUNCTION(InfluxdbWriter, &InfluxdbWriter::StatsFunc);
68
69 void InfluxdbWriter::OnConfigLoaded()
70 {
71         ObjectImpl<InfluxdbWriter>::OnConfigLoaded();
72
73         m_WorkQueue.SetName("InfluxdbWriter, " + GetName());
74
75         if (!GetEnableHa()) {
76                 Log(LogDebug, "InfluxdbWriter")
77                         << "HA functionality disabled. Won't pause connection: " << GetName();
78
79                 SetHAMode(HARunEverywhere);
80         } else {
81                 SetHAMode(HARunOnce);
82         }
83 }
84
85 void InfluxdbWriter::StatsFunc(const Dictionary::Ptr& status, const Array::Ptr& perfdata)
86 {
87         DictionaryData nodes;
88
89         for (const InfluxdbWriter::Ptr& influxdbwriter : ConfigType::GetObjectsByType<InfluxdbWriter>()) {
90                 size_t workQueueItems = influxdbwriter->m_WorkQueue.GetLength();
91                 double workQueueItemRate = influxdbwriter->m_WorkQueue.GetTaskCount(60) / 60.0;
92                 size_t dataBufferItems = influxdbwriter->m_DataBuffer.size();
93
94                 nodes.emplace_back(influxdbwriter->GetName(), new Dictionary({
95                         { "work_queue_items", workQueueItems },
96                         { "work_queue_item_rate", workQueueItemRate },
97                         { "data_buffer_items", dataBufferItems }
98                 }));
99
100                 perfdata->Add(new PerfdataValue("influxdbwriter_" + influxdbwriter->GetName() + "_work_queue_items", workQueueItems));
101                 perfdata->Add(new PerfdataValue("influxdbwriter_" + influxdbwriter->GetName() + "_work_queue_item_rate", workQueueItemRate));
102                 perfdata->Add(new PerfdataValue("influxdbwriter_" + influxdbwriter->GetName() + "_data_queue_items", dataBufferItems));
103         }
104
105         status->Set("influxdbwriter", new Dictionary(std::move(nodes)));
106 }
107
108 void InfluxdbWriter::Resume()
109 {
110         ObjectImpl<InfluxdbWriter>::Resume();
111
112         Log(LogInformation, "InfluxdbWriter")
113                 << "'" << GetName() << "' resumed.";
114
115         /* Register exception handler for WQ tasks. */
116         m_WorkQueue.SetExceptionCallback(std::bind(&InfluxdbWriter::ExceptionHandler, this, _1));
117
118         /* Setup timer for periodically flushing m_DataBuffer */
119         m_FlushTimer = new Timer();
120         m_FlushTimer->SetInterval(GetFlushInterval());
121         m_FlushTimer->OnTimerExpired.connect(std::bind(&InfluxdbWriter::FlushTimeout, this));
122         m_FlushTimer->Start();
123         m_FlushTimer->Reschedule(0);
124
125         /* Register for new metrics. */
126         Checkable::OnNewCheckResult.connect(std::bind(&InfluxdbWriter::CheckResultHandler, this, _1, _2));
127 }
128
129 /* Pause is equivalent to Stop, but with HA capabilities to resume at runtime. */
130 void InfluxdbWriter::Pause()
131 {
132         /* Force a flush. */
133         Log(LogDebug, "InfluxdbWriter")
134                 << "Flushing pending data buffers.";
135
136         Flush();
137
138         /* Work on the missing tasks. TODO: Find a way to cache them on disk. */
139         Log(LogDebug, "InfluxdbWriter")
140                 << "Joining existing WQ tasks.";
141
142         m_WorkQueue.Join();
143
144         /* Flush again after the WQ tasks have filled the data buffer. */
145         Log(LogDebug, "InfluxdbWriter")
146                 << "Flushing data buffers from WQ tasks.";
147
148         Flush();
149
150         Log(LogInformation, "InfluxdbWriter")
151                 << "'" << GetName() << "' paused.";
152
153         ObjectImpl<InfluxdbWriter>::Pause();
154 }
155
/**
 * Asserts that m_WorkQueue reports the calling thread as one of its worker
 * threads. Called at the top of the *WQ() task functions.
 */
void InfluxdbWriter::AssertOnWorkQueue()
{
        ASSERT(m_WorkQueue.IsWorkerThread());
}
160
161 void InfluxdbWriter::ExceptionHandler(boost::exception_ptr exp)
162 {
163         Log(LogCritical, "InfluxdbWriter", "Exception during InfluxDB operation: Verify that your backend is operational!");
164
165         Log(LogDebug, "InfluxdbWriter")
166                 << "Exception during InfluxDB operation: " << DiagnosticInformation(std::move(exp));
167
168         //TODO: Close the connection, if we keep it open.
169 }
170
171 OptionalTlsStream InfluxdbWriter::Connect()
172 {
173         Log(LogNotice, "InfluxdbWriter")
174                 << "Reconnecting to InfluxDB on host '" << GetHost() << "' port '" << GetPort() << "'.";
175
176         OptionalTlsStream stream;
177         bool ssl = GetSslEnable();
178
179         if (ssl) {
180                 std::shared_ptr<boost::asio::ssl::context> sslContext;
181
182                 try {
183                         sslContext = MakeAsioSslContext(GetSslCert(), GetSslKey(), GetSslCaCert());
184                 } catch (const std::exception& ex) {
185                         Log(LogWarning, "InfluxdbWriter")
186                                 << "Unable to create SSL context.";
187                         throw;
188                 }
189
190                 stream.first = std::make_shared<AsioTlsStream>(IoEngine::Get().GetIoContext(), *sslContext, GetHost());
191         } else {
192                 stream.second = std::make_shared<AsioTcpStream>(IoEngine::Get().GetIoContext());
193         }
194
195         try {
196                 icinga::Connect(ssl ? stream.first->lowest_layer() : stream.second->lowest_layer(), GetHost(), GetPort());
197         } catch (const std::exception& ex) {
198                 Log(LogWarning, "InfluxdbWriter")
199                         << "Can't connect to InfluxDB on host '" << GetHost() << "' port '" << GetPort() << "'.";
200                 throw;
201         }
202
203         if (ssl) {
204                 auto& tlsStream (stream.first->next_layer());
205
206                 try {
207                         tlsStream.handshake(tlsStream.client);
208                 } catch (const std::exception& ex) {
209                         Log(LogWarning, "InfluxdbWriter")
210                                 << "TLS handshake with host '" << GetHost() << "' failed.";
211                         throw;
212                 }
213         }
214
215         return std::move(stream);
216 }
217
218 void InfluxdbWriter::CheckResultHandler(const Checkable::Ptr& checkable, const CheckResult::Ptr& cr)
219 {
220         if (IsPaused())
221                 return;
222
223         m_WorkQueue.Enqueue(std::bind(&InfluxdbWriter::CheckResultHandlerWQ, this, checkable, cr), PriorityLow);
224 }
225
226 void InfluxdbWriter::CheckResultHandlerWQ(const Checkable::Ptr& checkable, const CheckResult::Ptr& cr)
227 {
228         AssertOnWorkQueue();
229
230         CONTEXT("Processing check result for '" + checkable->GetName() + "'");
231
232         if (!IcingaApplication::GetInstance()->GetEnablePerfdata() || !checkable->GetEnablePerfdata())
233                 return;
234
235         Host::Ptr host;
236         Service::Ptr service;
237         tie(host, service) = GetHostService(checkable);
238
239         MacroProcessor::ResolverList resolvers;
240         if (service)
241                 resolvers.emplace_back("service", service);
242         resolvers.emplace_back("host", host);
243         resolvers.emplace_back("icinga", IcingaApplication::GetInstance());
244
245         String prefix;
246
247         double ts = cr->GetExecutionEnd();
248
249         // Clone the template and perform an in-place macro expansion of measurement and tag values
250         Dictionary::Ptr tmpl_clean = service ? GetServiceTemplate() : GetHostTemplate();
251         Dictionary::Ptr tmpl = static_pointer_cast<Dictionary>(tmpl_clean->ShallowClone());
252         tmpl->Set("measurement", MacroProcessor::ResolveMacros(tmpl->Get("measurement"), resolvers, cr));
253
254         Dictionary::Ptr tagsClean = tmpl->Get("tags");
255         if (tagsClean) {
256                 Dictionary::Ptr tags = new Dictionary();
257
258                 {
259                         ObjectLock olock(tagsClean);
260                         for (const Dictionary::Pair& pair : tagsClean) {
261                                 String missing_macro;
262                                 Value value = MacroProcessor::ResolveMacros(pair.second, resolvers, cr, &missing_macro);
263
264                                 if (missing_macro.IsEmpty()) {
265                                         tags->Set(pair.first, value);
266                                 }
267                         }
268                 }
269
270                 tmpl->Set("tags", tags);
271         }
272
273         CheckCommand::Ptr checkCommand = checkable->GetCheckCommand();
274
275         Array::Ptr perfdata = cr->GetPerformanceData();
276
277         if (perfdata) {
278                 ObjectLock olock(perfdata);
279                 for (const Value& val : perfdata) {
280                         PerfdataValue::Ptr pdv;
281
282                         if (val.IsObjectType<PerfdataValue>())
283                                 pdv = val;
284                         else {
285                                 try {
286                                         pdv = PerfdataValue::Parse(val);
287                                 } catch (const std::exception&) {
288                                         Log(LogWarning, "InfluxdbWriter")
289                                                 << "Ignoring invalid perfdata for checkable '"
290                                                 << checkable->GetName() << "' and command '"
291                                                 << checkCommand->GetName() << "' with value: " << val;
292                                         continue;
293                                 }
294                         }
295
296                         Dictionary::Ptr fields = new Dictionary();
297                         fields->Set("value", pdv->GetValue());
298
299                         if (GetEnableSendThresholds()) {
300                                 if (pdv->GetCrit())
301                                         fields->Set("crit", pdv->GetCrit());
302                                 if (pdv->GetWarn())
303                                         fields->Set("warn", pdv->GetWarn());
304                                 if (pdv->GetMin())
305                                         fields->Set("min", pdv->GetMin());
306                                 if (pdv->GetMax())
307                                         fields->Set("max", pdv->GetMax());
308                         }
309                         if (!pdv->GetUnit().IsEmpty()) {
310                                 fields->Set("unit", pdv->GetUnit());
311                         }
312
313                         SendMetric(checkable, tmpl, pdv->GetLabel(), fields, ts);
314                 }
315         }
316
317         if (GetEnableSendMetadata()) {
318                 Host::Ptr host;
319                 Service::Ptr service;
320                 tie(host, service) = GetHostService(checkable);
321
322                 Dictionary::Ptr fields = new Dictionary();
323
324                 if (service)
325                         fields->Set("state", new InfluxdbInteger(service->GetState()));
326                 else
327                         fields->Set("state", new InfluxdbInteger(host->GetState()));
328
329                 fields->Set("current_attempt", new InfluxdbInteger(checkable->GetCheckAttempt()));
330                 fields->Set("max_check_attempts", new InfluxdbInteger(checkable->GetMaxCheckAttempts()));
331                 fields->Set("state_type", new InfluxdbInteger(checkable->GetStateType()));
332                 fields->Set("reachable", checkable->IsReachable());
333                 fields->Set("downtime_depth", new InfluxdbInteger(checkable->GetDowntimeDepth()));
334                 fields->Set("acknowledgement", new InfluxdbInteger(checkable->GetAcknowledgement()));
335                 fields->Set("latency", cr->CalculateLatency());
336                 fields->Set("execution_time", cr->CalculateExecutionTime());
337
338                 SendMetric(checkable, tmpl, Empty, fields, ts);
339         }
340 }
341
342 String InfluxdbWriter::EscapeKeyOrTagValue(const String& str)
343 {
344         // Iterate over the key name and escape commas and spaces with a backslash
345         String result = str;
346         boost::algorithm::replace_all(result, "\"", "\\\"");
347         boost::algorithm::replace_all(result, "=", "\\=");
348         boost::algorithm::replace_all(result, ",", "\\,");
349         boost::algorithm::replace_all(result, " ", "\\ ");
350
351         // InfluxDB 'feature': although backslashes are allowed in keys they also act
352         // as escape sequences when followed by ',' or ' '.  When your tag is like
353         // 'metric=C:\' bad things happen.  Backslashes themselves cannot be escaped
354         // and through experimentation they also escape '='.  To be safe we replace
355         // trailing backslashes with and underscore.
356         // See https://github.com/influxdata/influxdb/issues/8587 for more info
357         size_t length = result.GetLength();
358         if (result[length - 1] == '\\')
359                 result[length - 1] = '_';
360
361         return result;
362 }
363
364 String InfluxdbWriter::EscapeValue(const Value& value)
365 {
366         if (value.IsObjectType<InfluxdbInteger>()) {
367                 std::ostringstream os;
368                 os << static_cast<InfluxdbInteger::Ptr>(value)->GetValue() << "i";
369                 return os.str();
370         }
371
372         if (value.IsBoolean())
373                 return value ? "true" : "false";
374
375         if (value.IsString())
376                 return "\"" + EscapeKeyOrTagValue(value) + "\"";
377
378         return value;
379 }
380
381 void InfluxdbWriter::SendMetric(const Checkable::Ptr& checkable, const Dictionary::Ptr& tmpl,
382         const String& label, const Dictionary::Ptr& fields, double ts)
383 {
384         std::ostringstream msgbuf;
385         msgbuf << EscapeKeyOrTagValue(tmpl->Get("measurement"));
386
387         Dictionary::Ptr tags = tmpl->Get("tags");
388         if (tags) {
389                 ObjectLock olock(tags);
390                 for (const Dictionary::Pair& pair : tags) {
391                         // Empty macro expansion, no tag
392                         if (!pair.second.IsEmpty()) {
393                                 msgbuf << "," << EscapeKeyOrTagValue(pair.first) << "=" << EscapeKeyOrTagValue(pair.second);
394                         }
395                 }
396         }
397
398         // Label may be empty in the case of metadata
399         if (!label.IsEmpty())
400                 msgbuf << ",metric=" << EscapeKeyOrTagValue(label);
401
402         msgbuf << " ";
403
404         {
405                 bool first = true;
406
407                 ObjectLock fieldLock(fields);
408                 for (const Dictionary::Pair& pair : fields) {
409                         if (first)
410                                 first = false;
411                         else
412                                 msgbuf << ",";
413
414                         msgbuf << EscapeKeyOrTagValue(pair.first) << "=" << EscapeValue(pair.second);
415                 }
416         }
417
418         msgbuf << " " <<  static_cast<unsigned long>(ts);
419
420         Log(LogDebug, "InfluxdbWriter")
421                 << "Checkable '" << checkable->GetName() << "' adds to metric list:'" << msgbuf.str() << "'.";
422
423         // Buffer the data point
424         m_DataBuffer.emplace_back(msgbuf.str());
425
426         // Flush if we've buffered too much to prevent excessive memory use
427         if (static_cast<int>(m_DataBuffer.size()) >= GetFlushThreshold()) {
428                 Log(LogDebug, "InfluxdbWriter")
429                         << "Data buffer overflow writing " << m_DataBuffer.size() << " data points";
430
431                 try {
432                         Flush();
433                 } catch (...) {
434                         /* Do nothing. */
435                 }
436         }
437 }
438
439 void InfluxdbWriter::FlushTimeout()
440 {
441         m_WorkQueue.Enqueue(boost::bind(&InfluxdbWriter::FlushTimeoutWQ, this), PriorityHigh);
442 }
443
444 void InfluxdbWriter::FlushTimeoutWQ()
445 {
446         AssertOnWorkQueue();
447
448         Log(LogDebug, "InfluxdbWriter")
449                 << "Timer expired writing " << m_DataBuffer.size() << " data points";
450
451         Flush();
452 }
453
454 void InfluxdbWriter::Flush()
455 {
456         namespace beast = boost::beast;
457         namespace http = beast::http;
458
459         /* Flush can be called from 1) Timeout 2) Threshold 3) on shutdown/reload. */
460         if (m_DataBuffer.empty())
461                 return;
462
463         Log(LogDebug, "InfluxdbWriter")
464                 << "Flushing data buffer to InfluxDB.";
465
466         String body = boost::algorithm::join(m_DataBuffer, "\n");
467         m_DataBuffer.clear();
468
469         OptionalTlsStream stream;
470
471         try {
472                 stream = Connect();
473         } catch (const std::exception& ex) {
474                 Log(LogWarning, "InfluxDbWriter")
475                         << "Flush failed, cannot connect to InfluxDB: " << DiagnosticInformation(ex, false);
476                 return;
477         }
478
479         Defer s ([&stream]() {
480                 if (stream.first) {
481                         stream.first->next_layer().shutdown();
482                 }
483         });
484
485         Url::Ptr url = new Url();
486         url->SetScheme(GetSslEnable() ? "https" : "http");
487         url->SetHost(GetHost());
488         url->SetPort(GetPort());
489
490         std::vector<String> path;
491         path.emplace_back("write");
492         url->SetPath(path);
493
494         url->AddQueryElement("db", GetDatabase());
495         url->AddQueryElement("precision", "s");
496         if (!GetUsername().IsEmpty())
497                 url->AddQueryElement("u", GetUsername());
498         if (!GetPassword().IsEmpty())
499                 url->AddQueryElement("p", GetPassword());
500
501         http::request<http::string_body> request (http::verb::post, std::string(url->Format(true)), 10);
502
503         request.set(http::field::user_agent, "Icinga/" + Application::GetAppVersion());
504         request.set(http::field::host, url->GetHost() + ":" + url->GetPort());
505
506         request.body() = body;
507         request.set(http::field::content_length, request.body().size());
508
509         try {
510                 if (stream.first) {
511                         http::write(*stream.first, request);
512                         stream.first->flush();
513                 } else {
514                         http::write(*stream.second, request);
515                         stream.second->flush();
516                 }
517         } catch (const std::exception& ex) {
518                 Log(LogWarning, "InfluxdbWriter")
519                         << "Cannot write to TCP socket on host '" << GetHost() << "' port '" << GetPort() << "'.";
520                 throw;
521         }
522
523         http::parser<false, http::string_body> parser;
524         beast::flat_buffer buf;
525
526         try {
527                 if (stream.first) {
528                         http::read(*stream.first, buf, parser);
529                 } else {
530                         http::read(*stream.second, buf, parser);
531                 }
532         } catch (const std::exception& ex) {
533                 Log(LogWarning, "InfluxdbWriter")
534                         << "Failed to parse HTTP response from host '" << GetHost() << "' port '" << GetPort() << "': " << DiagnosticInformation(ex);
535                 throw;
536         }
537
538         auto& response (parser.get());
539
540         if (response.result() != http::status::no_content) {
541                 Log(LogWarning, "InfluxdbWriter")
542                         << "Unexpected response code: " << response.result();
543
544                 auto& contentType (response[http::field::content_type]);
545                 if (contentType != "application/json") {
546                         Log(LogWarning, "InfluxdbWriter")
547                                 << "Unexpected Content-Type: " << contentType;
548                         return;
549                 }
550
551                 Dictionary::Ptr jsonResponse;
552                 auto& body (response.body());
553
554                 try {
555                         jsonResponse = JsonDecode(body);
556                 } catch (...) {
557                         Log(LogWarning, "InfluxdbWriter")
558                                 << "Unable to parse JSON response:\n" << body;
559                         return;
560                 }
561
562                 String error = jsonResponse->Get("error");
563
564                 Log(LogCritical, "InfluxdbWriter")
565                         << "InfluxDB error message:\n" << error;
566         }
567 }
568
569 void InfluxdbWriter::ValidateHostTemplate(const Lazy<Dictionary::Ptr>& lvalue, const ValidationUtils& utils)
570 {
571         ObjectImpl<InfluxdbWriter>::ValidateHostTemplate(lvalue, utils);
572
573         String measurement = lvalue()->Get("measurement");
574         if (!MacroProcessor::ValidateMacroString(measurement))
575                 BOOST_THROW_EXCEPTION(ValidationError(this, { "host_template", "measurement" }, "Closing $ not found in macro format string '" + measurement + "'."));
576
577         Dictionary::Ptr tags = lvalue()->Get("tags");
578         if (tags) {
579                 ObjectLock olock(tags);
580                 for (const Dictionary::Pair& pair : tags) {
581                         if (!MacroProcessor::ValidateMacroString(pair.second))
582                                 BOOST_THROW_EXCEPTION(ValidationError(this, { "host_template", "tags", pair.first }, "Closing $ not found in macro format string '" + pair.second));
583                 }
584         }
585 }
586
587 void InfluxdbWriter::ValidateServiceTemplate(const Lazy<Dictionary::Ptr>& lvalue, const ValidationUtils& utils)
588 {
589         ObjectImpl<InfluxdbWriter>::ValidateServiceTemplate(lvalue, utils);
590
591         String measurement = lvalue()->Get("measurement");
592         if (!MacroProcessor::ValidateMacroString(measurement))
593                 BOOST_THROW_EXCEPTION(ValidationError(this, { "service_template", "measurement" }, "Closing $ not found in macro format string '" + measurement + "'."));
594
595         Dictionary::Ptr tags = lvalue()->Get("tags");
596         if (tags) {
597                 ObjectLock olock(tags);
598                 for (const Dictionary::Pair& pair : tags) {
599                         if (!MacroProcessor::ValidateMacroString(pair.second))
600                                 BOOST_THROW_EXCEPTION(ValidationError(this, { "service_template", "tags", pair.first }, "Closing $ not found in macro format string '" + pair.second));
601                 }
602         }
603 }
604