]> granicus.if.org Git - icinga2/blob - lib/perfdata/influxdbwriter.cpp
Merge pull request #7124 from Icinga/bugfix/namespace-thread-safe
[icinga2] / lib / perfdata / influxdbwriter.cpp
1 /* Icinga 2 | (c) 2012 Icinga GmbH | GPLv2+ */
2
3 #include "perfdata/influxdbwriter.hpp"
4 #include "perfdata/influxdbwriter-ti.cpp"
5 #include "remote/url.hpp"
6 #include "remote/httprequest.hpp"
7 #include "remote/httpresponse.hpp"
8 #include "icinga/service.hpp"
9 #include "icinga/macroprocessor.hpp"
10 #include "icinga/icingaapplication.hpp"
11 #include "icinga/checkcommand.hpp"
12 #include "base/application.hpp"
13 #include "base/defer.hpp"
14 #include "base/io-engine.hpp"
15 #include "base/tcpsocket.hpp"
16 #include "base/configtype.hpp"
17 #include "base/objectlock.hpp"
18 #include "base/logger.hpp"
19 #include "base/convert.hpp"
20 #include "base/utility.hpp"
21 #include "base/perfdatavalue.hpp"
22 #include "base/stream.hpp"
23 #include "base/json.hpp"
24 #include "base/networkstream.hpp"
25 #include "base/exception.hpp"
26 #include "base/statsfunction.hpp"
27 #include "base/tlsutility.hpp"
28 #include <boost/algorithm/string.hpp>
29 #include <boost/algorithm/string/replace.hpp>
30 #include <boost/asio/ssl/context.hpp>
31 #include <boost/beast/core/flat_buffer.hpp>
32 #include <boost/beast/http/field.hpp>
33 #include <boost/beast/http/message.hpp>
34 #include <boost/beast/http/parser.hpp>
35 #include <boost/beast/http/read.hpp>
36 #include <boost/beast/http/status.hpp>
37 #include <boost/beast/http/string_body.hpp>
38 #include <boost/beast/http/verb.hpp>
39 #include <boost/beast/http/write.hpp>
40 #include <boost/math/special_functions/fpclassify.hpp>
41 #include <boost/regex.hpp>
42 #include <boost/scoped_array.hpp>
43 #include <memory>
44 #include <string>
45 #include <utility>
46
47 using namespace icinga;
48
49 class InfluxdbInteger final : public Object
50 {
51 public:
52         DECLARE_PTR_TYPEDEFS(InfluxdbInteger);
53
54         InfluxdbInteger(int value)
55                 : m_Value(value)
56         { }
57
58         int GetValue() const
59         {
60                 return m_Value;
61         }
62
63 private:
64         int m_Value;
65 };
66
/* NOTE(review): macro defined outside this file; presumably makes the InfluxdbWriter type known to the type registry — confirm. */
67 REGISTER_TYPE(InfluxdbWriter);
68
/* NOTE(review): presumably exposes InfluxdbWriter::StatsFunc to the stats collection — confirm against the macro definition. */
69 REGISTER_STATSFUNCTION(InfluxdbWriter, &InfluxdbWriter::StatsFunc);
70
71 void InfluxdbWriter::OnConfigLoaded()
72 {
73         ObjectImpl<InfluxdbWriter>::OnConfigLoaded();
74
75         m_WorkQueue.SetName("InfluxdbWriter, " + GetName());
76
77         if (!GetEnableHa()) {
78                 Log(LogDebug, "InfluxdbWriter")
79                         << "HA functionality disabled. Won't pause connection: " << GetName();
80
81                 SetHAMode(HARunEverywhere);
82         } else {
83                 SetHAMode(HARunOnce);
84         }
85 }
86
87 void InfluxdbWriter::StatsFunc(const Dictionary::Ptr& status, const Array::Ptr& perfdata)
88 {
89         DictionaryData nodes;
90
91         for (const InfluxdbWriter::Ptr& influxdbwriter : ConfigType::GetObjectsByType<InfluxdbWriter>()) {
92                 size_t workQueueItems = influxdbwriter->m_WorkQueue.GetLength();
93                 double workQueueItemRate = influxdbwriter->m_WorkQueue.GetTaskCount(60) / 60.0;
94                 size_t dataBufferItems = influxdbwriter->m_DataBuffer.size();
95
96                 nodes.emplace_back(influxdbwriter->GetName(), new Dictionary({
97                         { "work_queue_items", workQueueItems },
98                         { "work_queue_item_rate", workQueueItemRate },
99                         { "data_buffer_items", dataBufferItems }
100                 }));
101
102                 perfdata->Add(new PerfdataValue("influxdbwriter_" + influxdbwriter->GetName() + "_work_queue_items", workQueueItems));
103                 perfdata->Add(new PerfdataValue("influxdbwriter_" + influxdbwriter->GetName() + "_work_queue_item_rate", workQueueItemRate));
104                 perfdata->Add(new PerfdataValue("influxdbwriter_" + influxdbwriter->GetName() + "_data_queue_items", dataBufferItems));
105         }
106
107         status->Set("influxdbwriter", new Dictionary(std::move(nodes)));
108 }
109
110 void InfluxdbWriter::Resume()
111 {
112         ObjectImpl<InfluxdbWriter>::Resume();
113
114         Log(LogInformation, "InfluxdbWriter")
115                 << "'" << GetName() << "' resumed.";
116
117         /* Register exception handler for WQ tasks. */
118         m_WorkQueue.SetExceptionCallback(std::bind(&InfluxdbWriter::ExceptionHandler, this, _1));
119
120         /* Setup timer for periodically flushing m_DataBuffer */
121         m_FlushTimer = new Timer();
122         m_FlushTimer->SetInterval(GetFlushInterval());
123         m_FlushTimer->OnTimerExpired.connect(std::bind(&InfluxdbWriter::FlushTimeout, this));
124         m_FlushTimer->Start();
125         m_FlushTimer->Reschedule(0);
126
127         /* Register for new metrics. */
128         Checkable::OnNewCheckResult.connect(std::bind(&InfluxdbWriter::CheckResultHandler, this, _1, _2));
129 }
130
131 /* Pause is equivalent to Stop, but with HA capabilities to resume at runtime. */
132 void InfluxdbWriter::Pause()
133 {
134         /* Force a flush. */
135         Log(LogDebug, "InfluxdbWriter")
136                 << "Flushing pending data buffers.";
137
138         Flush();
139
140         /* Work on the missing tasks. TODO: Find a way to cache them on disk. */
141         Log(LogDebug, "InfluxdbWriter")
142                 << "Joining existing WQ tasks.";
143
144         m_WorkQueue.Join();
145
146         /* Flush again after the WQ tasks have filled the data buffer. */
147         Log(LogDebug, "InfluxdbWriter")
148                 << "Flushing data buffers from WQ tasks.";
149
150         Flush();
151
152         Log(LogInformation, "InfluxdbWriter")
153                 << "'" << GetName() << "' paused.";
154
155         ObjectImpl<InfluxdbWriter>::Pause();
156 }
157
/* Asserts that the calling thread is one of m_WorkQueue's worker threads. */
158 void InfluxdbWriter::AssertOnWorkQueue()
159 {
160                 ASSERT(m_WorkQueue.IsWorkerThread());
161 }
162
163 void InfluxdbWriter::ExceptionHandler(boost::exception_ptr exp)
164 {
165         Log(LogCritical, "InfluxdbWriter", "Exception during InfluxDB operation: Verify that your backend is operational!");
166
167         Log(LogDebug, "InfluxdbWriter")
168                 << "Exception during InfluxDB operation: " << DiagnosticInformation(std::move(exp));
169
170         //TODO: Close the connection, if we keep it open.
171 }
172
173 OptionalTlsStream InfluxdbWriter::Connect()
174 {
175         Log(LogNotice, "InfluxdbWriter")
176                 << "Reconnecting to InfluxDB on host '" << GetHost() << "' port '" << GetPort() << "'.";
177
178         OptionalTlsStream stream;
179         bool ssl = GetSslEnable();
180
181         if (ssl) {
182                 std::shared_ptr<boost::asio::ssl::context> sslContext;
183
184                 try {
185                         sslContext = MakeAsioSslContext(GetSslCert(), GetSslKey(), GetSslCaCert());
186                 } catch (const std::exception& ex) {
187                         Log(LogWarning, "InfluxdbWriter")
188                                 << "Unable to create SSL context.";
189                         throw;
190                 }
191
192                 stream.first = std::make_shared<AsioTlsStream>(IoEngine::Get().GetIoService(), *sslContext, GetHost());
193         } else {
194                 stream.second = std::make_shared<AsioTcpStream>(IoEngine::Get().GetIoService());
195         }
196
197         try {
198                 icinga::Connect(ssl ? stream.first->lowest_layer() : stream.second->lowest_layer(), GetHost(), GetPort());
199         } catch (const std::exception& ex) {
200                 Log(LogWarning, "InfluxdbWriter")
201                         << "Can't connect to InfluxDB on host '" << GetHost() << "' port '" << GetPort() << "'.";
202                 throw;
203         }
204
205         if (ssl) {
206                 auto& tlsStream (stream.first->next_layer());
207
208                 try {
209                         tlsStream.handshake(tlsStream.client);
210                 } catch (const std::exception& ex) {
211                         Log(LogWarning, "InfluxdbWriter")
212                                 << "TLS handshake with host '" << GetHost() << "' failed.";
213                         throw;
214                 }
215         }
216
217         return std::move(stream);
218 }
219
220 void InfluxdbWriter::CheckResultHandler(const Checkable::Ptr& checkable, const CheckResult::Ptr& cr)
221 {
222         if (IsPaused())
223                 return;
224
225         m_WorkQueue.Enqueue(std::bind(&InfluxdbWriter::CheckResultHandlerWQ, this, checkable, cr), PriorityLow);
226 }
227
228 void InfluxdbWriter::CheckResultHandlerWQ(const Checkable::Ptr& checkable, const CheckResult::Ptr& cr)
229 {
230         AssertOnWorkQueue();
231
232         CONTEXT("Processing check result for '" + checkable->GetName() + "'");
233
234         if (!IcingaApplication::GetInstance()->GetEnablePerfdata() || !checkable->GetEnablePerfdata())
235                 return;
236
237         Host::Ptr host;
238         Service::Ptr service;
239         tie(host, service) = GetHostService(checkable);
240
241         MacroProcessor::ResolverList resolvers;
242         if (service)
243                 resolvers.emplace_back("service", service);
244         resolvers.emplace_back("host", host);
245         resolvers.emplace_back("icinga", IcingaApplication::GetInstance());
246
247         String prefix;
248
249         double ts = cr->GetExecutionEnd();
250
251         // Clone the template and perform an in-place macro expansion of measurement and tag values
252         Dictionary::Ptr tmpl_clean = service ? GetServiceTemplate() : GetHostTemplate();
253         Dictionary::Ptr tmpl = static_pointer_cast<Dictionary>(tmpl_clean->Clone());
254         tmpl->Set("measurement", MacroProcessor::ResolveMacros(tmpl->Get("measurement"), resolvers, cr));
255
256         Dictionary::Ptr tags = tmpl->Get("tags");
257         if (tags) {
258                 ObjectLock olock(tags);
259                 for (const Dictionary::Pair& pair : tags) {
260                         String missing_macro;
261                         Value value = MacroProcessor::ResolveMacros(pair.second, resolvers, cr, &missing_macro);
262
263                         if (!missing_macro.IsEmpty())
264                                 continue;
265
266                         tags->Set(pair.first, value);
267                 }
268         }
269
270         CheckCommand::Ptr checkCommand = checkable->GetCheckCommand();
271
272         Array::Ptr perfdata = cr->GetPerformanceData();
273
274         if (perfdata) {
275                 ObjectLock olock(perfdata);
276                 for (const Value& val : perfdata) {
277                         PerfdataValue::Ptr pdv;
278
279                         if (val.IsObjectType<PerfdataValue>())
280                                 pdv = val;
281                         else {
282                                 try {
283                                         pdv = PerfdataValue::Parse(val);
284                                 } catch (const std::exception&) {
285                                         Log(LogWarning, "InfluxdbWriter")
286                                                 << "Ignoring invalid perfdata for checkable '"
287                                                 << checkable->GetName() << "' and command '"
288                                                 << checkCommand->GetName() << "' with value: " << val;
289                                         continue;
290                                 }
291                         }
292
293                         Dictionary::Ptr fields = new Dictionary();
294                         fields->Set("value", pdv->GetValue());
295
296                         if (GetEnableSendThresholds()) {
297                                 if (pdv->GetCrit())
298                                         fields->Set("crit", pdv->GetCrit());
299                                 if (pdv->GetWarn())
300                                         fields->Set("warn", pdv->GetWarn());
301                                 if (pdv->GetMin())
302                                         fields->Set("min", pdv->GetMin());
303                                 if (pdv->GetMax())
304                                         fields->Set("max", pdv->GetMax());
305                         }
306                         if (!pdv->GetUnit().IsEmpty()) {
307                                 fields->Set("unit", pdv->GetUnit());
308                         }
309
310                         SendMetric(checkable, tmpl, pdv->GetLabel(), fields, ts);
311                 }
312         }
313
314         if (GetEnableSendMetadata()) {
315                 Host::Ptr host;
316                 Service::Ptr service;
317                 tie(host, service) = GetHostService(checkable);
318
319                 Dictionary::Ptr fields = new Dictionary();
320
321                 if (service)
322                         fields->Set("state", new InfluxdbInteger(service->GetState()));
323                 else
324                         fields->Set("state", new InfluxdbInteger(host->GetState()));
325
326                 fields->Set("current_attempt", new InfluxdbInteger(checkable->GetCheckAttempt()));
327                 fields->Set("max_check_attempts", new InfluxdbInteger(checkable->GetMaxCheckAttempts()));
328                 fields->Set("state_type", new InfluxdbInteger(checkable->GetStateType()));
329                 fields->Set("reachable", checkable->IsReachable());
330                 fields->Set("downtime_depth", new InfluxdbInteger(checkable->GetDowntimeDepth()));
331                 fields->Set("acknowledgement", new InfluxdbInteger(checkable->GetAcknowledgement()));
332                 fields->Set("latency", cr->CalculateLatency());
333                 fields->Set("execution_time", cr->CalculateExecutionTime());
334
335                 SendMetric(checkable, tmpl, Empty, fields, ts);
336         }
337 }
338
339 String InfluxdbWriter::EscapeKeyOrTagValue(const String& str)
340 {
341         // Iterate over the key name and escape commas and spaces with a backslash
342         String result = str;
343         boost::algorithm::replace_all(result, "\"", "\\\"");
344         boost::algorithm::replace_all(result, "=", "\\=");
345         boost::algorithm::replace_all(result, ",", "\\,");
346         boost::algorithm::replace_all(result, " ", "\\ ");
347
348         // InfluxDB 'feature': although backslashes are allowed in keys they also act
349         // as escape sequences when followed by ',' or ' '.  When your tag is like
350         // 'metric=C:\' bad things happen.  Backslashes themselves cannot be escaped
351         // and through experimentation they also escape '='.  To be safe we replace
352         // trailing backslashes with and underscore.
353         // See https://github.com/influxdata/influxdb/issues/8587 for more info
354         size_t length = result.GetLength();
355         if (result[length - 1] == '\\')
356                 result[length - 1] = '_';
357
358         return result;
359 }
360
361 String InfluxdbWriter::EscapeValue(const Value& value)
362 {
363         if (value.IsObjectType<InfluxdbInteger>()) {
364                 std::ostringstream os;
365                 os << static_cast<InfluxdbInteger::Ptr>(value)->GetValue() << "i";
366                 return os.str();
367         }
368
369         if (value.IsBoolean())
370                 return value ? "true" : "false";
371
372         if (value.IsString())
373                 return "\"" + EscapeKeyOrTagValue(value) + "\"";
374
375         return value;
376 }
377
378 void InfluxdbWriter::SendMetric(const Checkable::Ptr& checkable, const Dictionary::Ptr& tmpl,
379         const String& label, const Dictionary::Ptr& fields, double ts)
380 {
381         std::ostringstream msgbuf;
382         msgbuf << EscapeKeyOrTagValue(tmpl->Get("measurement"));
383
384         Dictionary::Ptr tags = tmpl->Get("tags");
385         if (tags) {
386                 ObjectLock olock(tags);
387                 for (const Dictionary::Pair& pair : tags) {
388                         // Empty macro expansion, no tag
389                         if (!pair.second.IsEmpty()) {
390                                 msgbuf << "," << EscapeKeyOrTagValue(pair.first) << "=" << EscapeKeyOrTagValue(pair.second);
391                         }
392                 }
393         }
394
395         // Label may be empty in the case of metadata
396         if (!label.IsEmpty())
397                 msgbuf << ",metric=" << EscapeKeyOrTagValue(label);
398
399         msgbuf << " ";
400
401         {
402                 bool first = true;
403
404                 ObjectLock fieldLock(fields);
405                 for (const Dictionary::Pair& pair : fields) {
406                         if (first)
407                                 first = false;
408                         else
409                                 msgbuf << ",";
410
411                         msgbuf << EscapeKeyOrTagValue(pair.first) << "=" << EscapeValue(pair.second);
412                 }
413         }
414
415         msgbuf << " " <<  static_cast<unsigned long>(ts);
416
417         Log(LogDebug, "InfluxdbWriter")
418                 << "Checkable '" << checkable->GetName() << "' adds to metric list:'" << msgbuf.str() << "'.";
419
420         // Buffer the data point
421         m_DataBuffer.emplace_back(msgbuf.str());
422
423         // Flush if we've buffered too much to prevent excessive memory use
424         if (static_cast<int>(m_DataBuffer.size()) >= GetFlushThreshold()) {
425                 Log(LogDebug, "InfluxdbWriter")
426                         << "Data buffer overflow writing " << m_DataBuffer.size() << " data points";
427
428                 try {
429                         Flush();
430                 } catch (...) {
431                         /* Do nothing. */
432                 }
433         }
434 }
435
436 void InfluxdbWriter::FlushTimeout()
437 {
438         m_WorkQueue.Enqueue(boost::bind(&InfluxdbWriter::FlushTimeoutWQ, this), PriorityHigh);
439 }
440
441 void InfluxdbWriter::FlushTimeoutWQ()
442 {
443         AssertOnWorkQueue();
444
445         Log(LogDebug, "InfluxdbWriter")
446                 << "Timer expired writing " << m_DataBuffer.size() << " data points";
447
448         Flush();
449 }
450
451 void InfluxdbWriter::Flush()
452 {
453         namespace beast = boost::beast;
454         namespace http = beast::http;
455
456         /* Flush can be called from 1) Timeout 2) Threshold 3) on shutdown/reload. */
457         if (m_DataBuffer.empty())
458                 return;
459
460         Log(LogDebug, "InfluxdbWriter")
461                 << "Flushing data buffer to InfluxDB.";
462
463         String body = boost::algorithm::join(m_DataBuffer, "\n");
464         m_DataBuffer.clear();
465
466         OptionalTlsStream stream;
467
468         try {
469                 stream = Connect();
470         } catch (const std::exception& ex) {
471                 Log(LogWarning, "InfluxDbWriter")
472                         << "Flush failed, cannot connect to InfluxDB: " << DiagnosticInformation(ex, false);
473                 return;
474         }
475
476         Defer s ([&stream]() {
477                 if (stream.first) {
478                         stream.first->next_layer().shutdown();
479                 }
480         });
481
482         Url::Ptr url = new Url();
483         url->SetScheme(GetSslEnable() ? "https" : "http");
484         url->SetHost(GetHost());
485         url->SetPort(GetPort());
486
487         std::vector<String> path;
488         path.emplace_back("write");
489         url->SetPath(path);
490
491         url->AddQueryElement("db", GetDatabase());
492         url->AddQueryElement("precision", "s");
493         if (!GetUsername().IsEmpty())
494                 url->AddQueryElement("u", GetUsername());
495         if (!GetPassword().IsEmpty())
496                 url->AddQueryElement("p", GetPassword());
497
498         http::request<http::string_body> request (http::verb::post, std::string(url->Format(true)), 10);
499
500         request.set(http::field::user_agent, "Icinga/" + Application::GetAppVersion());
501         request.set(http::field::host, url->GetHost() + ":" + url->GetPort());
502
503         request.body() = body;
504         request.set(http::field::content_length, request.body().size());
505
506         try {
507                 if (stream.first) {
508                         http::write(*stream.first, request);
509                         stream.first->flush();
510                 } else {
511                         http::write(*stream.second, request);
512                         stream.second->flush();
513                 }
514         } catch (const std::exception& ex) {
515                 Log(LogWarning, "InfluxdbWriter")
516                         << "Cannot write to TCP socket on host '" << GetHost() << "' port '" << GetPort() << "'.";
517                 throw;
518         }
519
520         http::parser<false, http::string_body> parser;
521         beast::flat_buffer buf;
522
523         try {
524                 if (stream.first) {
525                         http::read(*stream.first, buf, parser);
526                 } else {
527                         http::read(*stream.second, buf, parser);
528                 }
529         } catch (const std::exception& ex) {
530                 Log(LogWarning, "InfluxdbWriter")
531                         << "Failed to parse HTTP response from host '" << GetHost() << "' port '" << GetPort() << "': " << DiagnosticInformation(ex);
532                 throw;
533         }
534
535         auto& response (parser.get());
536
537         if (response.result() != http::status::no_content) {
538                 Log(LogWarning, "InfluxdbWriter")
539                         << "Unexpected response code: " << response.result();
540
541                 auto& contentType (response[http::field::content_type]);
542                 if (contentType != "application/json") {
543                         Log(LogWarning, "InfluxdbWriter")
544                                 << "Unexpected Content-Type: " << contentType;
545                         return;
546                 }
547
548                 Dictionary::Ptr jsonResponse;
549                 auto& body (response.body());
550
551                 try {
552                         jsonResponse = JsonDecode(body);
553                 } catch (...) {
554                         Log(LogWarning, "InfluxdbWriter")
555                                 << "Unable to parse JSON response:\n" << body;
556                         return;
557                 }
558
559                 String error = jsonResponse->Get("error");
560
561                 Log(LogCritical, "InfluxdbWriter")
562                         << "InfluxDB error message:\n" << error;
563         }
564 }
565
566 void InfluxdbWriter::ValidateHostTemplate(const Lazy<Dictionary::Ptr>& lvalue, const ValidationUtils& utils)
567 {
568         ObjectImpl<InfluxdbWriter>::ValidateHostTemplate(lvalue, utils);
569
570         String measurement = lvalue()->Get("measurement");
571         if (!MacroProcessor::ValidateMacroString(measurement))
572                 BOOST_THROW_EXCEPTION(ValidationError(this, { "host_template", "measurement" }, "Closing $ not found in macro format string '" + measurement + "'."));
573
574         Dictionary::Ptr tags = lvalue()->Get("tags");
575         if (tags) {
576                 ObjectLock olock(tags);
577                 for (const Dictionary::Pair& pair : tags) {
578                         if (!MacroProcessor::ValidateMacroString(pair.second))
579                                 BOOST_THROW_EXCEPTION(ValidationError(this, { "host_template", "tags", pair.first }, "Closing $ not found in macro format string '" + pair.second));
580                 }
581         }
582 }
583
584 void InfluxdbWriter::ValidateServiceTemplate(const Lazy<Dictionary::Ptr>& lvalue, const ValidationUtils& utils)
585 {
586         ObjectImpl<InfluxdbWriter>::ValidateServiceTemplate(lvalue, utils);
587
588         String measurement = lvalue()->Get("measurement");
589         if (!MacroProcessor::ValidateMacroString(measurement))
590                 BOOST_THROW_EXCEPTION(ValidationError(this, { "service_template", "measurement" }, "Closing $ not found in macro format string '" + measurement + "'."));
591
592         Dictionary::Ptr tags = lvalue()->Get("tags");
593         if (tags) {
594                 ObjectLock olock(tags);
595                 for (const Dictionary::Pair& pair : tags) {
596                         if (!MacroProcessor::ValidateMacroString(pair.second))
597                                 BOOST_THROW_EXCEPTION(ValidationError(this, { "service_template", "tags", pair.first }, "Closing $ not found in macro format string '" + pair.second));
598                 }
599         }
600 }
601