instance\_name | String | **Optional.** Unique identifier for the local Icinga 2 instance. Defaults to `default`.
instance\_description | String | **Optional.** Description for the Icinga 2 instance.
enable\_ha | Boolean | **Optional.** Enable the high availability functionality. Only valid in a [cluster setup](06-distributed-monitoring.md#distributed-monitoring-high-availability-db-ido). Defaults to `true`.
- failover\_timeout | Duration | **Optional.** Set the failover timeout in a [HA cluster](06-distributed-monitoring.md#distributed-monitoring-high-availability-db-ido). Must not be lower than 60s. Defaults to `60s`.
+ failover\_timeout | Duration | **Optional.** Set the failover timeout in a [HA cluster](06-distributed-monitoring.md#distributed-monitoring-high-availability-db-ido). Must not be lower than 30s. Defaults to `30s`.
cleanup | Dictionary | **Optional.** Dictionary with items for historical table cleanup.
categories | Array | **Optional.** Array of information types that should be written to the database.
In addition to the category flags listed above the `DbCatEverything`
flag may be used as a shortcut for listing all flags.
+Runtime Attributes:
+
+ Name | Type | Description
+ ----------------------------|-----------------------|-----------------
+ last\_failover | Timestamp | When the last failover happened for this connection (only available with `enable_ha = true`).
+
## IdoPgsqlConnection <a id="objecttype-idopgsqlconnection"></a>
IDO database adapter for PostgreSQL.
instance\_name | String | **Optional.** Unique identifier for the local Icinga 2 instance. Defaults to `default`.
instance\_description | String | **Optional.** Description for the Icinga 2 instance.
enable\_ha | Boolean | **Optional.** Enable the high availability functionality. Only valid in a [cluster setup](06-distributed-monitoring.md#distributed-monitoring-high-availability-db-ido). Defaults to `true`.
- failover\_timeout | Duration | **Optional.** Set the failover timeout in a [HA cluster](06-distributed-monitoring.md#distributed-monitoring-high-availability-db-ido). Must not be lower than 60s. Defaults to `60s`.
+ failover\_timeout | Duration | **Optional.** Set the failover timeout in a [HA cluster](06-distributed-monitoring.md#distributed-monitoring-high-availability-db-ido). Must not be lower than 30s. Defaults to `30s`.
cleanup | Dictionary | **Optional.** Dictionary with items for historical table cleanup.
categories | Array | **Optional.** Array of information types that should be written to the database.
In addition to the category flags listed above the `DbCatEverything`
flag may be used as a shortcut for listing all flags.
+Runtime Attributes:
+
+ Name | Type | Description
+ ----------------------------|-----------------------|-----------------
+ last\_failover | Timestamp | When the last failover happened for this connection (only available with `enable_ha = true`).
+
## InfluxdbWriter <a id="objecttype-influxdbwriter"></a>
Writes check result metrics and performance data to a defined InfluxDB host.
enum WorkQueuePriority /* Priority levels passed as the second argument of WorkQueue::Enqueue(). NOTE(review): presumably larger values are dequeued first - confirm against the WorkQueue task ordering. */
{
- PriorityLow,
- PriorityNormal,
- PriorityHigh
+ PriorityLow = 0, /* Used for the Disconnect task queued by the IDO connections' Pause(). */
+ PriorityNormal = 1, /* Used for regular IDO work: transactions, session cleanup, object (de)activation. */
+ PriorityHigh = 2,
+ PriorityImmediate = 4 /* Used for Reconnect tasks; note that the value 3 is left unassigned. */
};
using TaskFunction = std::function<void ()>;
{
ObjectImpl<DbConnection>::ValidateFailoverTimeout(lvalue, utils);
- if (lvalue() < 60)
- BOOST_THROW_EXCEPTION(ValidationError(this, { "failover_timeout" }, "Failover timeout minimum is 60s."));
+ if (lvalue() < 30)
+ BOOST_THROW_EXCEPTION(ValidationError(this, { "failover_timeout" }, "Failover timeout minimum is 30s."));
}
void DbConnection::ValidateCategories(const Lazy<Array::Ptr>& lvalue, const ValidationUtils& utils)
};
[config] double failover_timeout {
- default {{{ return 60; }}}
+ default {{{ return 30; }}}
};
+ [state, no_user_modify] double last_failover;
+
[no_user_modify] String schema_version;
[no_user_modify] bool connected;
[no_user_modify] bool should_connect {
void IdoMysqlConnection::Resume()
{
- DbConnection::Resume();
-
Log(LogInformation, "IdoMysqlConnection")
<< "'" << GetName() << "' resumed.";
m_QueryQueue.SetExceptionCallback(std::bind(&IdoMysqlConnection::ExceptionHandler, this, _1));
+ /* Immediately try to connect on Resume() without timer. */
+ m_QueryQueue.Enqueue(std::bind(&IdoMysqlConnection::Reconnect, this), PriorityImmediate);
+
m_TxTimer = new Timer();
m_TxTimer->SetInterval(1);
m_TxTimer->OnTimerExpired.connect(std::bind(&IdoMysqlConnection::TxTimerHandler, this));
m_ReconnectTimer->SetInterval(10);
m_ReconnectTimer->OnTimerExpired.connect(std::bind(&IdoMysqlConnection::ReconnectTimerHandler, this));
m_ReconnectTimer->Start();
- m_ReconnectTimer->Reschedule(0);
+
+ /* Start with queries after connect. */
+ DbConnection::Resume();
ASSERT(m_Mysql->thread_safe());
}
void IdoMysqlConnection::Pause()
{
- m_ReconnectTimer.reset();
+ Log(LogDebug, "IdoMysqlConnection")
+ << "Attempting to pause '" << GetName() << "'.";
DbConnection::Pause();
+ m_ReconnectTimer.reset();
+
#ifdef I2_DEBUG /* I2_DEBUG */
Log(LogDebug, "IdoMysqlConnection")
<< "Rescheduling disconnect task.";
#endif /* I2_DEBUG */
m_QueryQueue.Enqueue(std::bind(&IdoMysqlConnection::Disconnect, this), PriorityLow);
+
+ /* Work on remaining tasks but never delete the threads, for HA resuming later. */
m_QueryQueue.Join();
Log(LogInformation, "IdoMysqlConnection")
m_Mysql->close(&m_Connection);
SetConnected(false);
+
+ Log(LogInformation, "IdoMysqlConnection")
+ << "Disconnected from '" << GetName() << "' database '" << GetDatabase() << "'.";
}
void IdoMysqlConnection::TxTimerHandler()
<< "Scheduling new transaction and finishing async queries.";
#endif /* I2_DEBUG */
- m_QueryQueue.Enqueue(std::bind(&IdoMysqlConnection::InternalNewTransaction, this), PriorityHigh);
- m_QueryQueue.Enqueue(std::bind(&IdoMysqlConnection::FinishAsyncQueries, this), PriorityHigh);
+ m_QueryQueue.Enqueue(std::bind(&IdoMysqlConnection::InternalNewTransaction, this), PriorityNormal);
+ m_QueryQueue.Enqueue(std::bind(&IdoMysqlConnection::FinishAsyncQueries, this), PriorityNormal);
}
void IdoMysqlConnection::InternalNewTransaction()
<< "Scheduling reconnect task.";
#endif /* I2_DEBUG */
- m_QueryQueue.Enqueue(std::bind(&IdoMysqlConnection::Reconnect, this), PriorityHigh);
+ /* Only allow Reconnect events with high priority. */
+ m_QueryQueue.Enqueue(std::bind(&IdoMysqlConnection::Reconnect, this), PriorityImmediate);
}
void IdoMysqlConnection::Reconnect()
bool reconnect = false;
+ /* Ensure to close old connections first. */
if (GetConnected()) {
/* Check if we're really still connected */
if (m_Mysql->ping(&m_Connection) == 0)
reconnect = true;
}
+ Log(LogDebug, "IdoMysqlConnection")
+ << "Reconnect: Clearing ID cache.";
+
ClearIDCache();
String ihost, isocket_path, iuser, ipasswd, idb;
BOOST_THROW_EXCEPTION(std::runtime_error(m_Mysql->error(&m_Connection)));
}
+ Log(LogNotice, "IdoMysqlConnection")
+ << "Reconnect: '" << GetName() << "' is now connected to database '" << GetDatabase() << "'.";
+
SetConnected(true);
IdoMysqlResult result = Query("SELECT @@global.max_allowed_packet AS max_allowed_packet");
else
status_update_time = 0;
- double status_update_age = Utility::GetTime() - status_update_time;
+ double now = Utility::GetTime();
+
+ double status_update_age = now - status_update_time;
+ double failoverTimeout = GetFailoverTimeout();
- Log(LogNotice, "IdoMysqlConnection")
- << "Last update by '" << endpoint_name << "' was " << status_update_age << "s ago.";
+ if (status_update_age < failoverTimeout) {
+ Log(LogInformation, "IdoMysqlConnection")
+ << "Last update by endpoint '" << endpoint_name << "' was "
+ << status_update_age << "s ago (< failover timeout of " << failoverTimeout << "s). Retrying.";
- if (status_update_age < GetFailoverTimeout()) {
m_Mysql->close(&m_Connection);
SetConnected(false);
SetShouldConnect(false);
return;
}
+
+ SetLastFailover(now);
+
+ Log(LogInformation, "IdoMysqlConnection")
+ << "Last update by endpoint '" << endpoint_name << "' was "
+ << status_update_age << "s ago. Taking over '" << GetName() << "' in HA zone '" << Zone::GetLocalZone()->GetName() << "'.";
}
- Log(LogNotice, "IdoMysqlConnection", "Enabling IDO connection.");
+ Log(LogNotice, "IdoMysqlConnection", "Enabling IDO connection in HA zone.");
}
Log(LogInformation, "IdoMysqlConnection")
<< "Scheduling session table clear and finish connect task.";
#endif /* I2_DEBUG */
- m_QueryQueue.Enqueue(std::bind(&IdoMysqlConnection::ClearTablesBySession, this), PriorityHigh);
+ m_QueryQueue.Enqueue(std::bind(&IdoMysqlConnection::ClearTablesBySession, this), PriorityNormal);
- m_QueryQueue.Enqueue(std::bind(&IdoMysqlConnection::FinishConnect, this, startTime), PriorityHigh);
+ m_QueryQueue.Enqueue(std::bind(&IdoMysqlConnection::FinishConnect, this, startTime), PriorityNormal);
}
void IdoMysqlConnection::FinishConnect(double startTime)
FinishAsyncQueries();
Log(LogInformation, "IdoMysqlConnection")
- << "Finished reconnecting to MySQL IDO database in " << std::setw(2) << Utility::GetTime() - startTime << " second(s).";
+ << "Finished reconnecting to '" << GetName() << "' database '" << GetDatabase() << "' in "
+ << std::setw(2) << Utility::GetTime() - startTime << " second(s).";
Query("COMMIT");
Query("BEGIN");
<< "Scheduling object activation task for '" << dbobj->GetName1() << "!" << dbobj->GetName2() << "'.";
#endif /* I2_DEBUG */
- m_QueryQueue.Enqueue(std::bind(&IdoMysqlConnection::InternalActivateObject, this, dbobj), PriorityHigh);
+ m_QueryQueue.Enqueue(std::bind(&IdoMysqlConnection::InternalActivateObject, this, dbobj), PriorityNormal);
}
void IdoMysqlConnection::InternalActivateObject(const DbObject::Ptr& dbobj)
<< "Scheduling object deactivation task for '" << dbobj->GetName1() << "!" << dbobj->GetName2() << "'.";
#endif /* I2_DEBUG */
- m_QueryQueue.Enqueue(std::bind(&IdoMysqlConnection::InternalDeactivateObject, this, dbobj), PriorityHigh);
+ m_QueryQueue.Enqueue(std::bind(&IdoMysqlConnection::InternalDeactivateObject, this, dbobj), PriorityNormal);
}
void IdoMysqlConnection::InternalDeactivateObject(const DbObject::Ptr& dbobj)
void IdoPgsqlConnection::Resume()
{
- DbConnection::Resume();
-
Log(LogInformation, "IdoPgsqlConnection")
<< "'" << GetName() << "' resumed.";
m_QueryQueue.SetExceptionCallback(std::bind(&IdoPgsqlConnection::ExceptionHandler, this, _1));
+ /* Immediately try to connect on Resume() without timer. */
+ m_QueryQueue.Enqueue(std::bind(&IdoPgsqlConnection::Reconnect, this), PriorityImmediate);
+
m_TxTimer = new Timer();
m_TxTimer->SetInterval(1);
m_TxTimer->OnTimerExpired.connect(std::bind(&IdoPgsqlConnection::TxTimerHandler, this));
m_ReconnectTimer->SetInterval(10);
m_ReconnectTimer->OnTimerExpired.connect(std::bind(&IdoPgsqlConnection::ReconnectTimerHandler, this));
m_ReconnectTimer->Start();
- m_ReconnectTimer->Reschedule(0);
+
+ /* Start with queries after connect. */
+ DbConnection::Resume();
ASSERT(m_Pgsql->isthreadsafe());
}
void IdoPgsqlConnection::Pause() /* Pauses this connection: runs the base-class pause, drops the reconnect timer, queues a disconnect and joins the query queue. */
{
- m_ReconnectTimer.reset();
-
DbConnection::Pause(); /* Base-class pause now runs before the reconnect timer is released. */
+ m_ReconnectTimer.reset(); /* Release the reconnect timer only after the base class has paused. */
+
m_QueryQueue.Enqueue(std::bind(&IdoPgsqlConnection::Disconnect, this), PriorityLow); /* Low priority: NOTE(review) presumably so already queued work runs before the disconnect - confirm against WorkQueue ordering. */
+
+ /* Work on remaining tasks but never delete the threads, for HA resuming later. */
m_QueryQueue.Join();
Log(LogInformation, "IdoPgsqlConnection")
<< "'" << GetName() << "' paused.";
-
}
void IdoPgsqlConnection::ExceptionHandler(boost::exception_ptr exp)
m_Pgsql->finish(m_Connection);
SetConnected(false);
+
+ Log(LogInformation, "IdoPgsqlConnection")
+ << "Disconnected from '" << GetName() << "' database '" << GetDatabase() << "'.";
}
void IdoPgsqlConnection::TxTimerHandler()
if (IsPaused())
return;
- m_QueryQueue.Enqueue(std::bind(&IdoPgsqlConnection::InternalNewTransaction, this), PriorityHigh, true);
+ m_QueryQueue.Enqueue(std::bind(&IdoPgsqlConnection::InternalNewTransaction, this), PriorityNormal, true);
}
void IdoPgsqlConnection::InternalNewTransaction()
void IdoPgsqlConnection::ReconnectTimerHandler()
{
+ /* Only allow Reconnect events with high priority. */
m_QueryQueue.Enqueue(std::bind(&IdoPgsqlConnection::Reconnect, this), PriorityHigh);
}
else
status_update_time = 0;
- double status_update_age = Utility::GetTime() - status_update_time;
+ double now = Utility::GetTime();
- Log(LogNotice, "IdoPgsqlConnection")
- << "Last update by '" << endpoint_name << "' was " << status_update_age << "s ago.";
+ double status_update_age = now - status_update_time;
+ double failoverTimeout = GetFailoverTimeout();
if (status_update_age < GetFailoverTimeout()) {
+ Log(LogInformation, "IdoPgsqlConnection")
+ << "Last update by endpoint '" << endpoint_name << "' was "
+ << status_update_age << "s ago (< failover timeout of " << failoverTimeout << "s). Retrying.";
+
m_Pgsql->finish(m_Connection);
SetConnected(false);
SetShouldConnect(false);
return;
}
+
+ SetLastFailover(now);
+
+ Log(LogInformation, "IdoPgsqlConnection")
+ << "Last update by endpoint '" << endpoint_name << "' was "
+ << status_update_age << "s ago. Taking over '" << GetName() << "' in HA zone '" << Zone::GetLocalZone()->GetName() << "'.";
}
Log(LogNotice, "IdoPgsqlConnection", "Enabling IDO connection.");
UpdateAllObjects();
- m_QueryQueue.Enqueue(std::bind(&IdoPgsqlConnection::ClearTablesBySession, this), PriorityHigh);
+ m_QueryQueue.Enqueue(std::bind(&IdoPgsqlConnection::ClearTablesBySession, this), PriorityNormal);
- m_QueryQueue.Enqueue(std::bind(&IdoPgsqlConnection::FinishConnect, this, startTime), PriorityHigh);
+ m_QueryQueue.Enqueue(std::bind(&IdoPgsqlConnection::FinishConnect, this, startTime), PriorityNormal);
}
void IdoPgsqlConnection::FinishConnect(double startTime)
return;
Log(LogInformation, "IdoPgsqlConnection")
- << "Finished reconnecting to PostgreSQL IDO database in " << std::setw(2) << Utility::GetTime() - startTime << " second(s).";
+ << "Finished reconnecting to '" << GetName() << "' database '" << GetDatabase() << "' in "
+ << std::setw(2) << Utility::GetTime() - startTime << " second(s).";
Query("COMMIT");
Query("BEGIN");
if (IsPaused())
return;
- m_QueryQueue.Enqueue(std::bind(&IdoPgsqlConnection::InternalActivateObject, this, dbobj), PriorityHigh);
+ m_QueryQueue.Enqueue(std::bind(&IdoPgsqlConnection::InternalActivateObject, this, dbobj), PriorityNormal);
}
void IdoPgsqlConnection::InternalActivateObject(const DbObject::Ptr& dbobj)
if (IsPaused())
return;
- m_QueryQueue.Enqueue(std::bind(&IdoPgsqlConnection::InternalDeactivateObject, this, dbobj), PriorityHigh);
+ m_QueryQueue.Enqueue(std::bind(&IdoPgsqlConnection::InternalDeactivateObject, this, dbobj), PriorityNormal);
}
void IdoPgsqlConnection::InternalDeactivateObject(const DbObject::Ptr& dbobj)
m_ReconnectTimer->Start();
m_ReconnectTimer->Reschedule(0);
+ /* Keep this in relative sync with the cold startup in UpdateObjectAuthority() and the reconnect interval above.
+ * Previous: 60s reconnect, 30s OA, 60s cold startup.
+ * Now: 10s reconnect, 10s OA, 30s cold startup.
+ */
m_AuthorityTimer = new Timer();
m_AuthorityTimer->OnTimerExpired.connect(std::bind(&ApiListener::UpdateObjectAuthority));
- m_AuthorityTimer->SetInterval(30);
+ m_AuthorityTimer->SetInterval(10);
m_AuthorityTimer->Start();
m_CleanupCertificateRequestsTimer = new Timer();
#include "remote/apilistener.hpp"
#include "base/configtype.hpp"
#include "base/utility.hpp"
+#include "base/convert.hpp"
using namespace icinga;
void ApiListener::UpdateObjectAuthority()
{
+ ApiListener::Ptr instance = ApiListener::GetInstance();
+
+ if (!instance)
+ return;
+
+ Log(LogNotice, "ApiListener")
+ << "Updating object authority for objects at endpoint '" << instance->GetIdentity() << "'.";
+
Zone::Ptr my_zone = Zone::GetLocalZone();
std::vector<Endpoint::Ptr> endpoints;
double mainTime = Application::GetMainTime();
- if (num_total > 1 && endpoints.size() <= 1 && (mainTime == 0 || Utility::GetTime() - mainTime < 60))
+ /* 30 seconds cold startup, don't update any authority to give the secondary endpoint time to reconnect. */
+ if (num_total > 1 && endpoints.size() <= 1 && (mainTime == 0 || Utility::GetTime() - mainTime < 30))
return;
std::sort(endpoints.begin(), endpoints.end(),
else
authority = endpoints[Utility::SDBM(object->GetName()) % endpoints.size()] == my_endpoint;
+#ifdef I2_DEBUG
+// //Enable on demand, causes heavy logging on each run.
+// Log(LogDebug, "ApiListener")
+// << "Setting authority '" << Convert::ToString(authority) << "' for object '" << object->GetName() << "' of type '" << object->GetReflectionType()->GetName() << "'.";
+#endif /* I2_DEBUG */
+
object->SetAuthority(authority);
}
}