Merge "Mtce heartbeat cluster state change notification improvement"
This commit is contained in:
commit
7a3adb2cdc
|
@ -1,22 +1,13 @@
|
||||||
[Unit]
|
[Unit]
|
||||||
Description=StarlingX Maintenance Heartbeat Agent
|
Description=StarlingX Maintenance Heartbeat Agent
|
||||||
After=network.target syslog.service config.service
|
After=hbsClient.service
|
||||||
Before=pmon.service
|
Before=pmon.service
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=forking
|
Type=forking
|
||||||
ExecStart=/etc/rc.d/init.d/hbsAgent start
|
ExecStart=/etc/rc.d/init.d/hbsAgent start
|
||||||
ExecStop=/etc/rc.d/init.d/hbsAgent start
|
ExecStop=/etc/rc.d/init.d/hbsAgent stop
|
||||||
PIDFile=/var/run/hbsAgent.pid
|
PIDFile=/var/run/hbsAgent.pid
|
||||||
KillMode=process
|
|
||||||
SendSIGKILL=no
|
|
||||||
|
|
||||||
# Process recovery is handled by pmond if it's running.
|
|
||||||
# Delay 10 seconds to give pmond a chance to recover
|
|
||||||
# before systemd kicks in to do it as a backup plan.
|
|
||||||
Restart=always
|
|
||||||
RestartSec=10
|
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
WantedBy=multi-user.target
|
WantedBy=multi-user.target
|
||||||
|
|
||||||
|
|
|
@ -8628,7 +8628,7 @@ void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_p
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#define HBS_LOSS_REPORT_THROTTLE (100)
|
#define HBS_LOSS_REPORT_THROTTLE (100000)
|
||||||
int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
||||||
{
|
{
|
||||||
int lost = 0 ;
|
int lost = 0 ;
|
||||||
|
@ -8668,6 +8668,13 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
||||||
|
|
||||||
if ( pulse_ptr->b2b_misses_count[iface] > 1 )
|
if ( pulse_ptr->b2b_misses_count[iface] > 1 )
|
||||||
{
|
{
|
||||||
|
if ( pulse_ptr->b2b_misses_count[iface] < hbs_failure_threshold )
|
||||||
|
{
|
||||||
|
hbs_cluster_change ( pulse_ptr->hostname + " " +
|
||||||
|
get_iface_name_str(iface) +
|
||||||
|
" heartbeat miss " +
|
||||||
|
itos(pulse_ptr->b2b_misses_count[iface]));
|
||||||
|
}
|
||||||
if ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold )
|
if ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold )
|
||||||
{
|
{
|
||||||
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
|
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
|
||||||
|
@ -8774,57 +8781,43 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Turn the cluster-host heartbeat loss into a degrade only
|
|
||||||
* condition if the clstr_degrade_only flag is set */
|
|
||||||
if (( iface == CLSTR_IFACE ) &&
|
|
||||||
( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) &&
|
|
||||||
( clstr_degrade_only == true ))
|
|
||||||
{
|
|
||||||
/* Only print the log at the threshold boundary */
|
|
||||||
if (( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
|
|
||||||
{
|
|
||||||
if ( this->active_controller )
|
|
||||||
{
|
|
||||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
|
||||||
}
|
|
||||||
|
|
||||||
wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n",
|
|
||||||
pulse_ptr->hostname.c_str(),
|
|
||||||
get_iface_name_str(iface) );
|
|
||||||
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Turn the clstr heartbeat loss into a degrade only
|
/* Turn the clstr heartbeat loss into a degrade only
|
||||||
* condition for inactive controller on normal system. */
|
* condition for inactive controller on normal system. */
|
||||||
else if (( iface == CLSTR_IFACE ) &&
|
if (( iface == CLSTR_IFACE ) &&
|
||||||
( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) &&
|
((( this->system_type == SYSTEM_TYPE__NORMAL ) &&
|
||||||
( this->system_type == SYSTEM_TYPE__NORMAL ) &&
|
(( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE )) ||
|
||||||
(( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE ))
|
( clstr_degrade_only == true )))
|
||||||
{
|
{
|
||||||
/* Only print the log at the threshold boundary */
|
/* Only print the log at the threshold boundary */
|
||||||
if ( (pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
|
if ( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE == hbs_failure_threshold )
|
||||||
{
|
{
|
||||||
if ( this->active_controller )
|
if ( this->active_controller )
|
||||||
{
|
{
|
||||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
||||||
}
|
}
|
||||||
wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n",
|
wlog ( "%s %s *** Heartbeat Loss *** (degrade only due to %s)\n",
|
||||||
pulse_ptr->hostname.c_str(),
|
pulse_ptr->hostname.c_str(),
|
||||||
get_iface_name_str(iface));
|
get_iface_name_str(iface),
|
||||||
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
|
clstr_degrade_only ? "config option" : "system type");
|
||||||
|
hbs_cluster_change ( pulse_ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat loss" );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
else if ((pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
|
else if ((pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
|
||||||
|
// else if ( pulse_ptr->hbs_failure[iface] == false )
|
||||||
{
|
{
|
||||||
elog ("%s %s *** Heartbeat Loss ***\n", pulse_ptr->hostname.c_str(),
|
elog ("%s %s *** Heartbeat Loss *** (b2b_misses:0x%x)\n",
|
||||||
get_iface_name_str(iface) );
|
pulse_ptr->hostname.c_str(),
|
||||||
|
get_iface_name_str(iface),
|
||||||
|
pulse_ptr->b2b_misses_count[iface]);
|
||||||
|
hbs_cluster_change ( pulse_ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat loss" );
|
||||||
|
|
||||||
if ( this->active_controller )
|
if ( this->active_controller )
|
||||||
{
|
{
|
||||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
if ( pulse_ptr->hbs_failure[iface] == false )
|
||||||
|
{
|
||||||
|
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
||||||
|
}
|
||||||
/* report this host as failed */
|
/* report this host as failed */
|
||||||
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_LOSS , iface ) == PASS )
|
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_LOSS , iface ) == PASS )
|
||||||
{
|
{
|
||||||
|
@ -8832,10 +8825,8 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
|
||||||
pulse_ptr->hbs_failure[iface] = true ;
|
pulse_ptr->hbs_failure[iface] = true ;
|
||||||
}
|
|
||||||
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
|
|
||||||
pulse_ptr->hbs_failure_count[iface]++ ;
|
pulse_ptr->hbs_failure_count[iface]++ ;
|
||||||
}
|
}
|
||||||
if ( pulse_ptr->b2b_misses_count[iface] > pulse_ptr->max_count[iface] )
|
if ( pulse_ptr->b2b_misses_count[iface] > pulse_ptr->max_count[iface] )
|
||||||
|
|
|
@ -2374,6 +2374,7 @@ void daemon_service_run ( void )
|
||||||
arrival_histogram[iface] = "" ;
|
arrival_histogram[iface] = "" ;
|
||||||
unexpected_pulse_list[iface] = "" ;
|
unexpected_pulse_list[iface] = "" ;
|
||||||
|
|
||||||
|
|
||||||
rc = hbs_pulse_request ( (iface_enum)iface, seq_num, ri, rri );
|
rc = hbs_pulse_request ( (iface_enum)iface, seq_num, ri, rri );
|
||||||
if ( rc != 0 )
|
if ( rc != 0 )
|
||||||
{
|
{
|
||||||
|
|
|
@ -326,7 +326,7 @@ void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_type & cluster, stri
|
||||||
void hbs_sm_handler ( void );
|
void hbs_sm_handler ( void );
|
||||||
|
|
||||||
/* send the cluster vault to SM */
|
/* send the cluster vault to SM */
|
||||||
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason );
|
int hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason );
|
||||||
|
|
||||||
/* copy cluster data from src to dst */
|
/* copy cluster data from src to dst */
|
||||||
void hbs_cluster_copy ( mtce_hbs_cluster_type & src, mtce_hbs_cluster_type & dst );
|
void hbs_cluster_copy ( mtce_hbs_cluster_type & src, mtce_hbs_cluster_type & dst );
|
||||||
|
@ -338,6 +338,10 @@ void hbs_cluster_dump ( mtce_hbs_cluster_type & vault );
|
||||||
/* Heartbeat service state audit */
|
/* Heartbeat service state audit */
|
||||||
void hbs_state_audit ( void );
|
void hbs_state_audit ( void );
|
||||||
|
|
||||||
|
/* Send state change message to SM if there has been a
|
||||||
|
* state change in the last period */
|
||||||
|
void hbs_cluster_change_notifier ( void );
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @} hbs_base
|
* @} hbs_base
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -69,6 +69,8 @@ typedef struct
|
||||||
|
|
||||||
msgClassSock * sm_socket_ptr ;
|
msgClassSock * sm_socket_ptr ;
|
||||||
|
|
||||||
|
string cluster_change_reason ;
|
||||||
|
|
||||||
} hbs_cluster_ctrl_type ;
|
} hbs_cluster_ctrl_type ;
|
||||||
|
|
||||||
/* Cluster control structure construct allocation. */
|
/* Cluster control structure construct allocation. */
|
||||||
|
@ -122,6 +124,8 @@ void hbs_cluster_init ( unsigned short period, msgClassSock * sm_socket_ptr )
|
||||||
{
|
{
|
||||||
ctrl.sm_socket_ptr = sm_socket_ptr ;
|
ctrl.sm_socket_ptr = sm_socket_ptr ;
|
||||||
}
|
}
|
||||||
|
ctrl.cluster_change_reason = "";
|
||||||
|
|
||||||
ctrl.log_throttle = 0 ;
|
ctrl.log_throttle = 0 ;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -173,7 +177,30 @@ void hbs_cluster_nums ( unsigned short this_controller,
|
||||||
|
|
||||||
void hbs_cluster_change ( string cluster_change_reason )
|
void hbs_cluster_change ( string cluster_change_reason )
|
||||||
{
|
{
|
||||||
hbs_cluster_send( ctrl.sm_socket_ptr, 0, cluster_change_reason );
|
ilog ("reason: %s", cluster_change_reason.c_str());
|
||||||
|
if ( ctrl.cluster_change_reason.empty() )
|
||||||
|
ctrl.cluster_change_reason = cluster_change_reason ;
|
||||||
|
else
|
||||||
|
ctrl.cluster_change_reason.append("," + cluster_change_reason) ;
|
||||||
|
}
|
||||||
|
|
||||||
|
/****************************************************************************
|
||||||
|
*
|
||||||
|
* Name : hbs_cluster_change_notifier
|
||||||
|
*
|
||||||
|
* Description : Send SM the cluster info if there has been a state change.
|
||||||
|
*
|
||||||
|
***************************************************************************/
|
||||||
|
void hbs_cluster_change_notifier ( void )
|
||||||
|
{
|
||||||
|
if ( ! ctrl.cluster_change_reason.empty () )
|
||||||
|
{
|
||||||
|
if ( hbs_cluster_send( ctrl.sm_socket_ptr, 0,
|
||||||
|
ctrl.cluster_change_reason ) == PASS )
|
||||||
|
{
|
||||||
|
ctrl.cluster_change_reason.clear();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/****************************************************************************
|
/****************************************************************************
|
||||||
|
@ -444,6 +471,7 @@ void hbs_cluster_update ( iface_enum iface,
|
||||||
wlog_throttled ( ctrl.log_throttle, THROTTLE_COUNT,
|
wlog_throttled ( ctrl.log_throttle, THROTTLE_COUNT,
|
||||||
"Unable to store history beyond %d ",
|
"Unable to store history beyond %d ",
|
||||||
ctrl.cluster.histories );
|
ctrl.cluster.histories );
|
||||||
|
hbs_cluster_change_notifier ();
|
||||||
return ;
|
return ;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -544,6 +572,8 @@ void hbs_cluster_update ( iface_enum iface,
|
||||||
else
|
else
|
||||||
history_ptr->oldest_entry_index++ ;
|
history_ptr->oldest_entry_index++ ;
|
||||||
|
|
||||||
|
hbs_cluster_change_notifier ();
|
||||||
|
|
||||||
/* clear the log throttle if we are updating history ok. */
|
/* clear the log throttle if we are updating history ok. */
|
||||||
ctrl.log_throttle = 0 ;
|
ctrl.log_throttle = 0 ;
|
||||||
}
|
}
|
||||||
|
@ -647,12 +677,12 @@ unsigned short hbs_cluster_unused_bytes ( void )
|
||||||
*
|
*
|
||||||
***************************************************************************/
|
***************************************************************************/
|
||||||
|
|
||||||
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason )
|
int hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason )
|
||||||
{
|
{
|
||||||
|
int rc = FAIL_SOCKET_SENDTO ;
|
||||||
ctrl.cluster.reqid = (unsigned short)reqid ;
|
ctrl.cluster.reqid = (unsigned short)reqid ;
|
||||||
if (( sm_client_sock ) && ( sm_client_sock->sock_ok() == true ))
|
if (( sm_client_sock ) && ( sm_client_sock->sock_ok() == true ))
|
||||||
{
|
{
|
||||||
ilog ("cluster state notification Reason: %s", reason.c_str());
|
|
||||||
int len = sizeof(mtce_hbs_cluster_type)-hbs_cluster_unused_bytes();
|
int len = sizeof(mtce_hbs_cluster_type)-hbs_cluster_unused_bytes();
|
||||||
int bytes = sm_client_sock->write((char*)&ctrl.cluster, len);
|
int bytes = sm_client_sock->write((char*)&ctrl.cluster, len);
|
||||||
if ( bytes <= 0 )
|
if ( bytes <= 0 )
|
||||||
|
@ -660,12 +690,19 @@ void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason
|
||||||
elog ("failed to send cluster vault to SM (bytes=%d) (%d:%s)\n",
|
elog ("failed to send cluster vault to SM (bytes=%d) (%d:%s)\n",
|
||||||
bytes , errno, strerror(errno));
|
bytes , errno, strerror(errno));
|
||||||
}
|
}
|
||||||
hbs_cluster_dump ( ctrl.cluster );
|
else
|
||||||
|
{
|
||||||
|
/* limit the string length */
|
||||||
|
ilog ("reason: %s", reason.substr(0,80).c_str());
|
||||||
|
hbs_cluster_dump ( ctrl.cluster );
|
||||||
|
rc = PASS ;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
wlog ("cannot send cluster info due to socket error");
|
wlog ("cannot send cluster info due to socket error");
|
||||||
}
|
}
|
||||||
|
return(rc);
|
||||||
}
|
}
|
||||||
|
|
||||||
/****************************************************************************
|
/****************************************************************************
|
||||||
|
@ -689,7 +726,7 @@ void hbs_history_save ( string hostname,
|
||||||
{
|
{
|
||||||
if ( hbs_cluster_cmp( sample, ctrl.cluster.history[h] ) )
|
if ( hbs_cluster_cmp( sample, ctrl.cluster.history[h] ) )
|
||||||
{
|
{
|
||||||
hbs_cluster_change ("peer controller cluster event " +
|
hbs_cluster_change ("peer cluster delta " +
|
||||||
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network));
|
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue