Merge "Mtce heartbeat cluster state change notification improvement"

This commit is contained in:
Zuul 2021-01-18 16:15:27 +00:00 committed by Gerrit Code Review
commit 7a3adb2cdc
5 changed files with 78 additions and 54 deletions

View File

@ -1,22 +1,13 @@
[Unit]
Description=StarlingX Maintenance Heartbeat Agent
After=network.target syslog.service config.service
After=hbsClient.service
Before=pmon.service
[Service]
Type=forking
ExecStart=/etc/rc.d/init.d/hbsAgent start
ExecStop=/etc/rc.d/init.d/hbsAgent start
ExecStop=/etc/rc.d/init.d/hbsAgent stop
PIDFile=/var/run/hbsAgent.pid
KillMode=process
SendSIGKILL=no
# Process recovery is handled by pmond if it's running.
# Delay 10 seconds to give pmond a chance to recover
# before systemd kicks in to do it as a backup plan.
Restart=always
RestartSec=10
[Install]
WantedBy=multi-user.target

View File

@ -8628,7 +8628,7 @@ void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_p
#define HBS_LOSS_REPORT_THROTTLE (100)
#define HBS_LOSS_REPORT_THROTTLE (100000)
int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
{
int lost = 0 ;
@ -8668,6 +8668,13 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
if ( pulse_ptr->b2b_misses_count[iface] > 1 )
{
if ( pulse_ptr->b2b_misses_count[iface] < hbs_failure_threshold )
{
hbs_cluster_change ( pulse_ptr->hostname + " " +
get_iface_name_str(iface) +
" heartbeat miss " +
itos(pulse_ptr->b2b_misses_count[iface]));
}
if ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold )
{
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
@ -8774,57 +8781,43 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
}
}
/* Turn the cluster-host heartbeat loss into a degrade only
* condition if the clstr_degrade_only flag is set */
if (( iface == CLSTR_IFACE ) &&
( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) &&
( clstr_degrade_only == true ))
{
/* Only print the log at the threshold boundary */
if (( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
{
if ( this->active_controller )
{
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
}
wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n",
pulse_ptr->hostname.c_str(),
get_iface_name_str(iface) );
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
}
}
/* Turn the clstr heartbeat loss into a degrade only
* condition for inactive controller on normal system. */
else if (( iface == CLSTR_IFACE ) &&
( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) &&
( this->system_type == SYSTEM_TYPE__NORMAL ) &&
(( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE ))
if (( iface == CLSTR_IFACE ) &&
((( this->system_type == SYSTEM_TYPE__NORMAL ) &&
(( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE )) ||
( clstr_degrade_only == true )))
{
/* Only print the log at the threshold boundary */
if ( (pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
if ( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE == hbs_failure_threshold )
{
if ( this->active_controller )
{
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
}
wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n",
wlog ( "%s %s *** Heartbeat Loss *** (degrade only due to %s)\n",
pulse_ptr->hostname.c_str(),
get_iface_name_str(iface));
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
get_iface_name_str(iface),
clstr_degrade_only ? "config option" : "system type");
hbs_cluster_change ( pulse_ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat loss" );
}
}
else if ((pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
// else if ( pulse_ptr->hbs_failure[iface] == false )
{
elog ("%s %s *** Heartbeat Loss ***\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface) );
elog ("%s %s *** Heartbeat Loss *** (b2b_misses:0x%x)\n",
pulse_ptr->hostname.c_str(),
get_iface_name_str(iface),
pulse_ptr->b2b_misses_count[iface]);
hbs_cluster_change ( pulse_ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat loss" );
if ( this->active_controller )
{
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
if ( pulse_ptr->hbs_failure[iface] == false )
{
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
}
/* report this host as failed */
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_LOSS , iface ) == PASS )
{
@ -8832,10 +8825,8 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
}
}
else
{
pulse_ptr->hbs_failure[iface] = true ;
}
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
pulse_ptr->hbs_failure_count[iface]++ ;
}
if ( pulse_ptr->b2b_misses_count[iface] > pulse_ptr->max_count[iface] )

View File

@ -2374,6 +2374,7 @@ void daemon_service_run ( void )
arrival_histogram[iface] = "" ;
unexpected_pulse_list[iface] = "" ;
rc = hbs_pulse_request ( (iface_enum)iface, seq_num, ri, rri );
if ( rc != 0 )
{

View File

@ -326,7 +326,7 @@ void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_type & cluster, stri
void hbs_sm_handler ( void );
/* send the cluster vault to SM */
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason );
int hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason );
/* copy cluster data from src to dst */
void hbs_cluster_copy ( mtce_hbs_cluster_type & src, mtce_hbs_cluster_type & dst );
@ -338,6 +338,10 @@ void hbs_cluster_dump ( mtce_hbs_cluster_type & vault );
/* Heartbeat service state audit */
void hbs_state_audit ( void );
/* Send state change message to SM if there has been a
* state change in the last period */
void hbs_cluster_change_notifier ( void );
/**
* @} hbs_base
*/

View File

@ -69,6 +69,8 @@ typedef struct
msgClassSock * sm_socket_ptr ;
string cluster_change_reason ;
} hbs_cluster_ctrl_type ;
/* Cluster control structure construct allocation. */
@ -122,6 +124,8 @@ void hbs_cluster_init ( unsigned short period, msgClassSock * sm_socket_ptr )
{
ctrl.sm_socket_ptr = sm_socket_ptr ;
}
ctrl.cluster_change_reason = "";
ctrl.log_throttle = 0 ;
}
@ -173,7 +177,30 @@ void hbs_cluster_nums ( unsigned short this_controller,
void hbs_cluster_change ( string cluster_change_reason )
{
/* Record a cluster state change reason.
 * Reasons are collected in ctrl.cluster_change_reason as a
 * comma-separated list. */

/* NOTE(review): this call sends right away with reqid 0 and does not
 * check any result, while the code below also queues the same reason.
 * Confirm that both the immediate send and the queuing are intended. */
hbs_cluster_send( ctrl.sm_socket_ptr, 0, cluster_change_reason );
ilog ("reason: %s", cluster_change_reason.c_str());
/* The first pending reason is stored as-is; each later one is appended
 * after a "," separator so earlier pending reasons are not overwritten. */
if ( ctrl.cluster_change_reason.empty() )
ctrl.cluster_change_reason = cluster_change_reason ;
else
ctrl.cluster_change_reason.append("," + cluster_change_reason) ;
}
/****************************************************************************
*
* Name : hbs_cluster_change_notifier
*
* Description : Send SM the cluster info if there has been a state change.
*
* A state change is considered pending while
* ctrl.cluster_change_reason is non-empty. The pending
* reason text is passed to hbs_cluster_send along with
* ctrl.sm_socket_ptr and a request id of 0.
*
* The pending reason is cleared only when hbs_cluster_send
* returns PASS ; on any other result it is left in place,
* so it is still pending the next time this is called.
*
* Parameters : none
*
* Returns : nothing
*
***************************************************************************/
void hbs_cluster_change_notifier ( void )
{
/* nothing to do unless a change reason is pending */
if ( ! ctrl.cluster_change_reason.empty () )
{
if ( hbs_cluster_send( ctrl.sm_socket_ptr, 0,
ctrl.cluster_change_reason ) == PASS )
{
/* sent ok ; drop the pending reason so it is not sent again */
ctrl.cluster_change_reason.clear();
}
}
}
/****************************************************************************
@ -444,6 +471,7 @@ void hbs_cluster_update ( iface_enum iface,
wlog_throttled ( ctrl.log_throttle, THROTTLE_COUNT,
"Unable to store history beyond %d ",
ctrl.cluster.histories );
hbs_cluster_change_notifier ();
return ;
}
else
@ -544,6 +572,8 @@ void hbs_cluster_update ( iface_enum iface,
else
history_ptr->oldest_entry_index++ ;
hbs_cluster_change_notifier ();
/* clear the log throttle if we are updating history ok. */
ctrl.log_throttle = 0 ;
}
@ -647,12 +677,12 @@ unsigned short hbs_cluster_unused_bytes ( void )
*
***************************************************************************/
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason )
int hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason )
{
int rc = FAIL_SOCKET_SENDTO ;
ctrl.cluster.reqid = (unsigned short)reqid ;
if (( sm_client_sock ) && ( sm_client_sock->sock_ok() == true ))
{
ilog ("cluster state notification Reason: %s", reason.c_str());
int len = sizeof(mtce_hbs_cluster_type)-hbs_cluster_unused_bytes();
int bytes = sm_client_sock->write((char*)&ctrl.cluster, len);
if ( bytes <= 0 )
@ -660,12 +690,19 @@ void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason
elog ("failed to send cluster vault to SM (bytes=%d) (%d:%s)\n",
bytes , errno, strerror(errno));
}
hbs_cluster_dump ( ctrl.cluster );
else
{
/* limit the string length */
ilog ("reason: %s", reason.substr(0,80).c_str());
hbs_cluster_dump ( ctrl.cluster );
rc = PASS ;
}
}
else
{
wlog ("cannot send cluster info due to socket error");
}
return(rc);
}
/****************************************************************************
@ -689,7 +726,7 @@ void hbs_history_save ( string hostname,
{
if ( hbs_cluster_cmp( sample, ctrl.cluster.history[h] ) )
{
hbs_cluster_change ("peer controller cluster event " +
hbs_cluster_change ("peer cluster delta " +
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network));
}