Merge "Mtce heartbeat cluster state change notification improvement"

This commit is contained in:
Zuul 2021-01-18 16:15:27 +00:00 committed by Gerrit Code Review
commit 7a3adb2cdc
5 changed files with 78 additions and 54 deletions

View File

@ -1,22 +1,13 @@
[Unit] [Unit]
Description=StarlingX Maintenance Heartbeat Agent Description=StarlingX Maintenance Heartbeat Agent
After=network.target syslog.service config.service After=hbsClient.service
Before=pmon.service Before=pmon.service
[Service] [Service]
Type=forking Type=forking
ExecStart=/etc/rc.d/init.d/hbsAgent start ExecStart=/etc/rc.d/init.d/hbsAgent start
ExecStop=/etc/rc.d/init.d/hbsAgent start ExecStop=/etc/rc.d/init.d/hbsAgent stop
PIDFile=/var/run/hbsAgent.pid PIDFile=/var/run/hbsAgent.pid
KillMode=process
SendSIGKILL=no
# Process recovery is handled by pmond if it's running.
# Delay 10 seconds to give pmond a chance to recover
# before systemd kicks in to do it as a backup plan.
Restart=always
RestartSec=10
[Install] [Install]
WantedBy=multi-user.target WantedBy=multi-user.target

View File

@ -8628,7 +8628,7 @@ void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_p
#define HBS_LOSS_REPORT_THROTTLE (100) #define HBS_LOSS_REPORT_THROTTLE (100000)
int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
{ {
int lost = 0 ; int lost = 0 ;
@ -8668,6 +8668,13 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
if ( pulse_ptr->b2b_misses_count[iface] > 1 ) if ( pulse_ptr->b2b_misses_count[iface] > 1 )
{ {
if ( pulse_ptr->b2b_misses_count[iface] < hbs_failure_threshold )
{
hbs_cluster_change ( pulse_ptr->hostname + " " +
get_iface_name_str(iface) +
" heartbeat miss " +
itos(pulse_ptr->b2b_misses_count[iface]));
}
if ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) if ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold )
{ {
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold ) if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
@ -8774,57 +8781,43 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
} }
} }
/* Turn the cluster-host heartbeat loss into a degrade only
* condition if the clstr_degrade_only flag is set */
if (( iface == CLSTR_IFACE ) &&
( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) &&
( clstr_degrade_only == true ))
{
/* Only print the log at the threshold boundary */
if (( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
{
if ( this->active_controller )
{
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
}
wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n",
pulse_ptr->hostname.c_str(),
get_iface_name_str(iface) );
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
}
}
/* Turn the clstr heartbeat loss into a degrade only /* Turn the clstr heartbeat loss into a degrade only
* condition for inactive controller on normal system. */ * condition for inactive controller on normal system. */
else if (( iface == CLSTR_IFACE ) && if (( iface == CLSTR_IFACE ) &&
( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) && ((( this->system_type == SYSTEM_TYPE__NORMAL ) &&
( this->system_type == SYSTEM_TYPE__NORMAL ) && (( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE )) ||
(( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE )) ( clstr_degrade_only == true )))
{ {
/* Only print the log at the threshold boundary */ /* Only print the log at the threshold boundary */
if ( (pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold ) if ( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE == hbs_failure_threshold )
{ {
if ( this->active_controller ) if ( this->active_controller )
{ {
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface ); manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
} }
wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n", wlog ( "%s %s *** Heartbeat Loss *** (degrade only due to %s)\n",
pulse_ptr->hostname.c_str(), pulse_ptr->hostname.c_str(),
get_iface_name_str(iface)); get_iface_name_str(iface),
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" ); clstr_degrade_only ? "config option" : "system type");
hbs_cluster_change ( pulse_ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat loss" );
} }
} }
else if ((pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold ) else if ((pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
// else if ( pulse_ptr->hbs_failure[iface] == false )
{ {
elog ("%s %s *** Heartbeat Loss ***\n", pulse_ptr->hostname.c_str(), elog ("%s %s *** Heartbeat Loss *** (b2b_misses:0x%x)\n",
get_iface_name_str(iface) ); pulse_ptr->hostname.c_str(),
get_iface_name_str(iface),
pulse_ptr->b2b_misses_count[iface]);
hbs_cluster_change ( pulse_ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat loss" );
if ( this->active_controller ) if ( this->active_controller )
{ {
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface ); if ( pulse_ptr->hbs_failure[iface] == false )
{
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
}
/* report this host as failed */ /* report this host as failed */
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_LOSS , iface ) == PASS ) if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_LOSS , iface ) == PASS )
{ {
@ -8832,10 +8825,8 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
} }
} }
else else
{
pulse_ptr->hbs_failure[iface] = true ; pulse_ptr->hbs_failure[iface] = true ;
}
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
pulse_ptr->hbs_failure_count[iface]++ ; pulse_ptr->hbs_failure_count[iface]++ ;
} }
if ( pulse_ptr->b2b_misses_count[iface] > pulse_ptr->max_count[iface] ) if ( pulse_ptr->b2b_misses_count[iface] > pulse_ptr->max_count[iface] )

View File

@ -2374,6 +2374,7 @@ void daemon_service_run ( void )
arrival_histogram[iface] = "" ; arrival_histogram[iface] = "" ;
unexpected_pulse_list[iface] = "" ; unexpected_pulse_list[iface] = "" ;
rc = hbs_pulse_request ( (iface_enum)iface, seq_num, ri, rri ); rc = hbs_pulse_request ( (iface_enum)iface, seq_num, ri, rri );
if ( rc != 0 ) if ( rc != 0 )
{ {

View File

@ -326,7 +326,7 @@ void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_type & cluster, stri
void hbs_sm_handler ( void ); void hbs_sm_handler ( void );
/* send the cluster vault to SM */ /* send the cluster vault to SM */
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason ); int hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason );
/* copy cluster data from src to dst */ /* copy cluster data from src to dst */
void hbs_cluster_copy ( mtce_hbs_cluster_type & src, mtce_hbs_cluster_type & dst ); void hbs_cluster_copy ( mtce_hbs_cluster_type & src, mtce_hbs_cluster_type & dst );
@ -338,6 +338,10 @@ void hbs_cluster_dump ( mtce_hbs_cluster_type & vault );
/* Heartbeat service state audit */ /* Heartbeat service state audit */
void hbs_state_audit ( void ); void hbs_state_audit ( void );
/* Send state change message to SM if there has been a
* state change in the last period */
void hbs_cluster_change_notifier ( void );
/** /**
* @} hbs_base * @} hbs_base
*/ */

View File

@ -69,6 +69,8 @@ typedef struct
msgClassSock * sm_socket_ptr ; msgClassSock * sm_socket_ptr ;
string cluster_change_reason ;
} hbs_cluster_ctrl_type ; } hbs_cluster_ctrl_type ;
/* Cluster control structure construct allocation. */ /* Cluster control structure construct allocation. */
@ -122,6 +124,8 @@ void hbs_cluster_init ( unsigned short period, msgClassSock * sm_socket_ptr )
{ {
ctrl.sm_socket_ptr = sm_socket_ptr ; ctrl.sm_socket_ptr = sm_socket_ptr ;
} }
ctrl.cluster_change_reason = "";
ctrl.log_throttle = 0 ; ctrl.log_throttle = 0 ;
} }
@ -173,7 +177,30 @@ void hbs_cluster_nums ( unsigned short this_controller,
void hbs_cluster_change ( string cluster_change_reason ) void hbs_cluster_change ( string cluster_change_reason )
{ {
hbs_cluster_send( ctrl.sm_socket_ptr, 0, cluster_change_reason ); ilog ("reason: %s", cluster_change_reason.c_str());
if ( ctrl.cluster_change_reason.empty() )
ctrl.cluster_change_reason = cluster_change_reason ;
else
ctrl.cluster_change_reason.append("," + cluster_change_reason) ;
}
/****************************************************************************
*
* Name : hbs_cluster_change_notifier
*
* Description : Send SM the cluster info if there has been a state change.
*
***************************************************************************/
void hbs_cluster_change_notifier ( void )
{
if ( ! ctrl.cluster_change_reason.empty () )
{
if ( hbs_cluster_send( ctrl.sm_socket_ptr, 0,
ctrl.cluster_change_reason ) == PASS )
{
ctrl.cluster_change_reason.clear();
}
}
} }
/**************************************************************************** /****************************************************************************
@ -444,6 +471,7 @@ void hbs_cluster_update ( iface_enum iface,
wlog_throttled ( ctrl.log_throttle, THROTTLE_COUNT, wlog_throttled ( ctrl.log_throttle, THROTTLE_COUNT,
"Unable to store history beyond %d ", "Unable to store history beyond %d ",
ctrl.cluster.histories ); ctrl.cluster.histories );
hbs_cluster_change_notifier ();
return ; return ;
} }
else else
@ -544,6 +572,8 @@ void hbs_cluster_update ( iface_enum iface,
else else
history_ptr->oldest_entry_index++ ; history_ptr->oldest_entry_index++ ;
hbs_cluster_change_notifier ();
/* clear the log throttle if we are updating history ok. */ /* clear the log throttle if we are updating history ok. */
ctrl.log_throttle = 0 ; ctrl.log_throttle = 0 ;
} }
@ -647,12 +677,12 @@ unsigned short hbs_cluster_unused_bytes ( void )
* *
***************************************************************************/ ***************************************************************************/
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason ) int hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason )
{ {
int rc = FAIL_SOCKET_SENDTO ;
ctrl.cluster.reqid = (unsigned short)reqid ; ctrl.cluster.reqid = (unsigned short)reqid ;
if (( sm_client_sock ) && ( sm_client_sock->sock_ok() == true )) if (( sm_client_sock ) && ( sm_client_sock->sock_ok() == true ))
{ {
ilog ("cluster state notification Reason: %s", reason.c_str());
int len = sizeof(mtce_hbs_cluster_type)-hbs_cluster_unused_bytes(); int len = sizeof(mtce_hbs_cluster_type)-hbs_cluster_unused_bytes();
int bytes = sm_client_sock->write((char*)&ctrl.cluster, len); int bytes = sm_client_sock->write((char*)&ctrl.cluster, len);
if ( bytes <= 0 ) if ( bytes <= 0 )
@ -660,12 +690,19 @@ void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason
elog ("failed to send cluster vault to SM (bytes=%d) (%d:%s)\n", elog ("failed to send cluster vault to SM (bytes=%d) (%d:%s)\n",
bytes , errno, strerror(errno)); bytes , errno, strerror(errno));
} }
hbs_cluster_dump ( ctrl.cluster ); else
{
/* limit the string length */
ilog ("reason: %s", reason.substr(0,80).c_str());
hbs_cluster_dump ( ctrl.cluster );
rc = PASS ;
}
} }
else else
{ {
wlog ("cannot send cluster info due to socket error"); wlog ("cannot send cluster info due to socket error");
} }
return(rc);
} }
/**************************************************************************** /****************************************************************************
@ -689,7 +726,7 @@ void hbs_history_save ( string hostname,
{ {
if ( hbs_cluster_cmp( sample, ctrl.cluster.history[h] ) ) if ( hbs_cluster_cmp( sample, ctrl.cluster.history[h] ) )
{ {
hbs_cluster_change ("peer controller cluster event " + hbs_cluster_change ("peer cluster delta " +
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network)); hbs_cluster_network_name((mtce_hbs_network_enum)sample.network));
} }