Merge "Mtce: Improve robustness of heartbeat Loss reporting"

This commit is contained in:
Zuul 2018-12-21 14:59:11 +00:00 committed by Gerrit Code Review
commit 7512c6b105
5 changed files with 104 additions and 152 deletions

View File

@ -8249,7 +8249,7 @@ void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_p
#define HBS_LOSS_REPORT_THROTTLE (100)
int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
{
int lost = 0 ;
@ -8408,16 +8408,16 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
( infra_degrade_only == true ))
{
/* Only print the log at the threshold boundary */
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
if (( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
{
if ( this->active_controller )
{
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
}
wlog_throttled ( pulse_ptr->no_work_log_throttle, 500,
"%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface) );
wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n",
pulse_ptr->hostname.c_str(),
get_iface_name_str(iface) );
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
}
}
@ -8430,21 +8430,20 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
(( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE ))
{
/* Only print the log at the threshold boundary */
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
if ( (pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
{
if ( this->active_controller )
{
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
}
wlog_throttled ( pulse_ptr->no_work_log_throttle, 500,
"%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface) );
wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n",
pulse_ptr->hostname.c_str(),
get_iface_name_str(iface));
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
}
}
else if (( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold ) &&
( pulse_ptr->hbs_failure[iface] == false ))
else if ((pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
{
elog ("%s %s *** Heartbeat Loss ***\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface) );

View File

@ -1183,11 +1183,11 @@ int send_event ( string & hostname, unsigned int event_cmd, iface_enum iface )
memset (&event, 0 , sizeof(mtc_message_type));
if ( event_cmd == MTC_EVENT_HEARTBEAT_LOSS )
{
// daemon_dump_membuf_banner ();
hbsInv.print_node_info ();
if ( hbs_config.debug_state == 2 )
{
hbsInv.print_node_info ();
}
hbs_cluster_log ( hbsInv.my_hostname, "event", true );
// daemon_dump_membuf ();
snprintf ( &event.hdr[0] , MSG_HEADER_SIZE, "%s", get_heartbeat_loss_header());
}
else if ( event_cmd == MTC_EVENT_LOOPBACK )
@ -1504,9 +1504,6 @@ void daemon_service_run ( void )
daemon_exit ();
}
/* set this controller as provisioned */
hbs_manage_controller_state ( hbsInv.my_hostname , true );
/* Run heartbeat service forever or until stop condition */
for ( hbsTimer.ring = false , hbsTimer_audit.ring = false ; ; )
{
@ -1518,7 +1515,6 @@ void daemon_service_run ( void )
if ( sockets_init == true )
{
hbsInv.print_node_info();
hbs_state_audit ();
}
@ -1725,7 +1721,7 @@ void daemon_service_run ( void )
{
hbsInv.hbs_disabled = true ;
hbsInv.hbs_state_change = true ;
hbs_cluster_lock();
hbs_controller_lock ();
ilog ("heartbeat service going disabled (locked)");
/* force the throttle 'still disabled' log to wait for
@ -1904,17 +1900,12 @@ void daemon_service_run ( void )
}
else if ( msg.cmd == MTC_CMD_STOP_HOST )
{
if ( hostname == hbsInv.my_hostname )
{
ilog ("%s heartbeat service disabled by stop command",
hostname.c_str());
hbs_manage_controller_state( hostname, false );
}
else
if ( hostname != hbsInv.my_hostname )
{
hbsInv.mon_host ( hostname, false, true );
hbs_cluster_del ( hostname );
ilog ("%s heartbeat service disabled by stop command",
hostname.c_str());
}
}
else if ( msg.cmd == MTC_CMD_START_HOST )
@ -2309,7 +2300,10 @@ void daemon_service_run ( void )
*/
bool storage_0_responding = true ;
int lost = hbsInv.lost_pulses ((iface_enum)iface, storage_0_responding);
hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding);
if ( !hbs_ctrl.locked && !hbsInv.hbs_disabled )
{
hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding);
}
}
hbsTimer.ring = false ;
heartbeat_request = true ;

View File

@ -250,8 +250,9 @@ void hbs_cluster_change ( string cluster_change_reason );
* Automatically adjusts the numbers in the cluster vault. */
void hbs_cluster_add ( string & hostname );
void hbs_cluster_del ( string & hostname );
void hbs_cluster_rem ( unsigned short controller );
void hbs_cluster_lock ( void );
/* do actions when this controller is detected as locked */
void hbs_controller_lock ( void );
/* Do stuff in preparation for another pulse period start */
void hbs_cluster_period_start ( void );
@ -263,9 +264,6 @@ void hbs_cluster_storage0_status ( iface_enum iface , bool responding );
int hbs_cluster_cmp( mtce_hbs_cluster_history_type h1,
mtce_hbs_cluster_history_type h2 );
/* Manage the enabled state of the controllers */
void hbs_manage_controller_state ( string & hostname, bool enabled );
/* Set the number of monitored hosts and this controller's
* number in the cluster vault. */
void hbs_cluster_nums ( unsigned short this_controller,

View File

@ -750,6 +750,7 @@ static unsigned int rri[MTCE_HBS_MAX_CONTROLLERS] = {0,0} ;
static int rx_error_count[MAX_IFACES] = {0,0} ;
static int tx_error_count[MAX_IFACES] = {0,0} ;
static int missing_history_count[MAX_IFACES] = {0,0} ;
#define ERROR_LOG_THRESHOLD (200)
@ -961,18 +962,21 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
}
else
{
int debug_state = daemon_get_cfg_ptr()->debug_state ;
clog ("controller-%d %s cluster info added to response (%d)",
controller?0:1,
get_iface_name_str(iface), missed_controller_summary_tracker[controller?0:1] );
get_iface_name_str(iface),
missed_controller_summary_tracker[controller?0:1] );
/* Now copy the other controller's cached cluster info into
* this controller's response */
hbs_cluster_copy ( controller_cluster_cache[controller?0:1],
hbs_sock.rx_mesg[iface].cluster );
if ( daemon_get_cfg_ptr()->debug_state & 4 )
string dump_banner = "" ;
if ( debug_state )
{
string dump_banner = "" ;
dump_banner.append("controller-") ;
dump_banner.append(itos(controller?0:1));
dump_banner.append(" cluster info from cache injected into controller-");
@ -980,10 +984,30 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
dump_banner.append(":");
dump_banner.append(get_iface_name_str(iface));
dump_banner.append(" pulse response");
}
if ( debug_state & 4 )
{
hbs_cluster_dump ( hbs_sock.rx_mesg[iface].cluster, dump_banner );
}
else
{
clog ("%s", dump_banner.c_str());
}
}
}
if (missing_history_count[iface])
{
ilog ("controller-%d %s providing cluster history",
controller, get_iface_name_str(iface));
missing_history_count[iface] = 0 ;
}
}
else
{
wlog_throttled ( missing_history_count[iface], 5000,
"controller-%d %s proividing no cluster history",
controller, get_iface_name_str(iface));
}
}

View File

@ -30,17 +30,17 @@ typedef struct
/* Contains the controller number (0 or 1) for this controller. */
unsigned short this_controller ;
/* Preserves which controllers are enabled. */
bool controller_0_enabled ;
bool controller_1_enabled ;
#ifdef THREE_CONTROLLER_SYSTEM
bool controller_2_enabled ;
#endif
/* Used to manage the cluster based on this and peer controller state */
bool peer_controller_enabled ;
/* Used to prevent log flooding in presence of back to back errors. */
unsigned int log_throttle ;
/* Used to log when
* - peer history goes missing (false -> true change)
* - peer history starts being received ( true -> false change ) */
bool peer_history_missing ;
/* Used to threshold storage-0 not responding state */
unsigned int storage_0_not_responding_count[MTCE_HBS_NETWORKS];
@ -130,12 +130,8 @@ void hbs_cluster_init ( unsigned short period, msgClassSock * sm_socket_ptr )
void hbs_cluster_ctrl_init ( void )
{
ctrl.this_controller = 0xffff ;
ctrl.controller_0_enabled = false ;
ctrl.controller_1_enabled = false ;
#ifdef THREE_CONTROLLER_SYSTEM
ctrl.controller_2_enabled = false ;
#endif
ctrl.peer_controller_enabled = false ;
ctrl.peer_history_missing = true ;
ctrl.log_throttle = 0 ;
ctrl.monitored_networks = 0 ;
ctrl.monitored_hosts = 0 ;
@ -236,64 +232,6 @@ void cluster_storage0_state ( bool enabled )
}
}
/****************************************************************************
*
* Name : hbs_manage_controller_state
*
* Description : Track the monitored enabled state of the controllers.
*
***************************************************************************/
void hbs_manage_controller_state ( string & hostname, bool enabled )
{
int controller = -1 ;
/* track controller state */
if ( hostname == CONTROLLER_0 )
{
controller = 0 ;
ctrl.controller_0_enabled = enabled ;
}
else if ( hostname == CONTROLLER_1 )
{
controller = 1 ;
ctrl.controller_1_enabled = enabled ;
}
else
{
/* ignore all other host names */
return ;
}
/* manage the state of the peer controller */
if ( ctrl.this_controller != controller )
{
/* Clear peer controller cluster history when the peer
* controller goes disabled */
if (( ctrl.peer_controller_enabled == true ) &&
( enabled == false ))
{
hbs_cluster_rem ( controller );
}
if ( enabled == false )
{
hbs_cluster_change ( "peer controller disabled" ) ;
}
else
{
hbs_cluster_change ( "peer controller enabled" ) ;
}
ctrl.peer_controller_enabled = enabled ;
}
else if ( enabled == false )
{
hbs_cluster_change ( "this controller locked" ) ;
hbs_cluster_lock();
}
}
/****************************************************************************
*
* Name : hbs_cluster_add
@ -346,8 +284,11 @@ void hbs_cluster_add ( string & hostname )
hbs_cluster_init ( ctrl.cluster.period_msec, NULL );
}
/* Manage controller state ; true means enabled in this case. */
hbs_manage_controller_state ( hostname, true );
/* Catch enable/provisioning of the peer controller */
if (( hostname == CONTROLLER_0 ) && ( ctrl.this_controller != 0 ))
ctrl.peer_controller_enabled = true ;
if (( hostname == CONTROLLER_1 ) && ( ctrl.this_controller != 1 ))
ctrl.peer_controller_enabled = true ;
}
/****************************************************************************
@ -391,9 +332,6 @@ void hbs_cluster_del ( string & hostname )
hbs_cluster_init ( ctrl.cluster.period_msec, NULL );
}
/* Manage controller state ; false means not enabled in this case. */
hbs_manage_controller_state ( hostname , false );
ilog ("%s deleted from cluster", hostname.c_str());
cluster_list ();
@ -455,7 +393,6 @@ void hbs_cluster_period_start ( void )
*
***************************************************************************/
void hbs_cluster_update ( iface_enum iface,
unsigned short not_responding_hosts,
bool storage_0_responding )
@ -636,7 +573,7 @@ void hbs_cluster_append ( hbs_message_type & msg )
}
msg.cluster.bytes = BYTES_IN_CLUSTER_VAULT(msg.cluster.histories);
clog2 ("controller-%d appending cluster info to heartbeat message (%d:%d:%d)",
clog1 ("controller-%d appending cluster info to heartbeat message (%d:%d:%d)",
ctrl.this_controller, ctrl.monitored_networks, ctrl.cluster.histories, msg.cluster.bytes );
}
@ -649,12 +586,22 @@ void hbs_cluster_peer ( void )
if (( ctrl.got_peer_controller_history == false ) &&
( ctrl.peer_controller_enabled == true ))
{
ilog ("missing peer controller cluster view" ); /* ERIK: DEBUG */
if ( ctrl.peer_history_missing == false )
{
wlog ( "missing peer controller cluster view" );
ctrl.peer_history_missing = true ;
}
/* if no nodes have reported peer controller history then inject
* a 0:0 value in for this pulse period for that controller. */
hbs_cluster_inject ( ctrl.this_controller?0:1, 0, 0 );
}
else if (( ctrl.got_peer_controller_history == true ) &&
( ctrl.peer_controller_enabled == true ) &&
( ctrl.peer_history_missing == true ))
{
wlog ( "receiving peer controller cluster view" );
ctrl.peer_history_missing = false ;
}
}
/****************************************************************************
@ -720,7 +667,9 @@ void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason
*
***************************************************************************/
void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample )
void hbs_history_save ( string hostname,
mtce_hbs_network_enum network,
mtce_hbs_cluster_history_type & sample )
{
for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
{
@ -736,12 +685,12 @@ void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample
memcpy( &ctrl.cluster.history[h], &sample,
sizeof(mtce_hbs_cluster_history_type));
clog1 ("controller-%d updated vault with controller-%d:%s network history through %s (histories:%d)",
clog1 ("controller-%d vault update from controller-%d %s reply with %d histories (this:%s)",
ctrl.this_controller,
sample.controller,
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str(),
hostname.c_str(),
ctrl.cluster.histories);
hbs_cluster_network_name(network).c_str(),
ctrl.cluster.histories,
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str());
return ;
}
}
@ -756,16 +705,17 @@ void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample
ctrl.cluster.histories++ ;
ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories);
ilog ("controller-%d added new controller-%d:%s history to vault ; now have %d network views",
ilog ("controller-%d added new %s:%s history to vault ; now have %d network views",
ctrl.this_controller,
sample.controller,
hostname.c_str(),
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str(),
ctrl.cluster.histories);
}
void hbs_state_audit ( void )
{
hbs_cluster_dump ( ctrl.cluster, "Audit" );
if ( ctrl.monitored_hosts )
hbs_cluster_dump ( ctrl.cluster, "Audit" );
}
@ -872,7 +822,7 @@ int hbs_cluster_save ( string & hostname,
{
/* set that we got some history and save it */
ctrl.got_peer_controller_history = true ;
hbs_history_save ( hostname, msg.cluster.history[h] );
hbs_history_save ( hostname, network, msg.cluster.history[h] );
}
hbs_cluster_log( hostname, ctrl.cluster, hbs_cluster_network_name(network) );
}
@ -915,39 +865,26 @@ void hbs_cluster_inject ( unsigned short controller, unsigned short hosts_enable
}
}
/****************************************************************************
*
* Name : hbs_controller_lock
*
* Description : Clear all history for this controller.
* Called when this controller is detected as locked.
*
***************************************************************************/
void hbs_cluster_rem ( unsigned short controller )
void hbs_controller_lock ( void )
{
int removed = 0 ;
for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
if ( ctrl.cluster.histories )
{
if ( ctrl.cluster.history[h].controller == controller )
ilog ("controller-%d locked ; clearing all cluster info", ctrl.this_controller );
for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
{
removed++ ;
wlog ("controller-%d %s network history removed from cluster (slot %d)",
controller,
hbs_cluster_network_name((mtce_hbs_network_enum)ctrl.cluster.history[h].network).c_str(),
h );
memset ( &ctrl.cluster.history[h], 0, sizeof(mtce_hbs_cluster_history_type));
}
ctrl.cluster.histories = 0 ;
hbs_cluster_change ( "this controller locked" ) ;
}
if ( removed )
{
hbs_cluster_change ( "removed controller history" ) ;
}
ctrl.cluster.histories -= removed ;
ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories);
}
/* remove all cluster history on a lock operation */
void hbs_cluster_lock( void )
{
ilog ("controller-%d lock ; clearing all cluster info", ctrl.this_controller );
for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
{
memset ( &ctrl.cluster.history[h], 0, sizeof(mtce_hbs_cluster_history_type));
}
ctrl.cluster.histories = 0 ;
}