Mtce: Debounce heartbeat recovery

For the event of Heartbeat Failure with a host, the Mtce Heartbeat Agent
will declare heartbeat recovery upon the first successful heartbeat
reply after the loss is declared; basically edge-triggered
recovery.

In cases where a networking issue causes heartbeat loss of a group of
hosts, Maintenance tracks the group of hosts that experienced heartbeat
loss and puts the system into 'Multi Node Failure Avoidance' mode.
Maintenance then simply waits up to a configured timeout period for
hosts to regain heartbeat.
As heartbeat is regained for each host, Maintenance attempts to get
that host 'Gracefully Recovered'.

However, if the networking issue persists in a way that the occasional
transient heartbeat pulse gets through then the maintenance system can
prematurely take hosts and then 'the system' out of MNFA mode only to
find that heartbeat is actually not properly recovered/working only to
then fail and force reboot/reset each node that is still experiencing
heartbeat loss.

This update changes the heartbeat service from an 'edge' to 'level'
sensitive recovery by requiring a number of back-2-back heartbeat pulses
following a failure before that host is declared as recovered and pulled
out of the MNFA pool.

Basically, this update makes the system's MNFA recovery algorithm more
robust in the face of transient heartbeat loss for a group of hosts.

Story: 2002882
Task: 22845

Change-Id: Ie36b73a14cfad317d900e3a3a9ddb434326737a1
Signed-off-by: Jack Ding <jack.ding@windriver.com>
This commit is contained in:
Eric MacDonald 2018-06-08 09:45:50 -04:00 committed by Jack Ding
parent ed1410a736
commit acd2d684f6
4 changed files with 88 additions and 64 deletions

View File

@ -165,7 +165,7 @@ void daemon_exit ( void );
#define MAX_API_LOG_LEN (0x1000)
#define MAX_FILENAME_LEN (100)
#define MAX_SYSTEM_CMD_LEN (200)
#define HBS_PULSES_REQUIRED_FOR_RECOVERY (10)
#define MAX_START_SERVICES_RETRY (20)
#define DEFAULT_MTCALIVE_TIMEOUT (1200)

View File

@ -690,7 +690,9 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
ptr->max_count[i] = 0 ;
ptr->hbs_count[i] = 0 ;
ptr->hbs_minor_count[i] = 0 ;
ptr->hbs_misses_count[i] = 0 ;
ptr->b2b_misses_count[i] = 0 ;
ptr->b2b_pulses_count[i] = 0 ;
ptr->hbs_degrade_count[i] = 0 ;
ptr->hbs_failure_count[i] = 0 ;
ptr->heartbeat_failed[i] = false;
@ -1139,25 +1141,26 @@ void nodeLinkClass::print_node_info ( void )
if (( i == INFRA_IFACE ) && ( infra_network_provisioned == false ))
continue ;
syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+-----------------+\n");
syslog ( LOG_INFO, "| %s: %3d | Mon | Mis | Max | Deg | Fail | Pulses | %s (%4d) |\n" ,
syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+---------+-----------------+\n");
syslog ( LOG_INFO, "| %s: %3d | Mon | Mis | Max | Deg | Fail | Pulses Tot | Pulses | %s (%4d) |\n" ,
get_iface_name_str ((iface_enum)i), hosts, hbs_disabled ? "DISABLED" : "Enabled ", hbs_pulse_period );
syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+-----------------+\n");
syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+---------+-----------------+\n");
for ( struct node * ptr = head ; ptr != NULL ; ptr = ptr->next )
{
syslog ( LOG_INFO, "| %-12s | %c | %3i | %4i | %3i | %4i | %8x | %d msec\n",
syslog ( LOG_INFO, "| %-12s | %c | %3i | %4i | %3i | %4i | %8x | %7x | %d msec\n",
ptr->hostname.c_str(),
ptr->monitor[i] ? 'Y' : 'n',
ptr->b2b_misses_count[i],
ptr->hbs_misses_count[i],
ptr->max_count[i],
ptr->hbs_degrade_count[i],
ptr->hbs_failure_count[i],
ptr->hbs_count[i],
ptr->hbs_count[i],
ptr->b2b_pulses_count[i],
hbs_pulse_period );
}
}
syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+-----------------+\n");
syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+---------+-----------------+\n");
}
}
@ -7285,7 +7288,7 @@ int nodeLinkClass::launch_host_services_cmd ( struct nodeLinkClass::node * node_
int send_event ( string & hostname, unsigned int cmd, iface_enum iface );
int nodeLinkClass::mon_host ( const string & hostname, iface_enum iface, bool true_false )
int nodeLinkClass::mon_host ( const string & hostname, iface_enum iface, bool true_false, bool send_clear )
{
int rc = FAIL ;
if ( ! hostname.empty() )
@ -7299,12 +7302,17 @@ int nodeLinkClass::mon_host ( const string & hostname, iface_enum iface, bool tr
{
node_ptr->no_work_log_throttle = 0 ;
node_ptr->b2b_misses_count[iface] = 0 ;
node_ptr->hbs_misses_count[iface] = 0 ;
node_ptr->b2b_pulses_count[iface] = 0 ;
node_ptr->max_count[iface] = 0 ;
node_ptr->hbs_failure[iface] = false ;
send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_MINOR_CLR, iface ) ;
node_ptr->hbs_minor[iface] = false ;
send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_CLR, iface ) ;
node_ptr->hbs_degrade[iface] = false ;
if ( send_clear == true )
{
send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_MINOR_CLR, iface ) ;
send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_CLR, iface ) ;
}
}
return PASS ;
}
@ -7693,12 +7701,57 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle
if (( pulse_list[iface].head_ptr != NULL ) && ( ptr != NULL ) && ( ptr->linknum[iface] != 0))
{
pulse_ptr = ptr ;
ptr->hbs_count[iface]++ ;
manage_pulse_flags ( pulse_ptr , flags );
/* clear_b2b_misses_count override check ; threshold recovery */
if ( clear_b2b_misses_count == true )
{
ptr->hbs_count[iface]++ ;
ptr->b2b_pulses_count[iface]++ ;
if ( ptr->hbs_failure[iface] == true )
{
/* threshold failure recovery */
if ( ptr->b2b_pulses_count[iface] < HBS_PULSES_REQUIRED_FOR_RECOVERY )
{
/* don't clear the alarm or send clear notifications to mtc
* if this interfaces failed and has not yet received the
* required number of back to back pulses needed for recovery */
clear_b2b_misses_count = false ;
ilog ("%s %s heartbeat failure recovery (%d of %d)\n",
node_ptr->hostname.c_str(),
get_iface_name_str(iface),
ptr->b2b_pulses_count[iface],
HBS_PULSES_REQUIRED_FOR_RECOVERY);
}
else
{
ptr->hbs_failure[iface] = false ;
ilog ("%s %s heartbeat failure recovery (%d)\n",
node_ptr->hostname.c_str(),
get_iface_name_str(iface),
ptr->b2b_pulses_count[iface]);
}
}
else
{
ptr->b2b_misses_count[iface] = 0 ;
}
}
else
{
if (( ptr->b2b_pulses_count[iface] != 0 ) && ( ptr->hbs_failure[iface] == true ))
{
ilog ("%s %s failed but %d\n", node_ptr->hostname.c_str(),
get_iface_name_str(iface),
ptr->b2b_pulses_count[iface]);
}
}
if ( clear_b2b_misses_count == true )
{
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CLEAR, iface );
if ( ptr->b2b_misses_count[iface] > hbs_degrade_threshold )
{
ilog ("%s %s Pulse Rxed (after %d misses)\n",
@ -7707,8 +7760,6 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle
node_ptr->b2b_misses_count[iface]);
}
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CLEAR, iface );
ptr->b2b_misses_count[iface] = 0 ;
if ( pulse_ptr->hbs_degrade[iface] == true )
{
@ -7964,15 +8015,15 @@ int nodeLinkClass::lost_pulses ( iface_enum iface )
{
string flat = "Flat Line:" ;
pulse_ptr->b2b_misses_count[iface]++ ;
pulse_ptr->hbs_misses_count[iface]++ ;
pulse_ptr->b2b_pulses_count[iface] = 0 ;
// pulse_ptr->max_count[iface]++ ;
/* Don't log single misses unless in debug mode */
if ( pulse_ptr->b2b_misses_count[iface] > 1 )
{
// if ( pulse_ptr->b2b_misses_count[iface] >= 25 )
if ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold )
{
// if ( pulse_ptr->b2b_misses_count[iface] == 25 )
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
{
ilog ("%-13s %s Pulse Miss (%d) (log throttled to every %d)\n",
@ -8440,13 +8491,15 @@ void nodeLinkClass::mem_log_hbs_cnts ( struct nodeLinkClass::node * node_ptr )
char str[MAX_MEM_LOG_DATA] ;
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
{
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s Counts Minor:%d Degrade:%d Failed:%d Max:%d Cur:%d\n",
node_ptr->hostname.c_str(),
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s Counts Minor:%d Degrade:%d Failed:%d Misses:%d MaxB2BMisses:%d Cur:%d Tot:%d\n",
node_ptr->hostname.c_str(),
get_iface_name_str(iface),
node_ptr->hbs_minor_count[iface],
node_ptr->hbs_degrade_count[iface],
node_ptr->hbs_minor_count[iface],
node_ptr->hbs_degrade_count[iface],
node_ptr->hbs_failure_count[iface],
node_ptr->hbs_misses_count[iface],
node_ptr->max_count[iface],
node_ptr->b2b_pulses_count[iface],
node_ptr->hbs_count[iface]);
mem_log (str);
}

View File

@ -515,9 +515,15 @@ private:
/** Ongoing heartbeat count cleared on HBS_START reset */
int hbs_count [MAX_IFACES] ;
/** Keep track of the number of misses since heartbeat was started */
int hbs_misses_count [MAX_IFACES];
/** Immediate running count of consecutive heartbeat misses */
int b2b_misses_count [MAX_IFACES];
/** Number of consecutive pulses received since last miss */
int b2b_pulses_count [MAX_IFACES];
/** Maximum heartbeat misses since node was last brought into service */
int max_count [MAX_IFACES];
@ -1929,7 +1935,7 @@ public:
void manage_pulse_flags ( string & hostname, unsigned int flags );
/** Control the heartbeat monitoring state of a host */
int mon_host ( const string & hostname, iface_enum iface, bool true_false );
int mon_host ( const string & hostname, iface_enum iface, bool true_false, bool send_clear );
/** Return true if the pulse list is empty */
bool pulse_list_empty ( iface_enum iface );

View File

@ -132,33 +132,6 @@ void monitor_scheduling ( unsigned long long & this_time, unsigned long long & p
prev_time = this_time ;
}
/**
 * Recompute the heartbeat pulse period and the minor / degrade / failure
 * miss thresholds held in hbsInv, then log the resulting values.
 *
 * When the host count exceeds hbs_config.hbs_calibrate_threshold, each value
 * is the host count multiplied by its matching hbs_calibrate_*_factor.
 * Otherwise the values are copied straight from hbs_config.
 *
 * NOTE(review): although this is a nodeLinkClass member, it reads and writes
 * the hbsInv object rather than `this` - confirm that is intended.
 */
void nodeLinkClass::recalibrate_thresholds ( void )
{
/* Above the calibration threshold: scale every setting by the host count */
if ( hbsInv.hosts > hbs_config.hbs_calibrate_threshold )
{
hbsInv.hbs_pulse_period = (hbsInv.hosts * hbs_config.hbs_calibrate_period_factor ) ;
hbsInv.hbs_minor_threshold = (hbsInv.hosts * hbs_config.hbs_calibrate_minor_factor ) ;
hbsInv.hbs_degrade_threshold = (hbsInv.hosts * hbs_config.hbs_calibrate_degrade_factor) ;
hbsInv.hbs_failure_threshold = (hbsInv.hosts * hbs_config.hbs_calibrate_fail_factor ) ;
}
/* At or below the calibration threshold: use the configured values as-is */
else
{
hbsInv.hbs_pulse_period = hbs_config.hbs_pulse_period ;
hbsInv.hbs_minor_threshold = hbs_config.hbs_minor_threshold ;
hbsInv.hbs_degrade_threshold = hbs_config.hbs_degrade_threshold ;
hbsInv.hbs_failure_threshold = hbs_config.hbs_failure_threshold ;
}
/* Keep a copy of the pulse period that was just selected */
hbsInv.hbs_pulse_period_save = hbsInv.hbs_pulse_period ;
ilog ("Heartbeat Thresholds ; hosts:%d pulse:%d msecs - minor:%d degrade:%d failure:%d\n",
hbsInv.hosts,
hbsInv.hbs_pulse_period,
hbsInv.hbs_minor_threshold,
hbsInv.hbs_degrade_threshold,
hbsInv.hbs_failure_threshold);
}
/* Cleanup exit handler */
void daemon_exit ( void )
{
@ -253,18 +226,18 @@ static int hbs_config_handler ( void * user,
if (MATCH("agent", "hbs_minor_threshold"))
{
config_ptr->hbs_minor_threshold = atoi(value);
config_ptr->hbs_minor_threshold =
hbsInv.hbs_minor_threshold = atoi(value);
}
if (MATCH("agent", "heartbeat_degrade_threshold"))
{
config_ptr->hbs_degrade_threshold = atoi(value);
config_ptr->hbs_degrade_threshold =
hbsInv.hbs_degrade_threshold = atoi(value);
config_ptr->mask |= CONFIG_AGENT_HBS_DEGRADE ;
}
if (MATCH("agent", "heartbeat_failure_threshold"))
{
config_ptr->hbs_failure_threshold = atoi(value);
config_ptr->hbs_failure_threshold =
hbsInv.hbs_failure_threshold = atoi(value);
config_ptr->mask |= CONFIG_AGENT_HBS_FAILURE ;
}
@ -1387,14 +1360,12 @@ void daemon_service_run ( void )
/* clear any outstanding alarms on the ADD */
hbsAlarm_clear_all ( hostname );
// hbsInv.recalibrate_thresholds ();
}
else if ( msg.cmd == MTC_CMD_DEL_HOST )
{
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
{
hbsInv.mon_host ( hostname, (iface_enum)iface, false );
hbsInv.mon_host ( hostname, (iface_enum)iface, false, false );
}
hbsInv.del_host ( hostname );
@ -1402,35 +1373,29 @@ void daemon_service_run ( void )
/* clear any outstanding alarms on the DEL */
hbsAlarm_clear_all ( hostname );
hbsInv.print_node_info();
// hbsInv.recalibrate_thresholds ();
}
else if ( msg.cmd == MTC_CMD_STOP_HOST )
{
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
{
hbsInv.mon_host ( hostname, (iface_enum)iface, false );
hbsInv.mon_host ( hostname, (iface_enum)iface, false, true );
}
ilog ("%s stopping heartbeat service\n", hostname.c_str());
hbsInv.print_node_info();
}
else if ( msg.cmd == MTC_CMD_START_HOST )
{
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
{
hbsInv.mon_host ( hostname, (iface_enum)iface, true );
hbsInv.mon_host ( hostname, (iface_enum)iface, true, true );
}
ilog ("%s starting heartbeat service\n", hostname.c_str());
hbsInv.print_node_info();
}
else if ( msg.cmd == MTC_RESTART_HBS )
{
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
{
hbsInv.mon_host ( hostname, (iface_enum)iface, false );
hbsInv.mon_host ( hostname, (iface_enum)iface, true );
hbsInv.mon_host ( hostname, (iface_enum)iface, false, false );
hbsInv.mon_host ( hostname, (iface_enum)iface, true, false );
}
ilog ("%s restarting heartbeat service\n", hostname.c_str());
hbsInv.print_node_info();