Fix MNFA recovery race condition that leads to stuck degrade

Between 0 and 10% of hosts are seen to get stuck in the degrade
state after MNFA recovery.

Clearing host degrade on Multi-Node Failure Avoidance (MNFA)
recovery does not send a degrade clear; it only clears the hbs
control states and instead relies on explicit per host/network
clear events from the hbsAgent to do so.

If the MNFA Recovery (exit) event occurs before all of the
hbsAgent clear messages arrive, the hbs control clear tricks the
mtcAgent into thinking that no degrade event is active when one
may still be, so the host's degrade condition is never cleared.
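
For reference, the mon_host interface as it appears in the diff
hunks below; the parameter comments are illustrative, inferred
from the surrounding code rather than taken from its definition:

    int nodeLinkClass::mon_host ( const string & hostname,
                                  bool true_false,   /* true=start, false=stop monitoring   */
                                  bool send_clear ); /* true=also send a degrade clear event */

With send_clear=false only the hbs control state is reset and no
degrade clear is issued for the host.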

This fix enables the clear option on the mon_host call made for
MNFA Recovery so that the host's degrade condition is cleared.
It also removes the unnecessary heartbeat disable call.
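
Sketch of the net effect in the hbsAgent message handler, based
on the daemon_service_run hunk below (simplified, not the full
handler):

    /* Before: stop then restart monitoring, never sending a clear */
    hbsInv.mon_host ( hostname, false, false );
    hbsInv.mon_host ( hostname, true,  false );

    /* After: a single restart with send_clear=true so the host's
     * degrade condition is cleared; requests that target this
     * controller itself are skipped */
    if ( hostname != hbsInv.my_hostname )
        hbsInv.mon_host ( hostname, true, true );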

Test Plan:

PASS: Soak MNFA repeatedly in a large system to verify that the
      0-10% stuck degrade occurrence rate drops to 0 over many
      (more than 20) occurrences.

Regression:

PASS: Verify heartbeat.
PASS: Verify single node graceful recovery.

Change-Id: I699a376af5a95cc8dcc6ea5cc8266dc14fbacd09
Closes-Bug: 1845344
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>

@@ -7516,16 +7516,12 @@ int nodeLinkClass::mon_host ( const string & hostname, bool true_false, bool send_clear )
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
bool want_log = true ;
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
{
if ( iface == CLSTR_IFACE )
{
if ( this->clstr_network_provisioned == false )
continue ;
if ( node_ptr->monitor[MGMNT_IFACE] == true_false )
want_log = false ;
}
if ( send_clear == true )
@@ -7536,11 +7532,7 @@ int nodeLinkClass::mon_host ( const string & hostname, bool true_false, bool send_clear )
if ( true_false == true )
{
if ( want_log )
{
ilog ("%s starting heartbeat service \n",
hostname.c_str());
}
ilog ("%s heartbeat start", hostname.c_str());
node_ptr->no_work_log_throttle = 0 ;
node_ptr->b2b_misses_count[iface] = 0 ;
node_ptr->hbs_misses_count[iface] = 0 ;
@@ -7552,11 +7544,7 @@ int nodeLinkClass::mon_host ( const string & hostname, bool true_false, bool send_clear )
}
else
{
if ( want_log )
{
ilog ("%s stopping heartbeat service\n",
hostname.c_str());
}
ilog ("%s heartbeat stop", hostname.c_str());
}
node_ptr->monitor[iface] = true_false ;
}

@@ -1963,11 +1963,11 @@ void daemon_service_run ( void )
hbsInv.mon_host ( hostname, true, true );
}
}
else if ( msg.cmd == MTC_RESTART_HBS )
else if (( msg.cmd == MTC_RESTART_HBS ) &&
( hostname != hbsInv.my_hostname ))
{
hbsInv.mon_host ( hostname, false, false );
hbsInv.mon_host ( hostname, true, false );
ilog ("%s restarting heartbeat service\n", hostname.c_str());
hbsInv.mon_host ( hostname, true, true );
ilog ("%s heartbeat restart", hostname.c_str());
hbsInv.print_node_info();
}
else if ( msg.cmd == MTC_RECOVER_HBS )
@@ -1978,7 +1978,6 @@ void daemon_service_run ( void )
}
else if ( msg.cmd == MTC_BACKOFF_HBS )
{
hbsInv.hbs_pulse_period = (hbsInv.hbs_pulse_period_save * HBS_BACKOFF_FACTOR) ;
ilog ("%s starting heartbeat backoff (period:%d msecs)\n", hostname.c_str(), hbsInv.hbs_pulse_period );
hbs_cluster_change ( "backoff" );