Fix Mtce Heartbeat period recovery on MNFA Exit

When Multi-Node Failure Avoidance (MNFA) occurs,
maintenance commands the Heartbeat Agent to slow
the heartbeat rate down by a factor of 4.
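
For context, a minimal standalone sketch of that backoff arithmetic
(the 100 ms base period and all names here are illustrative
assumptions, not taken from this change):

    #include <cstdio>

    /* Illustration only: the factor-of-4 heartbeat period backoff. */
    #define MNFA_BACKOFF_FACTOR 4

    int main ( void )
    {
        int base_period_ms    = 100 ; /* assumed nominal period */
        int backoff_period_ms = base_period_ms * MNFA_BACKOFF_FACTOR ;
        printf ("heartbeat period during MNFA: %d ms\n", backoff_period_ms); /* 400 ms */
        return 0 ;
    }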

The rate recovery following an MNFA event is not occurring.

Update https://review.opendev.org/#/c/701057 introduced
this issue with a condition check change that requires
mnfa_timeout to be non-zero before an attempt is made to
recover the heartbeat period following MNFA recovery.
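
A standalone demo of the regressed gate (simplified from the removed
fsm code shown below; the variables are local stand-ins for the
class members):

    #include <cstdio>

    /* When mnfa_timeout is configured to 0, a rung recovery
     * timer is never serviced, so the heartbeat period is
     * never restored. */
    int  mnfa_timeout = 0 ;    /* timer-less MNFA configuration */
    bool timer_ring   = true ; /* the recovery timer has rung   */

    int main ( void )
    {
        if (( mnfa_timeout != 0 ) && ( timer_ring == true ))
            printf ("recover heartbeat period\n");
        else
            printf ("recovery skipped ; heartbeat stays backed off\n");
        return 0 ;
    }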

This update switches that condition check to use the more
specific mnfa_backoff state tracker and, because MNFA is a
global maintenance feature rather than a node-specific one,
moves the recovery check code from the node-level fsm into
a new mnfa_recovery_handler called from the main select loop.
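
The resulting lifecycle of the tracker, modeled as a minimal
standalone sketch (real mtce types and messaging are replaced with
stubs; only mnfa_backoff behaves as in the change below):

    #include <cstdio>

    static bool mnfa_backoff = false ;

    /* stub for send_hbs_command ( my_hostname, MTC_RECOVER_HBS ) */
    static bool send_recover_command ( void ) { return true ; }

    static void mnfa_enter ( void )
    {
        mnfa_backoff = true ; /* heartbeat period backed off */
    }

    static void mnfa_recovery_handler ( void )
    {
        /* clear the tracker only once the recover command sends */
        if (( mnfa_backoff == true ) && ( send_recover_command() == true ))
            mnfa_backoff = false ;
    }

    int main ( void )
    {
        mnfa_enter();
        mnfa_recovery_handler();
        printf ("backoff active: %s\n", mnfa_backoff ? "yes" : "no"); /* no */
        return 0 ;
    }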

Test Plan:

PASS: Verify MNFA handling/recovery with mnfa_timeout != 0
      where the timeout expires.
PASS: Verify MNFA handling/recovery with mnfa_timeout != 0
      where recovery completes before the timeout expires.
PASS: Verify MNFA handling/recovery with mnfa_timeout = 0.
PASS: Verify MNFA backoff rate recovery over a mtcAgent
      process restart.
PASS: Verify the MNFA backoff rate is sent to the hbsAgent
      if the hbsAgent restarts while MNFA is active.

Change-Id: I8da5a000ab503692c7cfa620233ed8aa772c50f8
Closes-Bug: #1893212
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
Author: Eric MacDonald 2020-09-06 07:58:29 -04:00
Commit: 2210c71216 (parent c7e18ca9e9)
5 changed files with 71 additions and 15 deletions

@@ -1382,7 +1382,9 @@ public:
    /* the main fsm entrypoint to service all hosts */
    void fsm ( void ) ;

    /** Handle heartbeat period recovery following MNFA exit */
    void mnfa_recovery_handler ( string & hostname );

    /** This controller's hostname set'er */
    void set_my_hostname ( string hostname );

    /** This controller's hostname get'er */
@@ -1506,6 +1508,7 @@ public:
     * node failure avoidance threshold and until there are no more
     * in service trouble hosts */
    bool mnfa_active ;
    bool mnfa_backoff = false ;

    void mnfa_cancel( void );
    std::list<string> mnfa_awol_list ;

@@ -1226,6 +1226,12 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
            ilog ("%s %s inventory push ... done",
                      controller.c_str(),
                      MTC_SERVICE_HBSAGENT_NAME);

            /* Ensure that the hbsAgent heartbeat period is correct */
            if ( obj_ptr->mnfa_backoff == true )
                send_hbs_command ( obj_ptr->my_hostname, MTC_BACKOFF_HBS, CONTROLLER );
            else
                send_hbs_command ( obj_ptr->my_hostname, MTC_RECOVER_HBS, CONTROLLER );
        }
        else
        {

@@ -1569,6 +1569,9 @@ void daemon_service_run ( void )
                continue ;
            }

            /* Handle recovery from MNFA */
            mtcInv.mnfa_recovery_handler ( mtcInv.my_hostname );

            mtcInv.fsm ( );

            /* Initialize the master fd_set */

@@ -41,13 +41,6 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
        return FAIL ;
    }

    /* if the multi-Node-Failure Avoidance timer rang then run its recovery handler */
    if (( this->mnfa_timeout != 0 ) && ( mtcTimer_mnfa.ring == true ))
    {
        mtcTimer_mnfa.ring = false ;
        mnfa_exit ( true );
    }

    /* handle clear task request */
    if ( node_ptr->clear_task == true )
    {

@@ -57,7 +50,7 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
    /* Service the libEvent work queue */
    workQueue_process ( node_ptr ) ;

    /* Service the maintenance command queue if there are commands waiting */
    if ( node_ptr->mtcCmd_work_fifo.size())
    {

@@ -202,7 +202,7 @@ void nodeLinkClass::mnfa_enter ( void )
    wlog ("MNFA ENTER --> Entering Multi-Node Failure Avoidance\n");
    mtcAlarm_log ( active_controller_hostname , MTC_LOG_ID__EVENT_MNFA_ENTER );

    mnfa_active  = true ;
    mnfa_backoff = true ;
    send_hbs_command ( my_hostname, MTC_BACKOFF_HBS );

    /* Handle the case where we are already trying to recover from a

@@ -237,6 +237,10 @@ void nodeLinkClass::mnfa_enter ( void )
        wlog ("MNFA Auto-Recovery in %d seconds\n", this->mnfa_timeout);
        mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, this->mnfa_timeout);
    }
    else
    {
        this->mtcTimer_mnfa.ring = false ;
    }

    log_mnfa_pool ( mnfa_awol_list );
}
@@ -342,11 +346,6 @@ void nodeLinkClass::mnfa_exit ( bool force )
        /* Start the timer that will eventually send the MTC_RECOVER_HBS command */
        mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, MTC_MNFA_RECOVERY_TIMER );
    }
    else
    {
        send_hbs_command ( my_hostname, MTC_RECOVER_HBS );
    }

    mnfa_host_count[MGMNT_IFACE] = 0 ;
    mnfa_host_count[CLSTR_IFACE] = 0 ;
    mnfa_awol_list.clear();

@@ -392,3 +391,55 @@ void nodeLinkClass::mnfa_cancel ( void )
    }
    mnfa_awol_list.clear();
}

/**************************************************************************
 *
 * Name       : mnfa_recovery_handler
 *
 * Purpose    : Handle recovery from MNFA.
 *
 * Description: This handler is called from the main loop to handle
 *              exiting MNFA and scheduling a timer to send the recover
 *              command to the hbsAgent at base level.
 *
 * Assumptions: The recover command needs to be sent to the hbsAgent at
 *              base level.
 *
 *              If MNFA is timer driven ( mnfa_timeout != 0 ) then the
 *              exit from MNFA happens within the mnfa timer handler,
 *              which should not be sending messages.
 *
 **************************************************************************/

void nodeLinkClass::mnfa_recovery_handler ( string & hostname )
{
    /* if the multi-Node-Failure Avoidance timer rang
     * then run the recovery handler */
    if ( this->mtcTimer_mnfa.ring == true )
    {
        /* rang due to mnfa_timeout */
        if ( this->mnfa_active == true )
        {
            mtcTimer_mnfa.ring = false ;
            mnfa_exit ( true );
        }
        /* rang due to the 3 second recovery timer set in mnfa_exit */
        else if ( this->mnfa_backoff == true )
        {
            ilog ("%s heartbeat backoff recovery", hostname.c_str());
            if ( send_hbs_command ( my_hostname, MTC_RECOVER_HBS ) == PASS )
            {
                this->mnfa_backoff = false ;
            }
            else
            {
                int retry_timeout = MTC_SECS_30 ;

                /* in the case of a send failure, to avoid log flooding,
                 * start the timer again in 30 seconds */
                mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, retry_timeout );
                ilog ("%s heartbeat backoff recovery command send failed, retrying in %d secs",
                          hostname.c_str(), retry_timeout);
            }
        }
    }
}