From 2210c71216416ac3cfd46d2dc82d0114ee2ef7e1 Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Sun, 6 Sep 2020 07:58:29 -0400 Subject: [PATCH] Fix Mtce Heartbeat period recovery on MNFA Exit When Multi-Node Failure Avoidance (MNFA) occurs, maintenance commands the Heartbeat Agent to slow down by a factor of 4. The rate recovery following a MNFA is not occurring. Update https://review.opendev.org/#/c/701057 made a condition check change that introduced this issue by requiring mnfa_timeout to be non-zero before an attempt is made to recover heartbeat period following MNFA recovery. This update switches that condition check to use the more specific mnfa_backoff state tracker and because MNFA is a global maintenance mode feature rather than a node specific feature, moves the recovery check code from the node level fsm into a mnfa_recovery_handler called in the main select loop. Test Plan: PASS: Verify MNFA handling/recovery with mnfa_timeout!=0 that expires. PASS: Verify MNFA handling/recovery when mnfa_timeout!=0 but before the timeout expires. PASS: Verify MNFA handling/recovery when mnfa_timeout=0 PASS: Verify MNFA backoff rate recovery over mtcAgent process restart. PASS: Verify MNFA backoff rate is sent to hbsAgent if hbsAgent restarts while MNFA is active. 
Change-Id: I8da5a000ab503692c7cfa620233ed8aa772c50f8 Closes-Bug: #1893212 Signed-off-by: Eric MacDonald --- mtce/src/common/nodeClass.h | 5 ++- mtce/src/maintenance/mtcCtrlMsg.cpp | 6 +++ mtce/src/maintenance/mtcNodeCtrl.cpp | 3 ++ mtce/src/maintenance/mtcNodeFsm.cpp | 9 +--- mtce/src/maintenance/mtcNodeMnfa.cpp | 63 +++++++++++++++++++++++++--- 5 files changed, 71 insertions(+), 15 deletions(-) diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h index 425c4c0c..42ca79d6 100755 --- a/mtce/src/common/nodeClass.h +++ b/mtce/src/common/nodeClass.h @@ -1382,7 +1382,9 @@ public: /* the main fsm entrypoint to service all hosts */ void fsm ( void ) ; - /** This controller's hostname set'er */ + void mnfa_recovery_handler ( string & hostname ); + + /** This controller's hostname set'er */ void set_my_hostname ( string hostname ); /** This controller's hostname get'er */ @@ -1506,6 +1508,7 @@ public: * node failure avoidance threshold and until there are no more * in service trouble hosts */ bool mnfa_active ; + bool mnfa_backoff = false ; void mnfa_cancel( void ); std::list mnfa_awol_list ; diff --git a/mtce/src/maintenance/mtcCtrlMsg.cpp b/mtce/src/maintenance/mtcCtrlMsg.cpp index c6a9c69d..6a820ed1 100755 --- a/mtce/src/maintenance/mtcCtrlMsg.cpp +++ b/mtce/src/maintenance/mtcCtrlMsg.cpp @@ -1226,6 +1226,12 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) ilog ("%s %s inventory push ... 
done", controller.c_str(), MTC_SERVICE_HBSAGENT_NAME); + + /* Ensure that the hbsAgent heartbeat period is correct */ + if ( obj_ptr->mnfa_backoff == true ) + send_hbs_command ( obj_ptr->my_hostname, MTC_BACKOFF_HBS, CONTROLLER ); + else + send_hbs_command ( obj_ptr->my_hostname, MTC_RECOVER_HBS, CONTROLLER ); } else { diff --git a/mtce/src/maintenance/mtcNodeCtrl.cpp b/mtce/src/maintenance/mtcNodeCtrl.cpp index ea77d7f2..6732ca88 100644 --- a/mtce/src/maintenance/mtcNodeCtrl.cpp +++ b/mtce/src/maintenance/mtcNodeCtrl.cpp @@ -1569,6 +1569,9 @@ void daemon_service_run ( void ) continue ; } + /* Handle recovery from MNFA */ + mtcInv.mnfa_recovery_handler ( mtcInv.my_hostname ); + mtcInv.fsm ( ); /* Initialize the master fd_set */ diff --git a/mtce/src/maintenance/mtcNodeFsm.cpp b/mtce/src/maintenance/mtcNodeFsm.cpp index 38aee7cd..af5e9a26 100755 --- a/mtce/src/maintenance/mtcNodeFsm.cpp +++ b/mtce/src/maintenance/mtcNodeFsm.cpp @@ -41,13 +41,6 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr ) return FAIL ; } - /* if the multi-Node-Failure Avoidance timer rang then run its recovery handler */ - if (( this->mnfa_timeout != 0 ) && ( mtcTimer_mnfa.ring == true )) - { - mtcTimer_mnfa.ring = false ; - mnfa_exit ( true ); - } - /* handle clear task request */ if ( node_ptr->clear_task == true ) { @@ -57,7 +50,7 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr ) /* Service the libEvent work queue */ workQueue_process ( node_ptr ) ; - + /* Service the maintenance command queue if there are commands waiting */ if ( node_ptr->mtcCmd_work_fifo.size()) { diff --git a/mtce/src/maintenance/mtcNodeMnfa.cpp b/mtce/src/maintenance/mtcNodeMnfa.cpp index edad5440..af2493b1 100644 --- a/mtce/src/maintenance/mtcNodeMnfa.cpp +++ b/mtce/src/maintenance/mtcNodeMnfa.cpp @@ -202,7 +202,7 @@ void nodeLinkClass::mnfa_enter ( void ) wlog ("MNFA ENTER --> Entering Multi-Node Failure Avoidance\n"); mtcAlarm_log ( active_controller_hostname , 
MTC_LOG_ID__EVENT_MNFA_ENTER ); mnfa_active = true ; - + mnfa_backoff = true ; send_hbs_command ( my_hostname, MTC_BACKOFF_HBS ); /* Handle the case where we are already trying to recover from a @@ -237,6 +237,10 @@ void nodeLinkClass::mnfa_enter ( void ) wlog ("MNFA Auto-Recovery in %d seconds\n", this->mnfa_timeout); mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, this->mnfa_timeout); } + else + { + this->mtcTimer_mnfa.ring = false ; + } log_mnfa_pool ( mnfa_awol_list ); } @@ -342,11 +346,6 @@ void nodeLinkClass::mnfa_exit ( bool force ) /* Start the timer that will eventually send the MTC_RECOVER_HBS command */ mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, MTC_MNFA_RECOVERY_TIMER ); } - else - { - send_hbs_command ( my_hostname, MTC_RECOVER_HBS ); - } - mnfa_host_count[MGMNT_IFACE] = 0 ; mnfa_host_count[CLSTR_IFACE] = 0 ; mnfa_awol_list.clear(); @@ -392,3 +391,55 @@ void nodeLinkClass::mnfa_cancel ( void ) } mnfa_awol_list.clear(); } + +/************************************************************************** + * + * Name : mnfa_recovery_handler + * + * Purpose : Handle recovery from mnfa + * + * Description: This handler is called from the main loop to handle + * exiting MNFA and scheduling a timer to send the recover + * command to hbsAgent at base level. + * + * Assumptions: Need to send the recover command to hbsAgent at base level. + * + * If mnfa is timer driven ( mnfa_timeout != 0 ) then exit + * from mnfa happens within the mnfa timer handler which + * should not be sending messages. 
+ * + **************************************************************************/ + +void nodeLinkClass::mnfa_recovery_handler ( string & hostname ) +{ + /* if the multi-Node-Failure Avoidance timer rang + * then run the recovery handler */ + if ( this->mtcTimer_mnfa.ring == true ) + { + /* rang due to mnfa_timeout */ + if ( this->mnfa_active == true ) + { + mtcTimer_mnfa.ring = false ; + mnfa_exit ( true ); + } + /* rang due to 3 second recovery timer set in mnfa_exit */ + else if ( this->mnfa_backoff == true ) + { + ilog("%s heartbeat backoff recovery", hostname.c_str()) + if ( send_hbs_command ( my_hostname, MTC_RECOVER_HBS ) == PASS ) + { + this->mnfa_backoff = false ; + } + else + { + int retry_timeout = MTC_SECS_30 ; + + /* in the case of a send failure, to avoid log flooding, + * start the timer again in 30 seconds */ + mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, retry_timeout ); + ilog("%s heartbeat backoff recovery command send failed, retrying in %d secs", + hostname.c_str(), retry_timeout); + } + } + } +}