diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h index 425c4c0c..42ca79d6 100755 --- a/mtce/src/common/nodeClass.h +++ b/mtce/src/common/nodeClass.h @@ -1382,7 +1382,9 @@ public: /* the main fsm entrypoint to service all hosts */ void fsm ( void ) ; - /** This controller's hostname set'er */ + void mnfa_recovery_handler ( string & hostname ); + + /** This controller's hostname set'er */ void set_my_hostname ( string hostname ); /** This controller's hostname get'er */ @@ -1506,6 +1508,7 @@ public: * node failure avoidance threshold and until there are no more * in service trouble hosts */ bool mnfa_active ; + bool mnfa_backoff = false ; void mnfa_cancel( void ); std::list mnfa_awol_list ; diff --git a/mtce/src/maintenance/mtcCtrlMsg.cpp b/mtce/src/maintenance/mtcCtrlMsg.cpp index c6a9c69d..6a820ed1 100755 --- a/mtce/src/maintenance/mtcCtrlMsg.cpp +++ b/mtce/src/maintenance/mtcCtrlMsg.cpp @@ -1226,6 +1226,12 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) ilog ("%s %s inventory push ... 
done", controller.c_str(), MTC_SERVICE_HBSAGENT_NAME); + + /* Ensure that the hbsAgent heartbeat period is correct */ + if ( obj_ptr->mnfa_backoff == true ) + send_hbs_command ( obj_ptr->my_hostname, MTC_BACKOFF_HBS, CONTROLLER ); + else + send_hbs_command ( obj_ptr->my_hostname, MTC_RECOVER_HBS, CONTROLLER ); } else { diff --git a/mtce/src/maintenance/mtcNodeCtrl.cpp b/mtce/src/maintenance/mtcNodeCtrl.cpp index ea77d7f2..6732ca88 100644 --- a/mtce/src/maintenance/mtcNodeCtrl.cpp +++ b/mtce/src/maintenance/mtcNodeCtrl.cpp @@ -1569,6 +1569,9 @@ void daemon_service_run ( void ) continue ; } + /* Handle recovery from MNFA */ + mtcInv.mnfa_recovery_handler ( mtcInv.my_hostname ); + mtcInv.fsm ( ); /* Initialize the master fd_set */ diff --git a/mtce/src/maintenance/mtcNodeFsm.cpp b/mtce/src/maintenance/mtcNodeFsm.cpp index 38aee7cd..af5e9a26 100755 --- a/mtce/src/maintenance/mtcNodeFsm.cpp +++ b/mtce/src/maintenance/mtcNodeFsm.cpp @@ -41,13 +41,6 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr ) return FAIL ; } - /* if the multi-Node-Failure Avoidance timer rang then run its recovery handler */ - if (( this->mnfa_timeout != 0 ) && ( mtcTimer_mnfa.ring == true )) - { - mtcTimer_mnfa.ring = false ; - mnfa_exit ( true ); - } - /* handle clear task request */ if ( node_ptr->clear_task == true ) { @@ -57,7 +50,7 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr ) /* Service the libEvent work queue */ workQueue_process ( node_ptr ) ; - + /* Service the maintenance command queue if there are commands waiting */ if ( node_ptr->mtcCmd_work_fifo.size()) { diff --git a/mtce/src/maintenance/mtcNodeMnfa.cpp b/mtce/src/maintenance/mtcNodeMnfa.cpp index edad5440..af2493b1 100644 --- a/mtce/src/maintenance/mtcNodeMnfa.cpp +++ b/mtce/src/maintenance/mtcNodeMnfa.cpp @@ -202,7 +202,7 @@ void nodeLinkClass::mnfa_enter ( void ) wlog ("MNFA ENTER --> Entering Multi-Node Failure Avoidance\n"); mtcAlarm_log ( active_controller_hostname , 
MTC_LOG_ID__EVENT_MNFA_ENTER ); mnfa_active = true ; - + mnfa_backoff = true ; send_hbs_command ( my_hostname, MTC_BACKOFF_HBS ); /* Handle the case where we are already trying to recover from a @@ -237,6 +237,10 @@ void nodeLinkClass::mnfa_enter ( void ) wlog ("MNFA Auto-Recovery in %d seconds\n", this->mnfa_timeout); mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, this->mnfa_timeout); } + else + { + this->mtcTimer_mnfa.ring = false ; + } log_mnfa_pool ( mnfa_awol_list ); } @@ -342,11 +346,6 @@ void nodeLinkClass::mnfa_exit ( bool force ) /* Start the timer that will eventually send the MTC_RECOVER_HBS command */ mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, MTC_MNFA_RECOVERY_TIMER ); } - else - { - send_hbs_command ( my_hostname, MTC_RECOVER_HBS ); - } - mnfa_host_count[MGMNT_IFACE] = 0 ; mnfa_host_count[CLSTR_IFACE] = 0 ; mnfa_awol_list.clear(); @@ -392,3 +391,55 @@ void nodeLinkClass::mnfa_cancel ( void ) } mnfa_awol_list.clear(); } + +/************************************************************************** + * + * Name : mnfa_recovery_handler + * + * Purpose : Handle recovery from mnfa + * + * Description: This handler is called from the main loop to handle + * exiting MNFA and scheduling a timer to send the recover + * command to hbsAgent at base level. + * + * Assumptions: Need to send the recover command to hbsAgent at base level. + * + * If mnfa is timer driven ( mnfa_timeout != 0 ) then exit + * from mnfa happens within the mnfa timer handler which + * should not be sending messages. 
+ *
+ **************************************************************************/
+
+void nodeLinkClass::mnfa_recovery_handler ( string & hostname )
+{
+    /* Nothing to do until the MNFA timer has rung. */
+    if ( this->mtcTimer_mnfa.ring == true )
+    {
+        /* Rang while MNFA is still active: leave MNFA via mnfa_exit. */
+        if ( this->mnfa_active == true )
+        {
+            mtcTimer_mnfa.ring = false ;
+            mnfa_exit ( true );
+        }
+        /* Rang with backoff still pending: send the recover command. */
+        else if ( this->mnfa_backoff == true )
+        {
+            /* Consume the ring so a failed send is retried only when the
+             * retry timer fires rather than on every main loop pass. */
+            mtcTimer_mnfa.ring = false ;
+            ilog("%s heartbeat backoff recovery", hostname.c_str());
+            if ( send_hbs_command ( my_hostname, MTC_RECOVER_HBS ) == PASS )
+            {
+                this->mnfa_backoff = false ;
+            }
+            else
+            {
+                /* Pace retries 30 seconds apart to avoid log flooding. */
+                int retry_timeout = MTC_SECS_30 ;
+                mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, retry_timeout );
+                wlog("%s heartbeat backoff recovery command send failed, retrying in %d secs",
+                         hostname.c_str(), retry_timeout);
+            }
+        }
+    }
+}