diff --git a/mtce-common/src/common/nodeBase.cpp b/mtce-common/src/common/nodeBase.cpp index c333656b..7ba6197d 100755 --- a/mtce-common/src/common/nodeBase.cpp +++ b/mtce-common/src/common/nodeBase.cpp @@ -376,6 +376,7 @@ void mtc_stages_init ( void ) enableStages_str [MTC_ENABLE__STAGES ] = "unknown" ; recoveryStages_str[MTC_RECOVERY__START ] = "Handler-Start"; + recoveryStages_str[MTC_RECOVERY__RETRY_WAIT ] = "Req-Retry-Wait"; recoveryStages_str[MTC_RECOVERY__REQ_MTCALIVE ] = "Req-MtcAlive"; recoveryStages_str[MTC_RECOVERY__REQ_MTCALIVE_WAIT ] = "Req-MtcAlive-Wait"; recoveryStages_str[MTC_RECOVERY__RESET_RECV_WAIT ] = "Reset-Recv-Wait"; diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h index da71cd08..75a9e26b 100755 --- a/mtce-common/src/common/nodeBase.h +++ b/mtce-common/src/common/nodeBase.h @@ -936,10 +936,11 @@ typedef enum string get_delStages_str ( mtc_delStages_enum stage ); -#define MTC_MAX_FAST_ENABLES (2) +#define MTC_MAX_FAST_ENABLES (3) typedef enum { MTC_RECOVERY__START = 0, + MTC_RECOVERY__RETRY_WAIT, MTC_RECOVERY__REQ_MTCALIVE, MTC_RECOVERY__REQ_MTCALIVE_WAIT, MTC_RECOVERY__RESET_RECV_WAIT, diff --git a/mtce/src/maintenance/mtcNodeCtrl.cpp b/mtce/src/maintenance/mtcNodeCtrl.cpp index e8996423..867bd7f8 100644 --- a/mtce/src/maintenance/mtcNodeCtrl.cpp +++ b/mtce/src/maintenance/mtcNodeCtrl.cpp @@ -1173,6 +1173,9 @@ int _self_provision ( void ) return(rc); } +static int sm_unhealthy_log_throttle = 0 ; +#define SM_UNHEALTHY_LOG_THROTTLE (100) + /* Main FSM Loop */ void nodeLinkClass::fsm ( void ) { @@ -1181,7 +1184,23 @@ void nodeLinkClass::fsm ( void ) int rc ; daemon_signal_hdlr (); this->uptime_handler (); - for ( struct node * node_ptr = head ; node_ptr != NULL ; node_ptr = node_ptr->next ) + + /* Controller HA Improvements Feature + * Handle the SM unhealthy of self case. + * If the active controller is unhealthy then stop doing + * work while it is in this state. 
Allow for self healing. NOTE(review): the getNode() result below is dereferenced without a NULL check ; confirm it cannot be NULL at this point */ + struct node * node_ptr = nodeLinkClass::getNode ( this->my_hostname ) ; + if ( node_ptr->mtce_flags & MTC_FLAG__SM_UNHEALTHY ) + { + elog_throttled (sm_unhealthy_log_throttle, SM_UNHEALTHY_LOG_THROTTLE, + "%s SM Unhealthy ; wait on health recovery or process shutdown", + node_ptr->hostname.c_str()); + return ; + } + if ( sm_unhealthy_log_throttle ) + sm_unhealthy_log_throttle = 0 ; + + for ( node_ptr = head ; node_ptr != NULL ; node_ptr = node_ptr->next ) { string hn = node_ptr->hostname ; rc = fsm ( node_ptr ) ; diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index 4d96f826..ed647787 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -1685,16 +1685,29 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) } else { - /* TODO: Consider taking this log out as writing to the database - * during a fast graceful recovery might no be the best idea */ + wlog ("%s Graceful Recovery (%d of %d)\n", + node_ptr->hostname.c_str(), + node_ptr->graceful_recovery_counter, + MTC_MAX_FAST_ENABLES ); + if ( node_ptr->graceful_recovery_counter > 1 ) mtcInvApi_update_task ( node_ptr, "Graceful Recovery Retry" ); else mtcInvApi_update_task ( node_ptr, "Graceful Recovery"); - + /* need to force a 2 second wait if we are in the + * graceful recovery retry so that we honor the 5 + * second grace period. NOTE(review): as written the timer is started on the first attempt too, not only on a retry */ + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_2 ); + recoveryStageChange ( node_ptr, MTC_RECOVERY__RETRY_WAIT ) ; + } + break ; + } + case MTC_RECOVERY__RETRY_WAIT: + { + if ( mtcTimer_expired ( node_ptr->mtcTimer )) + { recoveryStageChange ( node_ptr, MTC_RECOVERY__REQ_MTCALIVE ) ; } - break ; } case MTC_RECOVERY__REQ_MTCALIVE: