From 675f49d5566e8e92e2ed713941bedff904227287 Mon Sep 17 00:00:00 2001
From: Eric MacDonald
Date: Tue, 15 Oct 2019 15:16:22 -0400
Subject: [PATCH] Add mtcAgent support for sm_node_unhealthy condition

When heartbeat fails over both networks, mtcAgent provides a 5 second
grace period for heartbeat to recover before failing the node.

However, when heartbeat fails over only one of the networks
(management or cluster), mtcAgent does not honour that 5 second grace
period; this is a bug.

For peer controller heartbeat failure handling, SM needs that 5 second
grace period to handle a swact before mtcAgent declares the peer
controller failed, resets the node and updates the database.

This update forces a 2 second wait between each fast enable and fixes
the fast enable threshold count to the intended 3 retries. This
ensures that at least 5 seconds, actually 6 in the single network
heartbeat loss case, pass before the node is declared failed.

In addition, a special condition is added to detect and stop work if
the active controller is sm_node_unhealthy. mtcAgent must not make any
database updates while in this failure mode. This gives SM the time to
handle the failure according to the system's controller high
availability handling feature.

Test Plan:

PASS: Verify mtcAgent behavior on set and clear of the SM node
      unhealthy state.
PASS: Verify SM has at least 5 seconds to shut down mtcAgent when
      heartbeat to the peer controller fails over one or both networks.
PASS: Test the real failure scenario with a link pull.
PASS: Verify logging in the presence of the real failure condition.

Change-Id: I8f8d6688040fe899aff6fc40aadda37894c2d5e9
Closes-Bug: 1847657
Signed-off-by: Eric MacDonald
---
 mtce-common/src/common/nodeBase.cpp   |  1 +
 mtce-common/src/common/nodeBase.h     |  3 ++-
 mtce/src/maintenance/mtcNodeCtrl.cpp  | 21 ++++++++++++++++++++-
 mtce/src/maintenance/mtcNodeHdlrs.cpp | 21 +++++++++++++++++----
 4 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/mtce-common/src/common/nodeBase.cpp b/mtce-common/src/common/nodeBase.cpp
index c333656b..7ba6197d 100755
--- a/mtce-common/src/common/nodeBase.cpp
+++ b/mtce-common/src/common/nodeBase.cpp
@@ -376,6 +376,7 @@ void mtc_stages_init ( void )
     enableStages_str [MTC_ENABLE__STAGES ] = "unknown" ;
 
     recoveryStages_str[MTC_RECOVERY__START ] = "Handler-Start";
+    recoveryStages_str[MTC_RECOVERY__RETRY_WAIT ] = "Req-Retry-Wait";
     recoveryStages_str[MTC_RECOVERY__REQ_MTCALIVE ] = "Req-MtcAlive";
     recoveryStages_str[MTC_RECOVERY__REQ_MTCALIVE_WAIT ] = "Req-MtcAlive-Wait";
     recoveryStages_str[MTC_RECOVERY__RESET_RECV_WAIT ] = "Reset-Recv-Wait";
diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h
index da71cd08..75a9e26b 100755
--- a/mtce-common/src/common/nodeBase.h
+++ b/mtce-common/src/common/nodeBase.h
@@ -936,10 +936,11 @@ typedef enum
 
 string get_delStages_str ( mtc_delStages_enum stage );
 
-#define MTC_MAX_FAST_ENABLES (2)
+#define MTC_MAX_FAST_ENABLES (3)
 typedef enum
 {
     MTC_RECOVERY__START = 0,
+    MTC_RECOVERY__RETRY_WAIT,
     MTC_RECOVERY__REQ_MTCALIVE,
     MTC_RECOVERY__REQ_MTCALIVE_WAIT,
     MTC_RECOVERY__RESET_RECV_WAIT,
diff --git a/mtce/src/maintenance/mtcNodeCtrl.cpp b/mtce/src/maintenance/mtcNodeCtrl.cpp
index e8996423..867bd7f8 100644
--- a/mtce/src/maintenance/mtcNodeCtrl.cpp
+++ b/mtce/src/maintenance/mtcNodeCtrl.cpp
@@ -1173,6 +1173,9 @@ int _self_provision ( void )
     return(rc);
 }
 
+static int sm_unhealthy_log_throttle = 0 ;
+#define SM_UNHEALTHY_LOG_THROTTLE (100)
+
 /* Main FSM Loop */
 void nodeLinkClass::fsm ( void )
 {
@@ -1181,7 +1184,23 @@ void nodeLinkClass::fsm ( void )
     int rc ;
     daemon_signal_hdlr ();
     this->uptime_handler ();
-    for ( struct node * node_ptr = head ; node_ptr != NULL ; node_ptr = node_ptr->next )
+
+    /* Controller HA Improvements Feature
+     * Handle the SM unhealthy of self case.
+     * If the active controller is unhealthy then stop doing
+     * work while its in this state. Allow for self healing */
+    struct node * node_ptr = nodeLinkClass::getNode ( this->my_hostname ) ;
+    if ( node_ptr->mtce_flags & MTC_FLAG__SM_UNHEALTHY )
+    {
+        elog_throttled (sm_unhealthy_log_throttle, SM_UNHEALTHY_LOG_THROTTLE,
+                        "%s SM Unhealthy ; wait on health recovery or process shutdown",
+                        node_ptr->hostname.c_str());
+        return ;
+    }
+    if ( sm_unhealthy_log_throttle )
+        sm_unhealthy_log_throttle = 0 ;
+
+    for ( node_ptr = head ; node_ptr != NULL ; node_ptr = node_ptr->next )
     {
         string hn = node_ptr->hostname ;
         rc = fsm ( node_ptr ) ;
diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp
index 4d96f826..ed647787 100755
--- a/mtce/src/maintenance/mtcNodeHdlrs.cpp
+++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp
@@ -1685,16 +1685,29 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
             }
             else
             {
-                /* TODO: Consider taking this log out as writing to the database
-                 * during a fast graceful recovery might no be the best idea */
+                wlog ("%s Graceful Recovery (%d of %d)\n",
+                          node_ptr->hostname.c_str(),
+                          node_ptr->graceful_recovery_counter,
+                          MTC_MAX_FAST_ENABLES );
+
                 if ( node_ptr->graceful_recovery_counter > 1 )
                     mtcInvApi_update_task ( node_ptr, "Graceful Recovery Retry" );
                 else
                     mtcInvApi_update_task ( node_ptr, "Graceful Recovery");
-
+                /* need to force a 2 second wait if we are in the
+                 * graceful recovery retry so that we honor the 5
+                 * second grace period */
+                mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_2 );
+                recoveryStageChange ( node_ptr, MTC_RECOVERY__RETRY_WAIT ) ;
+            }
+            break ;
+        }
+        case MTC_RECOVERY__RETRY_WAIT:
+        {
+            if ( mtcTimer_expired ( node_ptr->mtcTimer ))
+            {
                 recoveryStageChange ( node_ptr, MTC_RECOVERY__REQ_MTCALIVE ) ;
             }
-            break ;
         }
        case MTC_RECOVERY__REQ_MTCALIVE:
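
Note (illustration only, not part of the patch): the timing argument in
the commit message can be sketched as a small standalone program. The
two constants mirror MTC_MAX_FAST_ENABLES (3) and the 2 second
MTC_SECS_2 wait added in recovery_handler; the rest of the program
(main, the loop, the prints) is hypothetical and not mtce code.

#include <chrono>
#include <iostream>
#include <thread>

int main ()
{
    /* Mirrors MTC_MAX_FAST_ENABLES and the MTC_SECS_2 retry wait from
     * the patch ; everything else here is illustrative only. */
    const int  max_fast_enables = 3 ;
    const auto retry_wait       = std::chrono::seconds(2) ;

    const auto start = std::chrono::steady_clock::now() ;
    for ( int attempt = 1 ; attempt <= max_fast_enables ; attempt++ )
    {
        std::cout << "Graceful Recovery (" << attempt
                  << " of " << max_fast_enables << ")\n" ;

        /* stand-in for the MTC_RECOVERY__RETRY_WAIT timer stage */
        std::this_thread::sleep_for ( retry_wait ) ;
    }
    const auto elapsed = std::chrono::duration_cast<std::chrono::seconds>
                             ( std::chrono::steady_clock::now() - start ) ;

    /* 3 attempts x a 2 second wait each gives at least 6 seconds before
     * the node could be declared failed, covering SM's 5 second window. */
    std::cout << "elapsed before a failure could be declared: "
              << elapsed.count() << "s" << std::endl ;
    return 0 ;
}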