Add mtcAgent support for sm_node_unhealthy condition

When heartbeat over both networks fails, mtcAgent
provides a 5 second grace period for heartbeat to
recover before failing the node.

However, when heartbeat fails over only one of the
networks (management or cluster), mtcAgent does not
honour that 5 second grace period; this is a bug.

For peer controller heartbeat failure handling, SM
needs that 5 second grace period to perform a swact
before mtcAgent declares the peer controller failed,
resets the node and updates the database.

This update forces a 2 second wait between each fast
enable and fixes the fast enable threshold count to
the intended 3 retries. This ensures that at least
5 seconds, actually 6 in the single network heartbeat
loss case, pass before the node is declared failed.
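
For illustration only (this arithmetic is not part of the
change itself, and MTC_SECS_2 is assumed to equal 2 seconds):

    /* Minimal sketch of the worst case timing. Each of the 3 fast
     * enable attempts is now separated by a 2 second retry wait, so
     * a single network heartbeat loss takes at least 3 * 2 = 6
     * seconds before the node can be declared failed. The variable
     * name below is illustrative only. */
    #define MTC_MAX_FAST_ENABLES (3)
    #define MTC_SECS_2           (2)
    static const int minimum_failure_declare_secs =
        MTC_MAX_FAST_ENABLES * MTC_SECS_2 ; /* 6 ; covers SM's 5 second need */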

In addition, a special condition is added to detect
when the active controller is sm_node_unhealthy and
to stop work while it remains in that state. mtcAgent
must not make any database updates while in this
failure mode. This gives SM the time to handle the
failure according to the system's controller high
availability handling feature.
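
A minimal standalone sketch of that guard follows; the helper
name is hypothetical and only illustrates the flag test that the
fsm() hunk below performs inline on the node's mtce_flags:

    /* Hypothetical helper for illustration only. It assumes the
     * MTC_FLAG__SM_UNHEALTHY bit definition used in the hunk below
     * is in scope. While the active controller carries this flag,
     * mtcAgent skips all node work, including database updates,
     * until the flag clears or the process is shut down. */
    static bool self_is_sm_unhealthy ( unsigned int mtce_flags )
    {
        return (( mtce_flags & MTC_FLAG__SM_UNHEALTHY ) != 0 ) ;
    }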

Test Plan:

PASS: Verify mtcAgent behavior on set and clear of
      SM node unhealthy state.
PASS: Verify SM has at least 5 seconds to shut down
      mtcAgent when heartbeat to peer controller fails
      for one or both networks.
PASS: Test the real failure scenario using a link pull.
PASS: Verify logging in the presence of a real failure condition.

Change-Id: I8f8d6688040fe899aff6fc40aadda37894c2d5e9
Closes-Bug: 1847657
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>

@@ -376,6 +376,7 @@ void mtc_stages_init ( void )
enableStages_str [MTC_ENABLE__STAGES ] = "unknown" ;
recoveryStages_str[MTC_RECOVERY__START ] = "Handler-Start";
recoveryStages_str[MTC_RECOVERY__RETRY_WAIT ] = "Req-Retry-Wait";
recoveryStages_str[MTC_RECOVERY__REQ_MTCALIVE ] = "Req-MtcAlive";
recoveryStages_str[MTC_RECOVERY__REQ_MTCALIVE_WAIT ] = "Req-MtcAlive-Wait";
recoveryStages_str[MTC_RECOVERY__RESET_RECV_WAIT ] = "Reset-Recv-Wait";

@@ -936,10 +936,11 @@ typedef enum
string get_delStages_str ( mtc_delStages_enum stage );
#define MTC_MAX_FAST_ENABLES (2)
#define MTC_MAX_FAST_ENABLES (3)
typedef enum
{
MTC_RECOVERY__START = 0,
MTC_RECOVERY__RETRY_WAIT,
MTC_RECOVERY__REQ_MTCALIVE,
MTC_RECOVERY__REQ_MTCALIVE_WAIT,
MTC_RECOVERY__RESET_RECV_WAIT,

@@ -1173,6 +1173,9 @@ int _self_provision ( void )
return(rc);
}
static int sm_unhealthy_log_throttle = 0 ;
#define SM_UNHEALTHY_LOG_THROTTLE (100)
/* Main FSM Loop */
void nodeLinkClass::fsm ( void )
{
@@ -1181,7 +1184,23 @@
int rc ;
daemon_signal_hdlr ();
this->uptime_handler ();
for ( struct node * node_ptr = head ; node_ptr != NULL ; node_ptr = node_ptr->next )
/* Controller HA Improvements Feature
* Handle the SM unhealthy of self case.
* If the active controller is unhealthy then stop doing
* work while it's in this state. Allow for self healing */
struct node * node_ptr = nodeLinkClass::getNode ( this->my_hostname ) ;
if ( node_ptr->mtce_flags & MTC_FLAG__SM_UNHEALTHY )
{
elog_throttled (sm_unhealthy_log_throttle, SM_UNHEALTHY_LOG_THROTTLE,
"%s SM Unhealthy ; wait on health recovery or process shutdown",
node_ptr->hostname.c_str());
return ;
}
if ( sm_unhealthy_log_throttle )
sm_unhealthy_log_throttle = 0 ;
for ( node_ptr = head ; node_ptr != NULL ; node_ptr = node_ptr->next )
{
string hn = node_ptr->hostname ;
rc = fsm ( node_ptr ) ;
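
The throttled log in the hunk above follows the usual mtce logging
pattern; a rough standalone approximation (not the actual
elog_throttled macro) is:

    /* Rough approximation for illustration only ; the real
     * elog_throttled macro is part of the mtce common logging code.
     * The intent: emit the first occurrence, then only every Nth
     * repeat, so a persistent SM-unhealthy condition cannot flood
     * the log. */
    #include <cstdio>

    #define SM_UNHEALTHY_LOG_THROTTLE (100)
    static int sm_unhealthy_log_throttle = 0 ;

    static void log_sm_unhealthy ( const char * hostname )
    {
        if (( sm_unhealthy_log_throttle++ % SM_UNHEALTHY_LOG_THROTTLE ) == 0 )
            fprintf ( stderr, "%s SM Unhealthy ; wait on health recovery "
                              "or process shutdown\n", hostname ) ;
    }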


@@ -1685,16 +1685,29 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
}
else
{
/* TODO: Consider taking this log out as writing to the database
* during a fast graceful recovery might not be the best idea */
wlog ("%s Graceful Recovery (%d of %d)\n",
node_ptr->hostname.c_str(),
node_ptr->graceful_recovery_counter,
MTC_MAX_FAST_ENABLES );
if ( node_ptr->graceful_recovery_counter > 1 )
mtcInvApi_update_task ( node_ptr, "Graceful Recovery Retry" );
else
mtcInvApi_update_task ( node_ptr, "Graceful Recovery");
/* need to force a 2 second wait if we are in the
* graceful recovery retry so that we honor the 5
* second grace period */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_2 );
recoveryStageChange ( node_ptr, MTC_RECOVERY__RETRY_WAIT ) ;
}
break ;
}
case MTC_RECOVERY__RETRY_WAIT:
{
if ( mtcTimer_expired ( node_ptr->mtcTimer ))
{
recoveryStageChange ( node_ptr, MTC_RECOVERY__REQ_MTCALIVE ) ;
}
break ;
}
case MTC_RECOVERY__REQ_MTCALIVE: