From 675f49d5566e8e92e2ed713941bedff904227287 Mon Sep 17 00:00:00 2001
From: Eric MacDonald
Date: Tue, 15 Oct 2019 15:16:22 -0400
Subject: [PATCH] Add mtcAgent support for sm_node_unhealthy condition

When heartbeat fails over both networks, mtcAgent provides a 5 second
grace period for heartbeat to recover before failing the node.

However, when heartbeat fails over only one of the networks
(management or cluster), mtcAgent does not honour that 5 second grace
period; this is a bug.

For peer controller heartbeat failure handling, SM needs that 5 second
grace period to handle a swact before mtcAgent declares the peer
controller failed, resets the node and updates the database.

This update forces a 2 second wait between each fast enable and fixes
the fast enable threshold count to the intended 3 retries. This
ensures that at least 5 seconds, actually 6 in the single network
heartbeat loss case, pass before the node is declared failed.

In addition, a special condition is added to detect and stop work if
the active controller is sm_node_unhealthy. mtcAgent must not make any
database updates while in this failure mode. This gives SM the time to
handle the failure according to the system's controller high
availability handling feature.

Test Plan:

PASS: Verify mtcAgent behavior on set and clear of the SM node
      unhealthy state.
PASS: Verify SM has at least 5 seconds to shut down mtcAgent when
      heartbeat to the peer controller fails over one or both networks.
PASS: Test the real failure scenario with a link pull.
PASS: Verify logging in the presence of the real failure condition.

Change-Id: I8f8d6688040fe899aff6fc40aadda37894c2d5e9
Closes-Bug: 1847657
Signed-off-by: Eric MacDonald
---
 mtce-common/src/common/nodeBase.cpp   |  1 +
 mtce-common/src/common/nodeBase.h     |  3 ++-
 mtce/src/maintenance/mtcNodeCtrl.cpp  | 21 ++++++++++++++++++++-
 mtce/src/maintenance/mtcNodeHdlrs.cpp | 21 +++++++++++++++++----
 4 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/mtce-common/src/common/nodeBase.cpp b/mtce-common/src/common/nodeBase.cpp
index c333656b..7ba6197d 100755
--- a/mtce-common/src/common/nodeBase.cpp
+++ b/mtce-common/src/common/nodeBase.cpp
@@ -376,6 +376,7 @@ void mtc_stages_init ( void )
     enableStages_str [MTC_ENABLE__STAGES ] = "unknown" ;
 
     recoveryStages_str[MTC_RECOVERY__START ] = "Handler-Start";
+    recoveryStages_str[MTC_RECOVERY__RETRY_WAIT ] = "Req-Retry-Wait";
     recoveryStages_str[MTC_RECOVERY__REQ_MTCALIVE ] = "Req-MtcAlive";
     recoveryStages_str[MTC_RECOVERY__REQ_MTCALIVE_WAIT ] = "Req-MtcAlive-Wait";
     recoveryStages_str[MTC_RECOVERY__RESET_RECV_WAIT ] = "Reset-Recv-Wait";
diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h
index da71cd08..75a9e26b 100755
--- a/mtce-common/src/common/nodeBase.h
+++ b/mtce-common/src/common/nodeBase.h
@@ -936,10 +936,11 @@ typedef enum
 
 string get_delStages_str ( mtc_delStages_enum stage );
 
-#define MTC_MAX_FAST_ENABLES (2)
+#define MTC_MAX_FAST_ENABLES (3)
 typedef enum
 {
     MTC_RECOVERY__START = 0,
+    MTC_RECOVERY__RETRY_WAIT,
     MTC_RECOVERY__REQ_MTCALIVE,
     MTC_RECOVERY__REQ_MTCALIVE_WAIT,
     MTC_RECOVERY__RESET_RECV_WAIT,
diff --git a/mtce/src/maintenance/mtcNodeCtrl.cpp b/mtce/src/maintenance/mtcNodeCtrl.cpp
index e8996423..867bd7f8 100644
--- a/mtce/src/maintenance/mtcNodeCtrl.cpp
+++ b/mtce/src/maintenance/mtcNodeCtrl.cpp
@@ -1173,6 +1173,9 @@ int _self_provision ( void )
     return(rc);
 }
 
+static int sm_unhealthy_log_throttle = 0 ;
+#define SM_UNHEALTHY_LOG_THROTTLE (100)
+
 /* Main FSM Loop */
 void nodeLinkClass::fsm ( void )
 {
@@ -1181,7 +1184,23 @@ void nodeLinkClass::fsm ( void )
     int rc ;
     daemon_signal_hdlr ();
     this->uptime_handler ();
-    for ( struct node * node_ptr = head ; node_ptr != NULL ; node_ptr = node_ptr->next )
+
+    /* Controller HA Improvements Feature
+     * Handle the SM unhealthy of self case.
+     * If the active controller is unhealthy then stop doing
+     * work while its in this state. Allow for self healing */
+    struct node * node_ptr = nodeLinkClass::getNode ( this->my_hostname ) ;
+    if ( node_ptr->mtce_flags & MTC_FLAG__SM_UNHEALTHY )
+    {
+        elog_throttled (sm_unhealthy_log_throttle, SM_UNHEALTHY_LOG_THROTTLE,
+                        "%s SM Unhealthy ; wait on health recovery or process shutdown",
+                        node_ptr->hostname.c_str());
+        return ;
+    }
+    if ( sm_unhealthy_log_throttle )
+        sm_unhealthy_log_throttle = 0 ;
+
+    for ( node_ptr = head ; node_ptr != NULL ; node_ptr = node_ptr->next )
     {
         string hn = node_ptr->hostname ;
         rc = fsm ( node_ptr ) ;
diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp
index 4d96f826..ed647787 100755
--- a/mtce/src/maintenance/mtcNodeHdlrs.cpp
+++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp
@@ -1685,16 +1685,29 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
             }
             else
             {
-                /* TODO: Consider taking this log out as writing to the database
-                 * during a fast graceful recovery might no be the best idea */
+                wlog ("%s Graceful Recovery (%d of %d)\n",
+                          node_ptr->hostname.c_str(),
+                          node_ptr->graceful_recovery_counter,
+                          MTC_MAX_FAST_ENABLES );
+
                 if ( node_ptr->graceful_recovery_counter > 1 )
                     mtcInvApi_update_task ( node_ptr, "Graceful Recovery Retry" );
                 else
                     mtcInvApi_update_task ( node_ptr, "Graceful Recovery");
-
+                /* need to force a 2 second wait if we are in the
+                 * graceful recovery retry so that we honor the 5
+                 * second grace period */
+                mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_2 );
+                recoveryStageChange ( node_ptr, MTC_RECOVERY__RETRY_WAIT ) ;
+            }
+            break ;
+        }
+        case MTC_RECOVERY__RETRY_WAIT:
+        {
+            if ( mtcTimer_expired ( node_ptr->mtcTimer ))
+            {
                 recoveryStageChange ( node_ptr, MTC_RECOVERY__REQ_MTCALIVE ) ;
             }
-            break ;
         }
        case MTC_RECOVERY__REQ_MTCALIVE:
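
Note (illustration only, not part of the patch): the timing argument in
the commit message can be sketched as a small standalone program. The
two constants mirror MTC_MAX_FAST_ENABLES (3) and the 2 second
MTC_SECS_2 wait added in recovery_handler; the rest of the program
(main, the loop, the prints) is hypothetical and not mtce code.

#include <chrono>
#include <iostream>
#include <thread>

int main ()
{
    /* Mirrors MTC_MAX_FAST_ENABLES and the MTC_SECS_2 retry wait from
     * the patch ; everything else here is illustrative only. */
    const int  max_fast_enables = 3 ;
    const auto retry_wait       = std::chrono::seconds(2) ;

    const auto start = std::chrono::steady_clock::now() ;
    for ( int attempt = 1 ; attempt <= max_fast_enables ; attempt++ )
    {
        std::cout << "Graceful Recovery (" << attempt
                  << " of " << max_fast_enables << ")\n" ;

        /* stand-in for the MTC_RECOVERY__RETRY_WAIT timer stage */
        std::this_thread::sleep_for ( retry_wait ) ;
    }
    const auto elapsed = std::chrono::duration_cast<std::chrono::seconds>
                             ( std::chrono::steady_clock::now() - start ) ;

    /* 3 attempts x a 2 second wait each gives at least 6 seconds before
     * the node could be declared failed, covering SM's 5 second window. */
    std::cout << "elapsed before a failure could be declared: "
              << elapsed.count() << "s" << std::endl ;
    return 0 ;
}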