Merge "Add mtcAgent support for sm_node_unhealthy condition"

2019-10-16 19:15:03 +00:00 · 2019-10-16 19:15:03 +00:00 · 069daf1e22
parent 7d7a125d74 675f49d556
commit 069daf1e22
4 changed files with 40 additions and 6 deletions
--- a/mtce-common/src/common/nodeBase.cpp
+++ b/mtce-common/src/common/nodeBase.cpp
@@ -376,6 +376,7 @@ void mtc_stages_init ( void )
   enableStages_str  [MTC_ENABLE__STAGES               ] = "unknown" ;

   recoveryStages_str[MTC_RECOVERY__START              ] = "Handler-Start";
+   recoveryStages_str[MTC_RECOVERY__RETRY_WAIT         ] = "Req-Retry-Wait";
   recoveryStages_str[MTC_RECOVERY__REQ_MTCALIVE       ] = "Req-MtcAlive";
   recoveryStages_str[MTC_RECOVERY__REQ_MTCALIVE_WAIT  ] = "Req-MtcAlive-Wait";
   recoveryStages_str[MTC_RECOVERY__RESET_RECV_WAIT    ] = "Reset-Recv-Wait";
--- a/mtce-common/src/common/nodeBase.h
+++ b/mtce-common/src/common/nodeBase.h
@@ -936,10 +936,11 @@ typedef enum
 string get_delStages_str ( mtc_delStages_enum stage );


-#define MTC_MAX_FAST_ENABLES (2)
+#define MTC_MAX_FAST_ENABLES (3)
 typedef enum
 {
    MTC_RECOVERY__START =  0,
+    MTC_RECOVERY__RETRY_WAIT,
    MTC_RECOVERY__REQ_MTCALIVE,
    MTC_RECOVERY__REQ_MTCALIVE_WAIT,
    MTC_RECOVERY__RESET_RECV_WAIT,
--- a/mtce/src/maintenance/mtcNodeCtrl.cpp
+++ b/mtce/src/maintenance/mtcNodeCtrl.cpp
@@ -1173,6 +1173,9 @@ int _self_provision ( void )
    return(rc);
 }

+static int sm_unhealthy_log_throttle = 0 ;
+#define SM_UNHEALTHY_LOG_THROTTLE (100)
+
 /* Main FSM Loop */
 void nodeLinkClass::fsm ( void )
 {
@@ -1181,7 +1184,23 @@ void nodeLinkClass::fsm ( void )
        int rc ;
        daemon_signal_hdlr ();
        this->uptime_handler ();
-        for ( struct node * node_ptr = head ; node_ptr != NULL ; node_ptr = node_ptr->next )
+
+        /* Controller HA Improvements Feature
+         * Handle the SM-unhealthy case for self.
+         * If the active controller is unhealthy then stop doing
+         * work while it is in this state. Allow for self healing. */
+        struct node * node_ptr = nodeLinkClass::getNode ( this->my_hostname ) ;
+        if ( node_ptr->mtce_flags & MTC_FLAG__SM_UNHEALTHY )
+        {
+            elog_throttled (sm_unhealthy_log_throttle, SM_UNHEALTHY_LOG_THROTTLE,
+                 "%s SM Unhealthy ; wait on health recovery or process shutdown",
+                 node_ptr->hostname.c_str());
+             return ;
+        }
+        if ( sm_unhealthy_log_throttle )
+            sm_unhealthy_log_throttle = 0 ;
+
+        for ( node_ptr = head ; node_ptr != NULL ; node_ptr = node_ptr->next )
        {
            string hn = node_ptr->hostname ;
            rc = fsm ( node_ptr ) ;
--- a/mtce/src/maintenance/mtcNodeHdlrs.cpp
+++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp
@@ -1685,16 +1685,29 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
            }
            else
            {
-                /* TODO: Consider taking this log out as writing to the database
-                 *       during a fast graceful recovery might no be the best idea */
+                wlog ("%s Graceful Recovery (%d of %d)\n",
+                          node_ptr->hostname.c_str(),
+                          node_ptr->graceful_recovery_counter,
+                          MTC_MAX_FAST_ENABLES );
+
                if ( node_ptr->graceful_recovery_counter > 1 )
                    mtcInvApi_update_task ( node_ptr, "Graceful Recovery Retry" );
                else
                    mtcInvApi_update_task ( node_ptr, "Graceful Recovery");
-
+                /* Force a 2 second wait before moving on so that a
+                 * graceful recovery retry honors the 5 second
+                 * grace period. */
+                mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_2 );
+                recoveryStageChange ( node_ptr, MTC_RECOVERY__RETRY_WAIT ) ;
+            }
+            break ;
+        }
+        case MTC_RECOVERY__RETRY_WAIT:
+        {
+            if ( mtcTimer_expired ( node_ptr->mtcTimer ))
+            {
                recoveryStageChange ( node_ptr, MTC_RECOVERY__REQ_MTCALIVE ) ;
            }
-
            break ;
        }
        case MTC_RECOVERY__REQ_MTCALIVE: