From 2210c71216416ac3cfd46d2dc82d0114ee2ef7e1 Mon Sep 17 00:00:00 2001
From: Eric MacDonald <eric.macdonald@windriver.com>
Date: Sun, 6 Sep 2020 07:58:29 -0400
Subject: [PATCH] Fix Mtce Heartbeat period recovery on MNFA Exit

When Multi-Node Failure Avoidance (MNFA) occurs,
maintenance commands the Heartbeat Agent to slow
down by a factor of 4.

The rate recovery following a MNFA is not occurring.

Update https://review.opendev.org/#/c/701057 made
a condition check change that introduced this issue
by requiring mnfa_timeout to be non-zero before an
attempt is made to recover heartbeat period following
MNFA recovery.

This update switches that condition check to use the more
specific mnfa_backoff state tracker and, because MNFA is a
global maintenance mode feature rather than a node specific
feature, moves the recovery check code from the node level
fsm into a mnfa_recovery_handler called in the main select
loop.

Test Plan:

PASS: Verify MNFA handling/recovery with mnfa_timeout!=0
             that expires.
PASS: Verify MNFA handling/recovery when mnfa_timeout!=0
             but before the timeout expires.
PASS: Verify MNFA handling/recovery when mnfa_timeout=0
PASS: Verify MNFA backoff rate recovery over mtcAgent
             process restart.
PASS: Verify MNFA backoff rate is sent to hbsAgent if
             hbsAgent restarts while MNFA is active.

Change-Id: I8da5a000ab503692c7cfa620233ed8aa772c50f8
Closes-Bug: #1893212
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
---
 mtce/src/common/nodeClass.h          |  5 ++-
 mtce/src/maintenance/mtcCtrlMsg.cpp  |  6 +++
 mtce/src/maintenance/mtcNodeCtrl.cpp |  3 ++
 mtce/src/maintenance/mtcNodeFsm.cpp  |  9 +---
 mtce/src/maintenance/mtcNodeMnfa.cpp | 63 +++++++++++++++++++++++++---
 5 files changed, 71 insertions(+), 15 deletions(-)
diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h
index 425c4c0c..42ca79d6 100755
--- a/mtce/src/common/nodeClass.h
+++ b/mtce/src/common/nodeClass.h
@@ -1382,7 +1382,9 @@ public:
     /* the main fsm entrypoint to service all hosts */
     void fsm ( void ) ;
 
-   /** This controller's hostname set'er */
+    void mnfa_recovery_handler ( string & hostname );
+
+    /** This controller's hostname set'er */
     void   set_my_hostname ( string hostname );
 
     /** This controller's hostname get'er */
@@ -1506,6 +1508,7 @@ public:
      *  node failure avoidance threshold and until there are no more
      *  in service trouble hosts */
     bool mnfa_active ;
+    bool mnfa_backoff = false ;
     void mnfa_cancel( void );
 
     std::list<string>           mnfa_awol_list ;
diff --git a/mtce/src/maintenance/mtcCtrlMsg.cpp b/mtce/src/maintenance/mtcCtrlMsg.cpp
index c6a9c69d..6a820ed1 100755
--- a/mtce/src/maintenance/mtcCtrlMsg.cpp
+++ b/mtce/src/maintenance/mtcCtrlMsg.cpp
@@ -1226,6 +1226,12 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
         ilog ("%s %s inventory push ... done",
                   controller.c_str(),
                   MTC_SERVICE_HBSAGENT_NAME);
+
+        /* Ensure that the hbsAgent heartbeat period is correct */
+        if ( obj_ptr->mnfa_backoff == true )
+            send_hbs_command ( obj_ptr->my_hostname, MTC_BACKOFF_HBS, CONTROLLER );
+        else
+            send_hbs_command ( obj_ptr->my_hostname, MTC_RECOVER_HBS, CONTROLLER );
     }
     else
     {
diff --git a/mtce/src/maintenance/mtcNodeCtrl.cpp b/mtce/src/maintenance/mtcNodeCtrl.cpp
index ea77d7f2..6732ca88 100644
--- a/mtce/src/maintenance/mtcNodeCtrl.cpp
+++ b/mtce/src/maintenance/mtcNodeCtrl.cpp
@@ -1569,6 +1569,9 @@ void daemon_service_run ( void )
             continue ;
         }
 
+        /* Handle recovery from MNFA */
+        mtcInv.mnfa_recovery_handler ( mtcInv.my_hostname );
+
         mtcInv.fsm ( );
 
         /* Initialize the master fd_set */
diff --git a/mtce/src/maintenance/mtcNodeFsm.cpp b/mtce/src/maintenance/mtcNodeFsm.cpp
index 38aee7cd..af5e9a26 100755
--- a/mtce/src/maintenance/mtcNodeFsm.cpp
+++ b/mtce/src/maintenance/mtcNodeFsm.cpp
@@ -41,13 +41,6 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
         return FAIL ;
     }
 
-    /* if the multi-Node-Failure Avoidance timer rang then run its recovery handler */
-    if (( this->mnfa_timeout != 0 ) && ( mtcTimer_mnfa.ring == true ))
-    {
-        mtcTimer_mnfa.ring = false ;
-        mnfa_exit ( true );
-    }
-
     /* handle clear task request */
     if ( node_ptr->clear_task == true )
     {
@@ -57,7 +50,7 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
 
     /* Service the libEvent work queue */
     workQueue_process ( node_ptr ) ;
-    
+
     /* Service the maintenance command queue if there are commands waiting */
     if ( node_ptr->mtcCmd_work_fifo.size())
     {
diff --git a/mtce/src/maintenance/mtcNodeMnfa.cpp b/mtce/src/maintenance/mtcNodeMnfa.cpp
index edad5440..af2493b1 100644
--- a/mtce/src/maintenance/mtcNodeMnfa.cpp
+++ b/mtce/src/maintenance/mtcNodeMnfa.cpp
@@ -202,7 +202,7 @@ void nodeLinkClass::mnfa_enter ( void )
      wlog ("MNFA ENTER --> Entering Multi-Node Failure Avoidance\n");
      mtcAlarm_log ( active_controller_hostname , MTC_LOG_ID__EVENT_MNFA_ENTER );
      mnfa_active = true ;
-
+     mnfa_backoff = true ;
      send_hbs_command ( my_hostname, MTC_BACKOFF_HBS );
 
      /* Handle the case where we are already trying to recover from a
@@ -237,6 +237,10 @@ void nodeLinkClass::mnfa_enter ( void )
          wlog ("MNFA Auto-Recovery in %d seconds\n",       this->mnfa_timeout);
          mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, this->mnfa_timeout);
      }
+     else
+     {
+         this->mtcTimer_mnfa.ring = false ;
+     }
      log_mnfa_pool ( mnfa_awol_list );
 }
 
@@ -342,11 +346,6 @@ void nodeLinkClass::mnfa_exit ( bool force )
         /* Start the timer that will eventually send the MTC_RECOVER_HBS command */
         mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, MTC_MNFA_RECOVERY_TIMER );
     }
-    else
-    {
-        send_hbs_command ( my_hostname, MTC_RECOVER_HBS );
-    }
-
     mnfa_host_count[MGMNT_IFACE] = 0 ;
     mnfa_host_count[CLSTR_IFACE] = 0 ;
     mnfa_awol_list.clear();
@@ -392,3 +391,55 @@ void nodeLinkClass::mnfa_cancel ( void )
     }
     mnfa_awol_list.clear();
 }
+
+/**************************************************************************
+ *
+ * Name       : mnfa_recovery_handler
+ *
+ * Purpose    : Service the MNFA timer ring at base level.
+ *
+ * Description: Called from the main loop ; acts only if mtcTimer_mnfa rang.
+ *              - mnfa_active  : clear the ring and force mnfa_exit.
+ *              - mnfa_backoff : send MTC_RECOVER_HBS ; on PASS clear
+ *                mnfa_backoff, otherwise restart the timer (MTC_SECS_30).
+ *
+ * Parameters : hostname - prefix for the log messages only ; the command
+ *              itself is sent using my_hostname.
+ *
+ * Assumptions: The recover command needs to be sent at base level ; the
+ *              mnfa timer handler should not be sending messages.
+ **************************************************************************/
+
+void nodeLinkClass::mnfa_recovery_handler ( string & hostname )
+{
+    /* nothing to do unless the Multi-Node Failure Avoidance timer
+     * rang ; the ring is serviced here, from the main loop */
+    if ( this->mtcTimer_mnfa.ring == true )
+    {
+        /* rang while MNFA is still active (mnfa_timeout) ; force the exit */
+        if ( this->mnfa_active == true )
+        {
+            mtcTimer_mnfa.ring = false ;
+            mnfa_exit ( true );
+        }
+        /* rang due to the recovery timer set in mnfa_exit or the retry below */
+        else if ( this->mnfa_backoff == true )
+        {
+            ilog("%s heartbeat backoff recovery", hostname.c_str())
+            if ( send_hbs_command ( my_hostname, MTC_RECOVER_HBS ) == PASS )
+            {
+                this->mnfa_backoff = false ;
+            }
+            else
+            {
+                int retry_timeout = MTC_SECS_30 ;
+
+                /* on a send failure, to avoid log flooding, restart the
+                 * timer so the next attempt happens in 30 seconds */
+                mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, retry_timeout );
+                ilog("%s heartbeat backoff recovery command send failed, retrying in %d secs",
+                         hostname.c_str(), retry_timeout);
+            }
+        }
+    }
+}