From 48978d804d6f22130d0bd8bd17f361441024bc6c Mon Sep 17 00:00:00 2001
From: Eric MacDonald
Date: Wed, 28 Apr 2021 09:39:19 -0400
Subject: [PATCH] Improved maintenance handling of spontaneous active controller reboot

Performing a forced reboot of the active controller sometimes results
in a second reboot of that controller. The second reboot occurred
because the uptime reported in the first mtcAlive message following
the reboot was greater than 10 minutes.

Maintenance has a long-standing Graceful Recovery threshold of
10 minutes. If a host loses heartbeat and enters Graceful Recovery,
and the uptime value extracted from the first mtcAlive message
following that host's recovery exceeds 10 minutes, then maintenance
interprets that the host did not reboot. If a host goes absent for
longer than this threshold then, for reasons not limited to security,
maintenance declares the host 'failed' and force re-enables it
through a reboot.

With the introduction of containers and the addition of new features
over the last few releases, boot times on some servers are approaching
the 10 minute threshold and, in this case, exceeded it.

The primary fix in this update is to increase this long-standing
threshold to 15 minutes to account for the evolution of the product.

While debugging this issue, a few other undesirable Graceful Recovery
behaviors were observed, and the following additional changes were
implemented.

- Remove the hbsAgent process restart from ha service management
  failover failure recovery handling. This change is in the ha git
  with a loose dependency placed on this update.
  Reason: https://review.opendev.org/c/starlingx/ha/+/788299

- Prevent the hbsAgent from sending heartbeat clear events to
  maintenance in response to a heartbeat stop command.
  Reason: Maintenance receiving these clear events while in Graceful
  Recovery causes it to pop out of Graceful Recovery only to re-enter
  as a retry, needlessly consuming one of the maximum of 5 retries.

- Prevent successful Graceful Recovery until all heartbeat monitored
  networks recover.
  Reason: If heartbeat on one network (say cluster) recovers but
  another (management) does not, the maximum Graceful Recovery
  retries can be reached quite quickly, causing maintenance to fail
  the host and force a full enable with reboot.

- Extend the graceful recovery handler's wait for the hbsClient ready
  event from 1 minute to the worker config timeout.
  Reason: To give the worker config time to complete before force
  starting the recovery handler's heartbeat soak.

- Add Graceful Recovery Wait state recovery over process restart.
  Reason: Avoid a double reboot of a Gracefully Recovering host over
  an SM service bounce.

- Add a requirement for a valid out-of-band mtce flags value before
  declaring a configuration error in the subfunction enable handler.
  Reason: Rebooting the active controller can sometimes result in a
  falsely reported configuration error due to the subfunction enable
  handler interpreting a zero value as a configuration error.

- Add uptime to all Graceful Recovery 'Connectivity Recovered' logs.
  Reason: To assist log analysis and issue debugging.
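
For reference, the threshold decision behind the primary fix can be
summarized with the hypothetical sketch below. It is illustrative
only: the function and parameter names are placeholders rather than
actual mtcAgent symbols, and the local MTC_MINS_15 define assumes the
mtce "15 minutes in seconds" constant this update starts using.

    // Hedged sketch of how the recovery handler interprets the uptime
    // reported in the first mtcAlive message after Graceful Recovery.
    // Assumes MTC_MINS_15 is 15 minutes expressed in seconds.
    #define MTC_MINS_15 (15*60)

    static bool host_reset_during_recovery ( int reported_uptime_secs )
    {
        /* uptime above the threshold  -> host did not reboot ;
         * uptime at or below it       -> host is treated as having
         *                                reset and rebooted. */
        return ( reported_uptime_secs <= MTC_MINS_15 );
    }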

Test Plan:

PASS: Verify handling of active controller reboot
      cases: AIO DC, AIO DX, Standard, and Storage
PASS: Verify Graceful Recovery Wait behavior
      cases: with and without timeout, with and without bmc
      cases: uptime > 15 mins and 10 < uptime < 15 mins
PASS: Verify Graceful Recovery continuation over mtcAgent restart
      cases: peer controller, compute, MNFA 4 computes
PASS: Verify AIO DX and DC active controller reboot with takeover by
      a standby controller that has been up for less than 15 minutes.

Regression:

PASS: Verify MNFA feature ; 4 computes in 8 node Storage system
PASS: Verify cluster network only heartbeat loss handling
      cases: worker and standby controller in all systems.
PASS: Verify Dead Office Recovery (DOR)
      cases: AIO DC, AIO DX, Standard, Storage
PASS: Verify system installations
      cases: AIO SX/DC/DX and 8 node Storage system
PASS: Verify heartbeat and graceful recovery of both 'standby
      controller' and worker nodes in AIO Plus.
PASS: Verify logging and no coredumps over all of testing
PASS: Verify no missing or stuck alarms over all of testing

Change-Id: I3d16d8627b7e838faf931a3c2039a6babf2a79ef
Closes-Bug: 1922584
Signed-off-by: Eric MacDonald
---
 mtce-common/src/common/nodeBase.h     |  1 +
 mtce/src/common/nodeClass.cpp         | 36 +++++++++++++++-
 mtce/src/common/nodeClass.h           |  1 +
 mtce/src/heartbeat/hbsAgent.cpp       |  2 +-
 mtce/src/maintenance/mtcCtrlMsg.cpp   | 62 +++++++++++++++++++++++++--
 mtce/src/maintenance/mtcNodeHdlrs.cpp | 61 +++++++++++++++++++++-----
 mtce/src/maintenance/mtcSubfHdlrs.cpp | 13 +++---
 7 files changed, 155 insertions(+), 21 deletions(-)

diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h
index 2d02bd97..b98c34d4 100755
--- a/mtce-common/src/common/nodeBase.h
+++ b/mtce-common/src/common/nodeBase.h
@@ -263,6 +263,7 @@ typedef enum
 #define MTC_TASK_ENABLE_WORK_FAIL "Enable Action Failed"
 #define MTC_TASK_ENABLE_WORK_TO "Enable Action Timeout"
 #define MTC_TASK_ENABLE_FAIL_HB "Enable Heartbeat Failure, re-enabling"
+#define MTC_TASK_RECOVERY_FAIL_HB "Graceful Recovery Heartbeat Failure, re-enabling"
 #define MTC_TASK_RECOVERY_FAIL "Graceful Recovery Failed, re-enabling"
 #define MTC_TASK_RECOVERY_WAIT "Graceful Recovery Wait"
 #define MTC_TASK_RECOVERED "Gracefully Recovered"
diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp
index 638d12fd..e2320430 100755
--- a/mtce/src/common/nodeClass.cpp
+++ b/mtce/src/common/nodeClass.cpp
@@ -4131,6 +4131,18 @@ int nodeLinkClass::get_uptime_refresh_ctr ( string & hostname )
     return (0);
 }
 
+
+int nodeLinkClass::get_mtce_flags ( string & hostname )
+{
+    nodeLinkClass::node* node_ptr ;
+    node_ptr = nodeLinkClass::getNode ( hostname );
+    if ( node_ptr != NULL )
+    {
+        return ( node_ptr->mtce_flags );
+    }
+    return (0);
+}
+
 void nodeLinkClass::set_mtce_flags ( string hostname, int flags, int iface )
 {
     nodeLinkClass::node* node_ptr = nodeLinkClass::getNode ( hostname );
@@ -4883,7 +4895,10 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa
     /* Otherwise this is a single host that has recovered
      * possibly as part of a mnfa group or simply a lone wolf */
-    else
+    else if (( node_ptr->hbs_minor[MGMNT_IFACE] == false ) &&
+             (( clstr_network_provisioned == false ) ||
+              (( clstr_network_provisioned == true ) &&
+               ( node_ptr->hbs_minor[CLSTR_IFACE] == false ))))
     {
         if ( node_ptr->mnfa_graceful_recovery == true )
         {
@@ -4891,6 +4906,8 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa
             mnfa_awol_list.remove(node_ptr->hostname);
         }
 
+        /* Don't recover until heartbeat is working over all
+         * monitored interfaces */
         mnfa_recover_host ( node_ptr );
 
         if ( mnfa_active == true )
@@ -5044,11 +5061,28 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
         }
         return ;
     }
+    else if ( node_ptr->recoveryStage == MTC_RECOVERY__HEARTBEAT_SOAK )
+    {
+        elog ("%s %s *** Heartbeat Loss *** (during recovery soak)\n",
+                  hostname.c_str(),
+                  get_iface_name_str(iface));
+        force_full_enable ( node_ptr );
+        return ;
+    }
 
     mnfa_add_host ( node_ptr , iface );
 
     if ( mnfa_active == false )
     {
+        /* if node is already in graceful recovery just ignore the event */
+        if ( node_ptr->graceful_recovery_counter != 0 )
+        {
+            dlog ("%s %s loss event ; already in graceful recovery try %d",
+                      hostname.c_str(),
+                      get_iface_name_str(iface),
+                      node_ptr->graceful_recovery_counter );
+            return ;
+        }
         elog ("%s %s *** Heartbeat Loss ***\n", hostname.c_str(), get_iface_name_str(iface));
 
         if ( iface == CLSTR_IFACE )
         {
diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h
index 0ea25e1a..5df2ce56 100755
--- a/mtce/src/common/nodeClass.h
+++ b/mtce/src/common/nodeClass.h
@@ -1775,6 +1775,7 @@ public:
     #define MTC_FLAG__I_AM_LOCKED (0x00000008) */
 
     void set_mtce_flags ( string hostname, int flags, int iface );
+    int get_mtce_flags ( string & hostname );
 
     /** Updates the node's health code
      *  Codes are found in nodeBase.h
diff --git a/mtce/src/heartbeat/hbsAgent.cpp b/mtce/src/heartbeat/hbsAgent.cpp
index c891fdb4..ecd6941a 100644
--- a/mtce/src/heartbeat/hbsAgent.cpp
+++ b/mtce/src/heartbeat/hbsAgent.cpp
@@ -2127,7 +2127,7 @@ void daemon_service_run ( void )
             {
                 if ( hostname != hbsInv.my_hostname )
                 {
-                    hbsInv.mon_host ( hostname, false, true );
+                    hbsInv.mon_host ( hostname, false, false );
                     hbs_cluster_del ( hostname );
                     ilog ("%s heartbeat service disabled by stop command",
                               hostname.c_str());
diff --git a/mtce/src/maintenance/mtcCtrlMsg.cpp b/mtce/src/maintenance/mtcCtrlMsg.cpp
index 45232d43..a77131f3 100755
--- a/mtce/src/maintenance/mtcCtrlMsg.cpp
+++ b/mtce/src/maintenance/mtcCtrlMsg.cpp
@@ -1262,13 +1262,69 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
         {
             elog ("%s Failed to send inventory to heartbeat service\n", hostname.c_str());
         }
-        /* Send the start event to the heartbeat service for all enabled hosts */
-        if (( obj_ptr->get_adminState ( hostname ) == MTC_ADMIN_STATE__UNLOCKED ) &&
+        /* Consider sending the 'start' request to the heartbeat service
+         * for all enabled hosts except for this active controller. */
+        if (( obj_ptr->my_hostname != hostname ) &&
+            ( obj_ptr->get_adminState ( hostname ) == MTC_ADMIN_STATE__UNLOCKED ) &&
             ( obj_ptr->get_operState ( hostname ) == MTC_OPER_STATE__ENABLED ) &&
            ((obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__AVAILABLE ) ||
             (obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__DEGRADED )))
         {
-            send_hbs_command ( hostname, MTC_CMD_START_HOST, controller );
+            /* However, bypass sending heartbeat 'start' for nodes that
+             * are not ready to heartbeat; enabling, configuring, testing.
+             * Such cases are if a host is:
+             *
+             * 1. running the add_handler or
+             * 2. running the enable_handler or
+             * 3. running the enable_subf_handler or
+             * 4. not configured or
+             * 5. not tested (goenabled not complete)
+             *
+             */
+            mtc_nodeAdminAction_enum current_action =
+                obj_ptr->get_adminAction (hostname);
+            if (( current_action != MTC_ADMIN_ACTION__ADD ) &&
+                ( current_action != MTC_ADMIN_ACTION__ENABLE ) &&
+                ( current_action != MTC_ADMIN_ACTION__ENABLE_SUBF ))
+            {
+                int mtce_flags = obj_ptr->get_mtce_flags(hostname);
+                if (( mtce_flags & MTC_FLAG__I_AM_CONFIGURED ) &&
+                    ( mtce_flags & MTC_FLAG__I_AM_HEALTHY ) &&
+                    ( mtce_flags & MTC_FLAG__MAIN_GOENABLED ))
+                {
+                    if (( obj_ptr->system_type != SYSTEM_TYPE__NORMAL ) &&
+                        ( obj_ptr->get_nodetype ( hostname ) & CONTROLLER_TYPE ))
+                    {
+                        /* If it's an AIO then its worker subfunction
+                         * needs to have been configured and tested. */
+                        if (( mtce_flags & MTC_FLAG__SUBF_CONFIGURED ) &&
+                            ( mtce_flags & MTC_FLAG__SUBF_GOENABLED ))
+                        {
+                            ilog("%s heartbeat start (AIO controller)",
+                                 hostname.c_str());
+                            send_hbs_command ( hostname, MTC_CMD_START_HOST, controller );
+                        }
+                        else
+                        {
+                            wlog ("%s not heartbeat ready (subf) (oob:%x)",
+                                  hostname.c_str(),
+                                  mtce_flags);
+                        }
+                    }
+                    else
+                    {
+                        ilog("%s heartbeat start (from ready event)",
+                             hostname.c_str());
+                        send_hbs_command ( hostname, MTC_CMD_START_HOST, controller );
+                    }
+                }
+                else
+                {
+                    wlog ("%s not heartbeat ready (main) (oob:%x)",
+                          hostname.c_str(),
+                          mtce_flags);
+                }
+            }
         }
     }
     ilog ("%s %s inventory push ... done",
diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp
index e1924af8..49ac2684 100755
--- a/mtce/src/maintenance/mtcNodeHdlrs.cpp
+++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp
@@ -1637,9 +1637,10 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
     node_ptr->http_retries_cur = 0 ;
     node_ptr->unknown_health_reported = false ;
 
-    plog ("%s %sGraceful Recovery (uptime was %d)\n",
+    plog ("%s %sGraceful Recovery (%d) (uptime was %d)\n",
               node_ptr->hostname.c_str(),
              node_ptr->mnfa_graceful_recovery ? "MNFA " : "",
+              node_ptr->graceful_recovery_counter,
              node_ptr->uptime );
 
     /* Cancel any outstanding timers */
@@ -1773,10 +1774,11 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
 
         else if ( node_ptr->mnfa_graceful_recovery == true )
         {
-            if ( node_ptr->uptime > MTC_MINS_10 )
+            if ( node_ptr->uptime > MTC_MINS_15 )
             {
                 /* did not reboot case */
-                wlog ("%s Connectivity Recovered ; host did not reset\n", node_ptr->hostname.c_str());
+                wlog ("%s Connectivity Recovered ; host did not reset (uptime:%d)\n",
+                          node_ptr->hostname.c_str(), node_ptr->uptime);
                 wlog ("%s ... continuing with MNFA graceful recovery\n", node_ptr->hostname.c_str());
                 wlog ("%s ... with no affect to host services\n", node_ptr->hostname.c_str());
@@ -1789,7 +1791,8 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
             else
             {
                 /* did reboot case */
-                wlog ("%s Connectivity Recovered ; host has reset\n", node_ptr->hostname.c_str());
+                wlog ("%s Connectivity Recovered ; host has reset (uptime:%d)\n",
+                          node_ptr->hostname.c_str(), node_ptr->uptime);
                 ilog ("%s ... continuing with MNFA graceful recovery\n", node_ptr->hostname.c_str());
                 ilog ("%s ... without additional reboot %s\n",
                           node_ptr->hostname.c_str(),
                           node_ptr->bm_ip.empty() ? "or reset" : "" );
@@ -1807,12 +1810,13 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
                 break ;
             }
         }
-        else if (( node_ptr->uptime_save ) && ( node_ptr->uptime >= node_ptr->uptime_save ))
+        else if ( node_ptr->uptime > MTC_MINS_15 )
         {
             /* did not reboot case */
-            wlog ("%s Connectivity Recovered ; host did not reset%s\n",
+            wlog ("%s Connectivity Recovered ; host did not reset%s (uptime:%d)",
                       node_ptr->hostname.c_str(),
-                      node_ptr->was_dor_recovery_mode ? " (DOR)" : "" );
+                      node_ptr->was_dor_recovery_mode ? " (DOR)" : "",
+                      node_ptr->uptime);
             wlog ("%s ... continuing with graceful recovery\n", node_ptr->hostname.c_str());
             wlog ("%s ... with no affect to host services\n", node_ptr->hostname.c_str());
@@ -1906,7 +1910,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
         {
             int timeout = 0 ;
 
-            /* Set the FSM task state to booting */
+            /* Set the FSM task state to 'Graceful Recovery Wait' */
             node_ptr->uptime = 0 ;
             mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_WAIT );
@@ -2443,10 +2447,10 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
             }
             else /* success path */
             {
-                /* allow the fsm to wait for up to 1 minute for the
-                 * hbsClient's ready event before starting heartbeat
+                /* allow the fsm to wait for up to 'worker config timeout'
+                 * for the hbsClient's ready event before starting heartbeat
                  * test. */
-                mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_1 );
+                mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_WORKER_CONFIG_TIMEOUT );
                 recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_START );
             }
             break ;
@@ -2503,6 +2507,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
         {
             if ( node_ptr->mtcTimer.ring == true )
             {
+                ilog ("%s heartbeating", node_ptr->hostname.c_str());
                 /* if heartbeat is not working then we will
                  * never get here and enable the host */
                 recoveryStageChange ( node_ptr, MTC_RECOVERY__STATE_CHANGE );
@@ -6261,6 +6266,40 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
                       node_ptr->hostname.c_str(), node_ptr->uptime );
             break ;
         }
+        /* Handle catching and recovering/restoring hosts that might
+         * have been in the Graceful Recovery Wait state.
+         *
+         * Prevents an extra reboot for hosts that might be in
+         * Graceful Recovery over a maintenance process restart.
+         */
+        else if (( NOT_THIS_HOST ) &&
+                 ( !node_ptr->task.compare(MTC_TASK_RECOVERY_WAIT)))
+        {
+            ilog ("%s is in %s ; restoring state",
+                      node_ptr->hostname.c_str(),
+                      MTC_TASK_RECOVERY_WAIT);
+
+            /* Complete necessary add operations before switching
+             * to Recovery */
+            LOAD_NODETYPE_TIMERS ;
+            workQueue_purge ( node_ptr );
+            if (( hostUtil_is_valid_bm_type ( node_ptr->bm_type )) &&
+                ( hostUtil_is_valid_ip_addr ( node_ptr->bm_ip )) &&
+                ( hostUtil_is_valid_username ( node_ptr->bm_un )))
+            {
+                set_bm_prov ( node_ptr, true ) ;
+            }
+            mtcTimer_reset ( node_ptr->mtcTimer );
+            adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
+            node_ptr->addStage = MTC_ADD__START;
+
+            /* Switch into recovery_handler's Graceful Recovery Wait
+             * state with the Graceful Recovery Wait timeout */
+            adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER );
+            mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler,
+                             node_ptr->mtcalive_timeout );
+            recoveryStageChange ( node_ptr, MTC_RECOVERY__MTCALIVE_WAIT );
+            break ;
+        }
         else
         {
             if ( is_controller(node_ptr) )
diff --git a/mtce/src/maintenance/mtcSubfHdlrs.cpp b/mtce/src/maintenance/mtcSubfHdlrs.cpp
index 6f9646da..5c994f4a 100644
--- a/mtce/src/maintenance/mtcSubfHdlrs.cpp
+++ b/mtce/src/maintenance/mtcSubfHdlrs.cpp
@@ -110,14 +110,16 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
             if ( node_ptr->mtce_flags & MTC_FLAG__SUBF_CONFIGURED )
             {
                 mtcTimer_reset (node_ptr->mtcTimer);
-                plog ("%s Subf Configured OK\n", name.c_str());
+                plog ("%s Subf Configured OK (oob:%x)\n",
+                          name.c_str(), node_ptr->mtce_flags);
                 enableStageChange ( node_ptr, MTC_ENABLE__GOENABLED_TIMER );
                 alarm_config_clear ( node_ptr );
                 break ;
             }
 
-            if ((( !node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED )) ||
-                (( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY )))
+            if (( node_ptr->mtce_flags ) &&
+                (( !node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED ) ||
+                 ( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY )))
             {
                 mtcTimer_reset (node_ptr->mtcTimer);
@@ -140,9 +142,10 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
             /* timeout handling */
             else if ( node_ptr->mtcTimer.ring == true )
             {
-                elog ("%s configuration timeout (%d secs)\n",
+                elog ("%s configuration timeout (%d secs) (oob:%x)\n",
                           name.c_str(),
-                          MTC_WORKER_CONFIG_TIMEOUT );
+                          MTC_WORKER_CONFIG_TIMEOUT,
+                          node_ptr->mtce_flags);
 
                 alarm_config_failure ( node_ptr );