From 48978d804d6f22130d0bd8bd17f361441024bc6c Mon Sep 17 00:00:00 2001
From: Eric MacDonald
Date: Wed, 28 Apr 2021 09:39:19 -0400
Subject: [PATCH] Improved maintenance handling of spontaneous active controller reboot

Performing a forced reboot of the active controller sometimes results
in a second reboot of that controller. The second reboot occurred
because the uptime reported in the first mtcAlive message following
the reboot was greater than 10 minutes.

Maintenance has a long-standing Graceful Recovery threshold of
10 minutes. If a host loses heartbeat and enters Graceful Recovery,
and the uptime value extracted from the first mtcAlive message
following that host's recovery exceeds 10 minutes, then maintenance
interprets that the host did not reboot. If a host goes absent for
longer than this threshold then, for reasons not limited to security,
maintenance declares the host 'failed' and force re-enables it
through a reboot.

With the introduction of containers and the addition of new features
over the last few releases, boot times on some servers are approaching
the 10 minute threshold and, in this case, exceeded it.

The primary fix in this update is to increase this long-standing
threshold to 15 minutes to account for the evolution of the product.

While debugging this issue, a few other undesirable Graceful Recovery
behaviors were observed, and the following additional changes were
implemented.

- Remove the hbsAgent process restart from ha service management
  failover failure recovery handling. This change is in the ha git
  with a loose dependency placed on this update.
  Reason: https://review.opendev.org/c/starlingx/ha/+/788299

- Prevent the hbsAgent from sending heartbeat clear events to
  maintenance in response to a heartbeat stop command.
  Reason: Maintenance receiving these clear events while in Graceful
  Recovery causes it to pop out of Graceful Recovery only to re-enter
  as a retry, needlessly consuming one of the maximum of 5 retries.

- Prevent successful Graceful Recovery until all heartbeat monitored
  networks recover.
  Reason: If heartbeat on one network (say cluster) recovers but
  another (management) does not, the maximum Graceful Recovery
  retries can be reached quite quickly, causing maintenance to fail
  the host and force a full enable with reboot.

- Extend the graceful recovery handler's wait for the hbsClient ready
  event from 1 minute to the worker config timeout.
  Reason: To give the worker config time to complete before force
  starting the recovery handler's heartbeat soak.

- Add Graceful Recovery Wait state recovery over process restart.
  Reason: Avoid a double reboot of a Gracefully Recovering host over
  an SM service bounce.

- Add a requirement for a valid out-of-band mtce flags value before
  declaring a configuration error in the subfunction enable handler.
  Reason: Rebooting the active controller can sometimes result in a
  falsely reported configuration error due to the subfunction enable
  handler interpreting a zero value as a configuration error.

- Add uptime to all Graceful Recovery 'Connectivity Recovered' logs.
  Reason: To assist log analysis and issue debugging.
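
For reference, the threshold decision behind the primary fix can be
summarized with the hypothetical sketch below. It is illustrative
only: the function and parameter names are placeholders rather than
actual mtcAgent symbols, and the local MTC_MINS_15 define assumes the
mtce "15 minutes in seconds" constant this update starts using.

    // Hedged sketch of how the recovery handler interprets the uptime
    // reported in the first mtcAlive message after Graceful Recovery.
    // Assumes MTC_MINS_15 is 15 minutes expressed in seconds.
    #define MTC_MINS_15 (15*60)

    static bool host_reset_during_recovery ( int reported_uptime_secs )
    {
        /* uptime above the threshold  -> host did not reboot ;
         * uptime at or below it       -> host is treated as having
         *                                reset and rebooted. */
        return ( reported_uptime_secs <= MTC_MINS_15 );
    }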

Test Plan:

PASS: Verify handling of active controller reboot
      cases: AIO DC, AIO DX, Standard, and Storage
PASS: Verify Graceful Recovery Wait behavior
      cases: with and without timeout, with and without bmc
      cases: uptime > 15 mins and 10 < uptime < 15 mins
PASS: Verify Graceful Recovery continuation over mtcAgent restart
      cases: peer controller, compute, MNFA 4 computes
PASS: Verify AIO DX and DC active controller reboot with takeover by
      a standby controller that has been up for less than 15 minutes.

Regression:

PASS: Verify MNFA feature ; 4 computes in 8 node Storage system
PASS: Verify cluster network only heartbeat loss handling
      cases: worker and standby controller in all systems.
PASS: Verify Dead Office Recovery (DOR)
      cases: AIO DC, AIO DX, Standard, Storage
PASS: Verify system installations
      cases: AIO SX/DC/DX and 8 node Storage system
PASS: Verify heartbeat and graceful recovery of both 'standby
      controller' and worker nodes in AIO Plus.
PASS: Verify logging and no coredumps over all of testing
PASS: Verify no missing or stuck alarms over all of testing

Change-Id: I3d16d8627b7e838faf931a3c2039a6babf2a79ef
Closes-Bug: 1922584
Signed-off-by: Eric MacDonald
---
 mtce-common/src/common/nodeBase.h     |  1 +
 mtce/src/common/nodeClass.cpp         | 36 +++++++++++++++-
 mtce/src/common/nodeClass.h           |  1 +
 mtce/src/heartbeat/hbsAgent.cpp       |  2 +-
 mtce/src/maintenance/mtcCtrlMsg.cpp   | 62 +++++++++++++++++++++++++--
 mtce/src/maintenance/mtcNodeHdlrs.cpp | 61 +++++++++++++++++++++-----
 mtce/src/maintenance/mtcSubfHdlrs.cpp | 13 +++---
 7 files changed, 155 insertions(+), 21 deletions(-)

diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h
index 2d02bd97..b98c34d4 100755
--- a/mtce-common/src/common/nodeBase.h
+++ b/mtce-common/src/common/nodeBase.h
@@ -263,6 +263,7 @@ typedef enum
 #define MTC_TASK_ENABLE_WORK_FAIL "Enable Action Failed"
 #define MTC_TASK_ENABLE_WORK_TO "Enable Action Timeout"
 #define MTC_TASK_ENABLE_FAIL_HB "Enable Heartbeat Failure, re-enabling"
+#define MTC_TASK_RECOVERY_FAIL_HB "Graceful Recovery Heartbeat Failure, re-enabling"
 #define MTC_TASK_RECOVERY_FAIL "Graceful Recovery Failed, re-enabling"
 #define MTC_TASK_RECOVERY_WAIT "Graceful Recovery Wait"
 #define MTC_TASK_RECOVERED "Gracefully Recovered"
diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp
index 638d12fd..e2320430 100755
--- a/mtce/src/common/nodeClass.cpp
+++ b/mtce/src/common/nodeClass.cpp
@@ -4131,6 +4131,18 @@ int nodeLinkClass::get_uptime_refresh_ctr ( string & hostname )
     return (0);
 }
 
+
+int nodeLinkClass::get_mtce_flags ( string & hostname )
+{
+    nodeLinkClass::node* node_ptr ;
+    node_ptr = nodeLinkClass::getNode ( hostname );
+    if ( node_ptr != NULL )
+    {
+        return ( node_ptr->mtce_flags );
+    }
+    return (0);
+}
+
 void nodeLinkClass::set_mtce_flags ( string hostname, int flags, int iface )
 {
     nodeLinkClass::node* node_ptr = nodeLinkClass::getNode ( hostname );
@@ -4883,7 +4895,10 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa
     /* Otherwise this is a single host that has recovered
      * possibly as part of a mnfa group or simply a lone wolf */
-    else
+    else if (( node_ptr->hbs_minor[MGMNT_IFACE] == false ) &&
+             (( clstr_network_provisioned == false ) ||
+              (( clstr_network_provisioned == true ) &&
+               ( node_ptr->hbs_minor[CLSTR_IFACE] == false ))))
     {
         if ( node_ptr->mnfa_graceful_recovery == true )
         {
@@ -4891,6 +4906,8 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa
             mnfa_awol_list.remove(node_ptr->hostname);
         }
 
+        /* Don't recover until heartbeat is working over all
+         * monitored interfaces */
         mnfa_recover_host ( node_ptr );
 
         if ( mnfa_active == true )
@@ -5044,11 +5061,28 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
         }
         return ;
     }
+    else if ( node_ptr->recoveryStage == MTC_RECOVERY__HEARTBEAT_SOAK )
+    {
+        elog ("%s %s *** Heartbeat Loss *** (during recovery soak)\n",
+                  hostname.c_str(),
+                  get_iface_name_str(iface));
+        force_full_enable ( node_ptr );
+        return ;
+    }
 
     mnfa_add_host ( node_ptr , iface );
 
     if ( mnfa_active == false )
     {
+        /* if node is already in graceful recovery just ignore the event */
+        if ( node_ptr->graceful_recovery_counter != 0 )
+        {
+            dlog ("%s %s loss event ; already in graceful recovery try %d",
+                      hostname.c_str(),
+                      get_iface_name_str(iface),
+                      node_ptr->graceful_recovery_counter );
+            return ;
+        }
         elog ("%s %s *** Heartbeat Loss ***\n", hostname.c_str(), get_iface_name_str(iface));
 
         if ( iface == CLSTR_IFACE )
         {
diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h
index 0ea25e1a..5df2ce56 100755
--- a/mtce/src/common/nodeClass.h
+++ b/mtce/src/common/nodeClass.h
@@ -1775,6 +1775,7 @@ public:
     #define MTC_FLAG__I_AM_LOCKED (0x00000008) */
 
     void set_mtce_flags ( string hostname, int flags, int iface );
+    int get_mtce_flags ( string & hostname );
 
     /** Updates the node's health code
      *  Codes are found in nodeBase.h
diff --git a/mtce/src/heartbeat/hbsAgent.cpp b/mtce/src/heartbeat/hbsAgent.cpp
index c891fdb4..ecd6941a 100644
--- a/mtce/src/heartbeat/hbsAgent.cpp
+++ b/mtce/src/heartbeat/hbsAgent.cpp
@@ -2127,7 +2127,7 @@ void daemon_service_run ( void )
             {
                 if ( hostname != hbsInv.my_hostname )
                 {
-                    hbsInv.mon_host ( hostname, false, true );
+                    hbsInv.mon_host ( hostname, false, false );
                     hbs_cluster_del ( hostname );
                     ilog ("%s heartbeat service disabled by stop command",
                               hostname.c_str());
diff --git a/mtce/src/maintenance/mtcCtrlMsg.cpp b/mtce/src/maintenance/mtcCtrlMsg.cpp
index 45232d43..a77131f3 100755
--- a/mtce/src/maintenance/mtcCtrlMsg.cpp
+++ b/mtce/src/maintenance/mtcCtrlMsg.cpp
@@ -1262,13 +1262,69 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
         {
             elog ("%s Failed to send inventory to heartbeat service\n", hostname.c_str());
         }
-        /* Send the start event to the heartbeat service for all enabled hosts */
-        if (( obj_ptr->get_adminState ( hostname ) == MTC_ADMIN_STATE__UNLOCKED ) &&
+        /* Consider sending the 'start' request to the heartbeat service
+         * for all enabled hosts except for this active controller. */
+        if (( obj_ptr->my_hostname != hostname ) &&
+            ( obj_ptr->get_adminState ( hostname ) == MTC_ADMIN_STATE__UNLOCKED ) &&
             ( obj_ptr->get_operState ( hostname ) == MTC_OPER_STATE__ENABLED ) &&
            ((obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__AVAILABLE ) ||
             (obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__DEGRADED )))
         {
-            send_hbs_command ( hostname, MTC_CMD_START_HOST, controller );
+            /* However, bypass sending heartbeat 'start' for nodes that
+             * are not ready to heartbeat; enabling, configuring, testing.
+             * Such cases are if a host is:
+             *
+             * 1. running the add_handler or
+             * 2. running the enable_handler or
+             * 3. running the enable_subf_handler or
+             * 4. not configured or
+             * 5. not tested (goenabled not complete)
+             *
+             */
+            mtc_nodeAdminAction_enum current_action =
+                obj_ptr->get_adminAction (hostname);
+            if (( current_action != MTC_ADMIN_ACTION__ADD ) &&
+                ( current_action != MTC_ADMIN_ACTION__ENABLE ) &&
+                ( current_action != MTC_ADMIN_ACTION__ENABLE_SUBF ))
+            {
+                int mtce_flags = obj_ptr->get_mtce_flags(hostname);
+                if (( mtce_flags & MTC_FLAG__I_AM_CONFIGURED ) &&
+                    ( mtce_flags & MTC_FLAG__I_AM_HEALTHY ) &&
+                    ( mtce_flags & MTC_FLAG__MAIN_GOENABLED ))
+                {
+                    if (( obj_ptr->system_type != SYSTEM_TYPE__NORMAL ) &&
+                        ( obj_ptr->get_nodetype ( hostname ) & CONTROLLER_TYPE ))
+                    {
+                        /* If it's an AIO then its worker subfunction
+                         * needs to have been configured and tested. */
+                        if (( mtce_flags & MTC_FLAG__SUBF_CONFIGURED ) &&
+                            ( mtce_flags & MTC_FLAG__SUBF_GOENABLED ))
+                        {
+                            ilog("%s heartbeat start (AIO controller)",
+                                 hostname.c_str());
+                            send_hbs_command ( hostname, MTC_CMD_START_HOST, controller );
+                        }
+                        else
+                        {
+                            wlog ("%s not heartbeat ready (subf) (oob:%x)",
+                                  hostname.c_str(),
+                                  mtce_flags);
+                        }
+                    }
+                    else
+                    {
+                        ilog("%s heartbeat start (from ready event)",
+                             hostname.c_str());
+                        send_hbs_command ( hostname, MTC_CMD_START_HOST, controller );
+                    }
+                }
+                else
+                {
+                    wlog ("%s not heartbeat ready (main) (oob:%x)",
+                          hostname.c_str(),
+                          mtce_flags);
+                }
+            }
         }
     }
     ilog ("%s %s inventory push ... done",
diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp
index e1924af8..49ac2684 100755
--- a/mtce/src/maintenance/mtcNodeHdlrs.cpp
+++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp
@@ -1637,9 +1637,10 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
     node_ptr->http_retries_cur = 0 ;
     node_ptr->unknown_health_reported = false ;
 
-    plog ("%s %sGraceful Recovery (uptime was %d)\n",
+    plog ("%s %sGraceful Recovery (%d) (uptime was %d)\n",
               node_ptr->hostname.c_str(),
              node_ptr->mnfa_graceful_recovery ? "MNFA " : "",
+              node_ptr->graceful_recovery_counter,
              node_ptr->uptime );
 
     /* Cancel any outstanding timers */
@@ -1773,10 +1774,11 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
 
         else if ( node_ptr->mnfa_graceful_recovery == true )
         {
-            if ( node_ptr->uptime > MTC_MINS_10 )
+            if ( node_ptr->uptime > MTC_MINS_15 )
             {
                 /* did not reboot case */
-                wlog ("%s Connectivity Recovered ; host did not reset\n", node_ptr->hostname.c_str());
+                wlog ("%s Connectivity Recovered ; host did not reset (uptime:%d)\n",
+                          node_ptr->hostname.c_str(), node_ptr->uptime);
                 wlog ("%s ... continuing with MNFA graceful recovery\n", node_ptr->hostname.c_str());
                 wlog ("%s ... with no affect to host services\n", node_ptr->hostname.c_str());
@@ -1789,7 +1791,8 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
             else
             {
                 /* did reboot case */
-                wlog ("%s Connectivity Recovered ; host has reset\n", node_ptr->hostname.c_str());
+                wlog ("%s Connectivity Recovered ; host has reset (uptime:%d)\n",
+                          node_ptr->hostname.c_str(), node_ptr->uptime);
                 ilog ("%s ... continuing with MNFA graceful recovery\n", node_ptr->hostname.c_str());
                 ilog ("%s ... without additional reboot %s\n",
                           node_ptr->hostname.c_str(),
                           node_ptr->bm_ip.empty() ? "or reset" : "" );
@@ -1807,12 +1810,13 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
                 break ;
             }
         }
-        else if (( node_ptr->uptime_save ) && ( node_ptr->uptime >= node_ptr->uptime_save ))
+        else if ( node_ptr->uptime > MTC_MINS_15 )
         {
             /* did not reboot case */
-            wlog ("%s Connectivity Recovered ; host did not reset%s\n",
+            wlog ("%s Connectivity Recovered ; host did not reset%s (uptime:%d)",
                       node_ptr->hostname.c_str(),
-                      node_ptr->was_dor_recovery_mode ? " (DOR)" : "" );
+                      node_ptr->was_dor_recovery_mode ? " (DOR)" : "",
+                      node_ptr->uptime);
             wlog ("%s ... continuing with graceful recovery\n", node_ptr->hostname.c_str());
             wlog ("%s ... with no affect to host services\n", node_ptr->hostname.c_str());
@@ -1906,7 +1910,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
         {
             int timeout = 0 ;
 
-            /* Set the FSM task state to booting */
+            /* Set the FSM task state to 'Graceful Recovery Wait' */
             node_ptr->uptime = 0 ;
             mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_WAIT );
@@ -2443,10 +2447,10 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
             }
             else /* success path */
             {
-                /* allow the fsm to wait for up to 1 minute for the
-                 * hbsClient's ready event before starting heartbeat
+                /* allow the fsm to wait for up to 'worker config timeout'
+                 * for the hbsClient's ready event before starting heartbeat
                  * test. */
-                mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_1 );
+                mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_WORKER_CONFIG_TIMEOUT );
                 recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_START );
             }
             break ;
@@ -2503,6 +2507,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
         {
             if ( node_ptr->mtcTimer.ring == true )
             {
+                ilog ("%s heartbeating", node_ptr->hostname.c_str());
                 /* if heartbeat is not working then we will
                  * never get here and enable the host */
                 recoveryStageChange ( node_ptr, MTC_RECOVERY__STATE_CHANGE );
@@ -6261,6 +6266,40 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
                       node_ptr->hostname.c_str(), node_ptr->uptime );
             break ;
         }
+        /* Handle catching and recovering/restoring hosts that might
+         * have been in the Graceful Recovery Wait state.
+         *
+         * Prevents an extra reboot for hosts that might be in
+         * Graceful Recovery over a maintenance process restart.
+         */
+        else if (( NOT_THIS_HOST ) &&
+                 ( !node_ptr->task.compare(MTC_TASK_RECOVERY_WAIT)))
+        {
+            ilog ("%s is in %s ; restoring state",
+                      node_ptr->hostname.c_str(),
+                      MTC_TASK_RECOVERY_WAIT);
+
+            /* Complete necessary add operations before switching
+             * to Recovery */
+            LOAD_NODETYPE_TIMERS ;
+            workQueue_purge ( node_ptr );
+            if (( hostUtil_is_valid_bm_type ( node_ptr->bm_type )) &&
+                ( hostUtil_is_valid_ip_addr ( node_ptr->bm_ip )) &&
+                ( hostUtil_is_valid_username ( node_ptr->bm_un )))
+            {
+                set_bm_prov ( node_ptr, true ) ;
+            }
+            mtcTimer_reset ( node_ptr->mtcTimer );
+            adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
+            node_ptr->addStage = MTC_ADD__START;
+
+            /* Switch into recovery_handler's Graceful Recovery Wait
+             * state with the Graceful Recovery Wait timeout */
+            adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER );
+            mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler,
+                             node_ptr->mtcalive_timeout );
+            recoveryStageChange ( node_ptr, MTC_RECOVERY__MTCALIVE_WAIT );
+            break ;
+        }
         else
         {
             if ( is_controller(node_ptr) )
diff --git a/mtce/src/maintenance/mtcSubfHdlrs.cpp b/mtce/src/maintenance/mtcSubfHdlrs.cpp
index 6f9646da..5c994f4a 100644
--- a/mtce/src/maintenance/mtcSubfHdlrs.cpp
+++ b/mtce/src/maintenance/mtcSubfHdlrs.cpp
@@ -110,14 +110,16 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
             if ( node_ptr->mtce_flags & MTC_FLAG__SUBF_CONFIGURED )
             {
                 mtcTimer_reset (node_ptr->mtcTimer);
-                plog ("%s Subf Configured OK\n", name.c_str());
+                plog ("%s Subf Configured OK (oob:%x)\n",
+                          name.c_str(), node_ptr->mtce_flags);
                 enableStageChange ( node_ptr, MTC_ENABLE__GOENABLED_TIMER );
                 alarm_config_clear ( node_ptr );
                 break ;
             }
 
-            if ((( !node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED )) ||
-                (( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY )))
+            if (( node_ptr->mtce_flags ) &&
+                (( !node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED ) ||
+                 ( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY )))
             {
                 mtcTimer_reset (node_ptr->mtcTimer);
@@ -140,9 +142,10 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
             /* timeout handling */
             else if ( node_ptr->mtcTimer.ring == true )
             {
-                elog ("%s configuration timeout (%d secs)\n",
+                elog ("%s configuration timeout (%d secs) (oob:%x)\n",
                           name.c_str(),
-                          MTC_WORKER_CONFIG_TIMEOUT );
+                          MTC_WORKER_CONFIG_TIMEOUT,
+                          node_ptr->mtce_flags);
 
                 alarm_config_failure ( node_ptr );