From d9982a3b7e32549d0a671cbd42e7c8fe18b783b4 Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Fri, 2 Feb 2024 16:44:01 +0000 Subject: [PATCH] Mtce: Create non-volatile backup of node locked flag file The existing /var/run/.node_locked flag file is volatile, meaning it is lost over a host reboot, which has Dead Office Recovery (DOR) implications. Service Management (SM) sometimes selects and activates services on a locked controller following a DOR. This update is part one of a two-part update that solves both of the above problems. Part two is a change to SM in the ha git. This update can be merged without part two. This update maintains the existing volatile node locked file because it is looked at by other system services. So to minimize the change and therefore patchback impact, a new non-volatile 'backup' of the existing node locked flag file is created. This update incorporates modifications to the mtcAgent and mtcClient, introducing a new backup file and ensuring their synchronized management to guarantee their simultaneous presence or absence. Note: A design choice was made to manage both files directly rather than symlink one to the other, which would have required adding support to manage symlinks in the code. This approach was chosen for its simplicity and reliability in directly managing both files. At some point in the future the volatile file could be deprecated, contingent upon identifying and updating all services that directly reference it. This update also removes some dead code that was adjacent to my update. Test Plan: This test plan covers the maintenance management of both files to ensure they always align and the expected behavior exists. PASS: Verify AIO DX Install. PASS: Verify Storage System Install. PASS: Verify Swact back and forth. PASS: Verify mtcClient and mtcAgent logging. PASS: Verify node lock/unlock soak. Non-volatile (Nv) node locked management test cases: PASS: Verify Nv node locked file is present when a node is locked. Confirmed on all node types. 
PASS: Verify any system node install comes up locked with both node locked flag files present. PASS: Verify mtcClient logs when a node is locked and unlocked. PASS: Verify Nv node locked file present/absent state mirrors the already existing /var/run/.node_locked flag file. PASS: Verify node locked file is present on controller-0 during ansible run following initial install and removed as part of the self-unlock. PASS: Verify the Nv node locked file is removed over the unlock along with the administrative state change prior to the unlock reboot. PASS: Verify both node locked files are always present or absent together. PASS: Verify node locked file management while the management interface is down. File is still managed over cluster network. PASS: Verify node locked file management while the cluster interface is down. File is still managed over management network. PASS: Verify behavior if the new unlocked message is received by a mtcClient process that does not support it ; unknown command log. PASS: Verify a node locked state is auto corrected while not in a locked/unlocked action change state. ... Manually remove either file on locked node and verify they are both recreated within 5 seconds. ... Manually create either node locked file on unlocked worker or storage node and verify the created files are removed within 5 seconds. Note: doing this to the new backup file on the active controller will cause SM to shutdown as expected. PASS: Verify Nv node locked file is auto created on a node that spontaneously rebooted while it was unlocked. During the reboot the node was administratively locked. The node should come online with both node locked files present. 
Partial-Bug: 2051578 Change-Id: I0c279b92491e526682d43d78c66f8736934221de Signed-off-by: Eric MacDonald --- mtce-common/src/common/nodeBase.cpp | 9 +-- mtce-common/src/common/nodeBase.h | 14 ++-- mtce/src/common/nodeClass.cpp | 98 +++++++++++++++++---------- mtce/src/common/nodeClass.h | 21 ++---- mtce/src/maintenance/mtcCompMsg.cpp | 37 +++++++--- mtce/src/maintenance/mtcCtrlMsg.cpp | 33 +++++++-- mtce/src/maintenance/mtcNodeComp.cpp | 25 +++++-- mtce/src/maintenance/mtcNodeHdlrs.cpp | 24 +++++-- 8 files changed, 177 insertions(+), 84 deletions(-) diff --git a/mtce-common/src/common/nodeBase.cpp b/mtce-common/src/common/nodeBase.cpp index 540a4d79..750515b3 100755 --- a/mtce-common/src/common/nodeBase.cpp +++ b/mtce-common/src/common/nodeBase.cpp @@ -1,8 +1,8 @@ /* - * Copyright (c) 2013, 2016, 2023 Wind River Systems, Inc. -* -* SPDX-License-Identifier: Apache-2.0 -* + * Copyright (c) 2013, 2016, 2023-2024 Wind River Systems, Inc. + * + * SPDX-License-Identifier: Apache-2.0 + * */ /** @@ -148,6 +148,7 @@ const char * get_mtcNodeCommand_str ( int cmd ) case MTC_MSG_MTCALIVE: return ("mtcAlive msg"); case MTC_REQ_MTCALIVE: return ("mtcAlive req"); case MTC_MSG_LOCKED: return ("locked msg"); + case MTC_MSG_UNLOCKED: return ("unlocked msg"); case MTC_CMD_LAZY_REBOOT: return ("lazy reboot"); case MTC_MSG_INFO: return ("info msg"); case MTC_CMD_SYNC: return ("sync"); diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h index b6493a36..8af83c3b 100755 --- a/mtce-common/src/common/nodeBase.h +++ b/mtce-common/src/common/nodeBase.h @@ -1,10 +1,10 @@ #ifndef __INCLUDE_NODEBASE_HH__ #define __INCLUDE_NODEBASE_HH__ /* - * Copyright (c) 2013-2020, 2023 Wind River Systems, Inc. -* -* SPDX-License-Identifier: Apache-2.0 -* + * Copyright (c) 2013-2020, 2023-2024 Wind River Systems, Inc. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * */ /** @@ -105,6 +105,7 @@ void daemon_exit ( void ); #define CONFIG_PASS_FILE ((const char *)"/var/run/.config_pass") #define CONFIG_FAIL_FILE ((const char *)"/var/run/.config_fail") #define NODE_LOCKED_FILE ((const char *)"/var/run/.node_locked") +#define NODE_LOCKED_FILE_BACKUP ((const char *)"/etc/mtc/tmp/.node_locked") #define NODE_RESET_FILE ((const char *)"/var/run/.node_reset") #define SMGMT_DEGRADED_FILE ((const char *)"/var/run/.sm_degraded") #define SMGMT_UNHEALTHY_FILE ((const char *)"/var/run/.sm_node_unhealthy") @@ -762,7 +763,10 @@ typedef struct #define MTC_CMD_HOST_SVCS_RESULT 21 /* to host */ #define MTC_MSG_INFO 22 /* to host */ #define MTC_CMD_SYNC 23 /* to host */ -#define MTC_CMD_LAST 24 +#define MTC_MSG_UNLOCKED 24 /* to host */ +#define MTC_CMD_LAST 25 + +#define ADMIN_LOCKED_STR ((const char *)"This node is currently in the administratively locked state") #define RESET_PROG_MAX_REBOOTS_B4_RESET (5) #define RESET_PROG_MAX_REBOOTS_B4_RETRY (RESET_PROG_MAX_REBOOTS_B4_RESET+2) diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index 7a4ce958..b7ef0ea4 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2020, 2023 Wind River Systems, Inc. + * Copyright (c) 2013-2020, 2023-2024 Wind River Systems, Inc. 
* * SPDX-License-Identifier: Apache-2.0 * @@ -538,9 +538,6 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname ) /* Default timeout values */ ptr->mtcalive_timeout = HOST_MTCALIVE_TIMEOUT ; - /* no ned to send a reboot response back to any client */ - ptr->activeClient = CLIENT_NONE ; - ptr->task = "none" ; ptr->action = "none" ; ptr->clear_task = false ; @@ -565,7 +562,7 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname ) ptr->reboot_cmd_ack_mgmnt = false ; ptr->reboot_cmd_ack_clstr = false ; - + ptr->unlock_cmd_ack = false ; ptr->offline_log_throttle = 0 ; ptr->offline_log_reported = true ; ptr->online_log_reported = false ; @@ -1402,6 +1399,12 @@ int nodeLinkClass::admin_state_change ( string hostname, { clog ("%s %s (from %s)\n", hostname.c_str(), newAdminState.c_str(), adminState_enum_to_str (node_ptr->adminState).c_str()); node_ptr->adminState = adminState_str_to_enum ( newAdminState.data() ); + if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) + { + /* Tell the node it is unlocked now */ + node_ptr->unlock_cmd_ack = false ; + send_mtc_cmd ( node_ptr->hostname , MTC_MSG_UNLOCKED, MGMNT_INTERFACE ); + } rc = PASS ; } else @@ -3861,14 +3864,41 @@ void nodeLinkClass::set_cmd_resp ( string & hostname, mtc_message_type & msg, in } else if ( node_ptr->cmdRsp != msg.cmd ) { - /* record ack's for reboot requests */ - if ( msg.cmd == MTC_CMD_REBOOT ) + if ( msg.cmd == MTC_MSG_UNLOCKED ) { + ilog ("%s %s ACK (%s)", + node_ptr->hostname.c_str(), + get_mtcNodeCommand_str(msg.cmd), + get_iface_name_str(iface)); + node_ptr->unlock_cmd_ack = true ; + } + else if ( msg.cmd == MTC_MSG_LOCKED ) + { + mlog ("%s %s ACK (%s)", + node_ptr->hostname.c_str(), + get_mtcNodeCommand_str(msg.cmd), + get_iface_name_str(iface)); + } + /* record ack's for reboot requests */ + else if ( msg.cmd == MTC_CMD_REBOOT ) + { + ilog ("%s %s ACK (%s)", + node_ptr->hostname.c_str(), + get_mtcNodeCommand_str(msg.cmd), + get_iface_name_str(iface)); + if ( iface == 
MGMNT_INTERFACE ) node_ptr->reboot_cmd_ack_mgmnt = 1 ; else if ( iface == CLSTR_INTERFACE ) node_ptr->reboot_cmd_ack_clstr = 1 ; } + else + { + ilog ("%s %s ACK (%s)", + node_ptr->hostname.c_str(), + get_mtcNodeCommand_str(msg.cmd), + get_iface_name_str(iface)); + } node_ptr->cmdRsp = msg.cmd ; if ( msg.num > 0 ) node_ptr->cmdRsp_status = msg.parm[0] ; @@ -3889,35 +3919,6 @@ unsigned int nodeLinkClass::get_cmd_resp ( string & hostname ) return (-1); } -mtc_client_enum nodeLinkClass::get_activeClient ( string hostname ) -{ - nodeLinkClass::node* node_ptr = nodeLinkClass::getNode ( hostname ); - if ( node_ptr != NULL ) - { - return ( node_ptr->activeClient ) ; - } - else - { - slog ("Host lookup failed for '%s'\n", hostname.c_str()); - } - return (CLIENT_NONE); -} - -int nodeLinkClass::set_activeClient ( string hostname, mtc_client_enum client ) -{ - nodeLinkClass::node* node_ptr = nodeLinkClass::getNode ( hostname ); - if ( node_ptr != NULL ) - { - node_ptr->activeClient = client ; - return (PASS); - } - else - { - slog ("Host lookup failed for '%s'\n", hostname.c_str()); - } - return (FAIL_HOSTNAME_LOOKUP); -} - /***************************************************************************** * * Name : set_mtcAlive @@ -4282,6 +4283,31 @@ void nodeLinkClass::set_mtce_flags ( string hostname, int flags, int iface ) } + /* Manage the remote locked/unlocked state of the host */ + if ( flags & MTC_FLAG__I_AM_LOCKED ) + { + /* Don't auto correct while we are going through the unlock sequence */ + if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && + ((node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE ) && + (node_ptr->adminAction != MTC_ADMIN_ACTION__UNLOCK ))) + { + wlog ("%s mtcAlive reporting locked while unlocked ; correcting", node_ptr->hostname.c_str()); + send_mtc_cmd ( node_ptr->hostname , MTC_MSG_UNLOCKED, MGMNT_INTERFACE ); + send_mtc_cmd ( node_ptr->hostname , MTC_MSG_UNLOCKED, CLSTR_INTERFACE ); + } + } + else + { + /* Don't auto correct while we are 
going through the unlock sequence */ + if (( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ) && + ( node_ptr->adminAction != MTC_ADMIN_ACTION__LOCK )) + { + wlog ("%s mtcAlive reporting unlocked while locked ; correcting", node_ptr->hostname.c_str()); + send_mtc_cmd ( node_ptr->hostname , MTC_MSG_LOCKED, MGMNT_INTERFACE ); + send_mtc_cmd ( node_ptr->hostname , MTC_MSG_LOCKED, CLSTR_INTERFACE ); + } + } + /* Deal with sub-function if AIO controller host */ if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true )) { diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h index 1cd03813..16ad523a 100755 --- a/mtce/src/common/nodeClass.h +++ b/mtce/src/common/nodeClass.h @@ -1,10 +1,10 @@ #ifndef __INCLUDE_NODECLASS_H__ #define __INCLUDE_NODECLASS_H__ /* - * Copyright (c) 2013-2016, 2023 Wind River Systems, Inc. -* -* SPDX-License-Identifier: Apache-2.0 -* + * Copyright (c) 2013-2016, 2023-2024 Wind River Systems, Inc. + * + * SPDX-License-Identifier: Apache-2.0 + * */ /** @@ -452,15 +452,14 @@ private: /** Command Response Data - typically an error details string */ string cmdRsp_status_string ; + /* Mtce command acknowledgements */ + bool unlock_cmd_ack ; /* set true when a unlocked command ack is rx'ed */ bool reboot_cmd_ack_mgmnt ; bool reboot_cmd_ack_clstr ; /** Tracks back to back Fast Fault Recovery counts */ int graceful_recovery_counter; - /** Reboot acknowledge */ - mtc_client_enum activeClient ; - /** @} private_Maintenance_variables */ /** @@ -1661,14 +1660,6 @@ public: /** Remove a host from Node list */ int rem_host ( string & hostname ); - /* Returns the active client. */ - mtc_client_enum get_activeClient ( string hostname ); - - /* Sets the active client for this particular host. The first use of this - * is or reset/reboot acknowledge to the VIm over an evacuate reset request - * from within the reboot handler. 
*/ - int set_activeClient ( string hostname, mtc_client_enum client ); - /** Get the number of worker hosts that are operationally 'enabled' */ int enabled_compute_nodes ( void ); diff --git a/mtce/src/maintenance/mtcCompMsg.cpp b/mtce/src/maintenance/mtcCompMsg.cpp index 76858273..ba6049ba 100755 --- a/mtce/src/maintenance/mtcCompMsg.cpp +++ b/mtce/src/maintenance/mtcCompMsg.cpp @@ -1,8 +1,8 @@ /* - * Copyright (c) 2013-2018 Wind River Systems, Inc. -* -* SPDX-License-Identifier: Apache-2.0 -* + * Copyright (c) 2013-2018, 2024 Wind River Systems, Inc. + * + * SPDX-License-Identifier: Apache-2.0 + * */ /** @@ -228,17 +228,36 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) } else if ( msg.cmd == MTC_MSG_LOCKED ) { + log_ack = false ; + /* Only recreate the file if its not already present */ if ( daemon_is_file_present ( NODE_LOCKED_FILE ) == false ) { - log_ack = true ; ilog ("%s locked (%s)", get_hostname().c_str(), interface_name.c_str() ); - daemon_log ( NODE_LOCKED_FILE, - "This node is currently in the administratively locked state" ); + daemon_log ( NODE_LOCKED_FILE, ADMIN_LOCKED_STR); } - else + + /* Preserve the node locked state in a non-volatile backup + * file that persists over reboot. + * Maintaining the legacy NODE_LOCKED_FILE as other sw looks at it. 
*/ + if ( daemon_is_file_present ( NODE_LOCKED_FILE_BACKUP ) == false ) { - log_ack = false ; + daemon_log ( NODE_LOCKED_FILE_BACKUP, ADMIN_LOCKED_STR ); + } + } + else if ( msg.cmd == MTC_MSG_UNLOCKED ) + { + ilog ("%s unlocked (%s)", get_hostname().c_str(), interface_name.c_str() ); + + /* Only remove the file if it is present */ + if ( daemon_is_file_present ( NODE_LOCKED_FILE ) == true ) + { + daemon_remove_file ( NODE_LOCKED_FILE ); + } + if ( daemon_is_file_present ( NODE_LOCKED_FILE_BACKUP ) == true ) + { + daemon_remove_file ( NODE_LOCKED_FILE_BACKUP ); + ilog ("cleared node locked backup flag (%s)", interface_name.c_str() ); } } else if ( msg.cmd == MTC_MSG_SUBF_GOENABLED_FAILED ) diff --git a/mtce/src/maintenance/mtcCtrlMsg.cpp b/mtce/src/maintenance/mtcCtrlMsg.cpp index b2c9d716..69fef347 100755 --- a/mtce/src/maintenance/mtcCtrlMsg.cpp +++ b/mtce/src/maintenance/mtcCtrlMsg.cpp @@ -1,8 +1,8 @@ /* - * Copyright (c) 2013-2018, 2023 Wind River Systems, Inc. -* -* SPDX-License-Identifier: Apache-2.0 -* + * Copyright (c) 2013-2018, 2023-2024 Wind River Systems, Inc. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * */ /** @@ -665,7 +665,10 @@ int send_mtc_cmd ( string & hostname, int cmd , int interface, string json_dict case MTC_CMD_WIPEDISK: case MTC_CMD_LAZY_REBOOT: { - ilog ("%s sending '%s' request (%s network)\n", hostname.c_str(), get_mtcNodeCommand_str(cmd), get_iface_name_str(interface)); + ilog ("%s sending '%s' request (%s)", + hostname.c_str(), + get_mtcNodeCommand_str(cmd), + get_iface_name_str(interface)); snprintf ( &mtc_cmd.hdr[0], MSG_HEADER_SIZE, "%s", get_cmd_req_msg_header() ); mtc_cmd.cmd = cmd ; mtc_cmd.num = 0 ; @@ -682,7 +685,10 @@ int send_mtc_cmd ( string & hostname, int cmd , int interface, string json_dict case MTC_MSG_SUBF_GOENABLED_FAILED: { force = true ; - ilog ("%s sending '%s' request (%s network)\n", hostname.c_str(), get_mtcNodeCommand_str(cmd), get_iface_name_str(interface)); + ilog ("%s sending '%s' request (%s)", + hostname.c_str(), + get_mtcNodeCommand_str(cmd), + get_iface_name_str(interface)); snprintf ( &mtc_cmd.hdr[0], MSG_HEADER_SIZE, "%s", get_cmd_req_msg_header() ); mtc_cmd.cmd = cmd ; mtc_cmd.num = 0 ; @@ -705,7 +711,20 @@ int send_mtc_cmd ( string & hostname, int cmd , int interface, string json_dict } case MTC_MSG_LOCKED: { - mlog ("%s sending 'Locked' notification (%s network)\n", hostname.c_str(), get_iface_name_str(interface)); + mlog ("%s sending 'Locked' notification (%s)", + hostname.c_str(), + get_iface_name_str(interface)); + snprintf ( &mtc_cmd.hdr[0], MSG_HEADER_SIZE, "%s", get_cmd_req_msg_header() ); + mtc_cmd.cmd = cmd ; + mtc_cmd.num = 0 ; + rc = PASS ; + break ; + } + case MTC_MSG_UNLOCKED: + { + ilog ("%s sending 'UnLocked' notification (%s)", + hostname.c_str(), + get_iface_name_str(interface)); snprintf ( &mtc_cmd.hdr[0], MSG_HEADER_SIZE, "%s", get_cmd_req_msg_header() ); mtc_cmd.cmd = cmd ; mtc_cmd.num = 0 ; diff --git a/mtce/src/maintenance/mtcNodeComp.cpp b/mtce/src/maintenance/mtcNodeComp.cpp index 421543dc..2cc822c7 100644 --- 
a/mtce/src/maintenance/mtcNodeComp.cpp +++ b/mtce/src/maintenance/mtcNodeComp.cpp @@ -1,8 +1,8 @@ /* - * Copyright (c) 2013-2016 Wind River Systems, Inc. -* -* SPDX-License-Identifier: Apache-2.0 -* + * Copyright (c) 2013-2016, 2024 Wind River Systems, Inc. + * + * SPDX-License-Identifier: Apache-2.0 + * */ /** @@ -1153,6 +1153,23 @@ void daemon_service_run ( void ) } } + /* If the mtcClient starts up and finds that its persistent node + * locked backup file is present then make sure the volatile one + * is also present. */ + if ( daemon_is_file_present ( NODE_LOCKED_FILE_BACKUP )) + { + if ( daemon_is_file_present ( NODE_LOCKED_FILE ) == false ) + { + ilog ("restoring %s from %s backup", NODE_LOCKED_FILE, + NODE_LOCKED_FILE_BACKUP); + daemon_log ( NODE_LOCKED_FILE, ADMIN_LOCKED_STR ); + } + } + /* otherwise if the backup file is not there remove volatile file */ + else if ( daemon_is_file_present ( NODE_LOCKED_FILE )) + { + daemon_remove_file ( NODE_LOCKED_FILE ); + } /* Start mtcAlive message timer */ /* Send first mtcAlive ASAP */ diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index 9ddc6c16..ec90f667 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -1,8 +1,8 @@ /* - * Copyright (c) 2013-2020, 2023 Wind River Systems, Inc. -* -* SPDX-License-Identifier: Apache-2.0 -* + * Copyright (c) 2013-2020, 2023-2024 Wind River Systems, Inc. + * + * SPDX-License-Identifier: Apache-2.0 + * */ /**************************************************************************** @@ -530,6 +530,8 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) } } } + + daemon_remove_file (NODE_LOCKED_FILE_BACKUP); mtcInvApi_update_states_now ( node_ptr, "unlocked", "disabled" , "offline", "disabled", "offline" ); mtcInvApi_update_task_now ( node_ptr, aio ? 
MTC_TASK_AIO_SX_UNLOCK_MSG : MTC_TASK_SELF_UNLOCK_MSG ); @@ -1174,6 +1176,11 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->offline_log_reported = false ; node_ptr->online_log_reported = true ; + /* This is a redundant / backup message to the call in + * admin_state_change telling the node it is unlocked. */ + node_ptr->unlock_cmd_ack = false ; + send_mtc_cmd ( node_ptr->hostname , MTC_MSG_UNLOCKED, MGMNT_INTERFACE ); + /* Request Out-Of--Service test execution */ send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MAIN_GOENABLED, MGMNT_INTERFACE ); @@ -1461,6 +1468,15 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) /* Check the work queue complete and done status's */ mtcInvApi_force_task ( node_ptr, "" ); + if ( node_ptr->unlock_cmd_ack ) + { + ilog ("%s acknowledged unlock", node_ptr->hostname.c_str()); + } + else + { + wlog ("%s has not acknowledged unlock", node_ptr->hostname.c_str()); + } + if ( node_ptr->degrade_mask ) { /* Allow host to enable in the degraded state */