diff --git a/mtce-common/src/common/logMacros.h b/mtce-common/src/common/logMacros.h index 5574ecd3..b6c758e6 100644 --- a/mtce-common/src/common/logMacros.h +++ b/mtce-common/src/common/logMacros.h @@ -1,7 +1,7 @@ #ifndef __INCLUDE_NODELOG_HH__ #define __INCLUDE_NODELOG_HH__ /* - * Copyright (c) 2013-2017 Wind River Systems, Inc. + * Copyright (c) 2013-2017,2023 Wind River Systems, Inc. * * SPDX-License-Identifier: Apache-2.0 * @@ -115,6 +115,7 @@ typedef struct int lmon_query_port ; int start_delay ; /**< startup delay, added for pmon */ int api_retries ; /**< api retries before failure */ + int bmc_reset_delay ; /**< secs delay before bmc reset */ int hostwd_failure_threshold ; /**< allowed # of missed pmon/hostwd messages */ bool hostwd_reboot_on_err ; /**< should hostwd reboot on fault detected */ bool hostwd_kdump_on_stall ; /**< sysrq crash dump on quorum msg'ing stall */ diff --git a/mtce-common/src/common/nodeBase.cpp b/mtce-common/src/common/nodeBase.cpp index f8800019..540a4d79 100755 --- a/mtce-common/src/common/nodeBase.cpp +++ b/mtce-common/src/common/nodeBase.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013, 2016 Wind River Systems, Inc. + * Copyright (c) 2013, 2016, 2023 Wind River Systems, Inc. 
* * SPDX-License-Identifier: Apache-2.0 * @@ -9,7 +9,7 @@ * @file * Wind River CGTS Platform "Node Base" Utility */ - + #include #include #include @@ -382,6 +382,7 @@ void mtc_stages_init ( void ) recoveryStages_str[MTC_RECOVERY__RETRY_WAIT ] = "Req-Retry-Wait"; recoveryStages_str[MTC_RECOVERY__REQ_MTCALIVE ] = "Req-MtcAlive"; recoveryStages_str[MTC_RECOVERY__REQ_MTCALIVE_WAIT ] = "Req-MtcAlive-Wait"; + recoveryStages_str[MTC_RECOVERY__RESET_SEND_WAIT ] = "Reset-Send-Wait"; recoveryStages_str[MTC_RECOVERY__RESET_RECV_WAIT ] = "Reset-Recv-Wait"; recoveryStages_str[MTC_RECOVERY__MTCALIVE_TIMER ] = "MtcAlive-Timer"; recoveryStages_str[MTC_RECOVERY__MTCALIVE_WAIT ] = "MtcAlive-Wait"; diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h index b98c34d4..b6028fa8 100755 --- a/mtce-common/src/common/nodeBase.h +++ b/mtce-common/src/common/nodeBase.h @@ -1,7 +1,7 @@ #ifndef __INCLUDE_NODEBASE_HH__ #define __INCLUDE_NODEBASE_HH__ /* - * Copyright (c) 2013-2020 Wind River Systems, Inc. + * Copyright (c) 2013-2020, 2023 Wind River Systems, Inc. 
* * SPDX-License-Identifier: Apache-2.0 * @@ -556,7 +556,7 @@ typedef struct unsigned short rev ; /* minor revision number */ unsigned int res ; /* a reserved field */ unsigned int cmd ; - unsigned int num ; + unsigned int num ; unsigned int parm[5] ; char buf[BUF_SIZE] ; } ALIGN_PACK(mtc_message_type); @@ -613,7 +613,7 @@ typedef struct /* Generic Monitor Service ready event */ #define MTC_EVENT_MONITOR_READY (0xf0f0f0f0) -/** Process Monitor Event codes */ +/** Process Monitor Event codes */ #define MTC_EVENT_PMON_CLEAR (0x02020202) /**< Clear Action */ #define MTC_EVENT_PMON_CRIT (0x04040404) /**< Crit Failed Action */ #define MTC_EVENT_PMON_MAJOR (0x05050505) /**< Major Degrade Action */ @@ -633,7 +633,7 @@ typedef struct #define MTC_EVENT_AVS_CRITICAL (0x12340002) #define MTC_EVENT_AVS_OFFLINE (0x12340003) -/** Hardware Monitor (hwmond) Action Request Codes +/** Hardware Monitor (hwmond) Action Request Codes * Action based event messages that hwmond sends to maintenance */ #define MTC_EVENT_HWMON_CONFIG (0x11110000) /* Sensor Config Log */ #define MTC_EVENT_HWMON_CLEAR (0x11110001) /* Clear Event */ @@ -956,8 +956,8 @@ typedef enum MTC_RECOVERY__RETRY_WAIT, MTC_RECOVERY__REQ_MTCALIVE, MTC_RECOVERY__REQ_MTCALIVE_WAIT, + MTC_RECOVERY__RESET_SEND_WAIT, MTC_RECOVERY__RESET_RECV_WAIT, - MTC_RECOVERY__RESET_WAIT, MTC_RECOVERY__MTCALIVE_TIMER, MTC_RECOVERY__MTCALIVE_WAIT, MTC_RECOVERY__GOENABLED_TIMER, @@ -1138,7 +1138,7 @@ typedef enum /** Return the string representing the specified 'powercycle' stage */ string get_powercycleStages_str ( mtc_powercycleStages_enum stage ); -typedef enum +typedef enum { MTC_SUBSTAGE__START = 0, MTC_SUBSTAGE__SEND = 1, diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index ae0a50a0..f8bee057 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2020 Wind River Systems, Inc. + * Copyright (c) 2013-2020, 2023 Wind River Systems, Inc. 
* * SPDX-License-Identifier: Apache-2.0 * @@ -306,6 +306,7 @@ nodeLinkClass::nodeLinkClass() dor_start_time = 0 ; dor_mode_active_log_throttle = 0 ; + bmc_reset_delay = MTC_MINS_5 ; swact_timeout = MTC_MINS_2 ; uptime_period = MTC_UPTIME_REFRESH_TIMER ; online_period = MTC_OFFLINE_TIMER ; @@ -553,6 +554,13 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname ) ptr->offline_search_count = 0 ; ptr->mtcAlive_mgmnt = false ; ptr->mtcAlive_clstr = false ; + + /* These counts are incremented in the set_mtcAlive member + * function and cleared in the reset progression handler. */ + ptr->mtcAlive_mgmnt_count = 0 ; + ptr->mtcAlive_clstr_count = 0 ; + ptr->bmc_reset_pending_log_throttle = 0 ; + ptr->reboot_cmd_ack_mgmnt = false ; ptr->reboot_cmd_ack_clstr = false ; @@ -2523,6 +2531,12 @@ int nodeLinkClass::mod_host ( node_inv_type & inv ) return ( rc ); } +/* calculate and return offline timeout in secs */ +int nodeLinkClass::offline_timeout_secs ( void ) +{ + return (((offline_period*offline_threshold)/1000)*3); +} + void nodeLinkClass::start_offline_handler ( struct nodeLinkClass::node * node_ptr ) { bool already_active = false ; @@ -3744,7 +3758,7 @@ int nodeLinkClass::num_hosts ( void ) return ( nodeLinkClass::hosts ) ; } -void nodeLinkClass::set_cmd_resp ( string & hostname, mtc_message_type & msg ) +void nodeLinkClass::set_cmd_resp ( string & hostname, mtc_message_type & msg, int iface ) { nodeLinkClass::node* node_ptr ; node_ptr = nodeLinkClass::getNode ( hostname ); @@ -3808,16 +3822,21 @@ void nodeLinkClass::set_cmd_resp ( string & hostname, mtc_message_type & msg ) "no error string" : node_ptr->host_services_req.status_string.c_str()); } } - else + else if ( node_ptr->cmdRsp != msg.cmd ) { - if ( node_ptr->cmdRsp != msg.cmd ) + /* record ack's for reboot requests */ + if ( msg.cmd == MTC_CMD_REBOOT ) { - node_ptr->cmdRsp = msg.cmd ; - if ( msg.num > 0 ) - node_ptr->cmdRsp_status = msg.parm[0] ; - else - node_ptr->cmdRsp_status = -1 ; + if ( iface == 
MGMNT_INTERFACE ) + node_ptr->reboot_cmd_ack_mgmnt = 1 ; + else if ( iface == CLSTR_INTERFACE ) + node_ptr->reboot_cmd_ack_clstr = 1 ; } + node_ptr->cmdRsp = msg.cmd ; + if ( msg.num > 0 ) + node_ptr->cmdRsp_status = msg.parm[0] ; + else + node_ptr->cmdRsp_status = -1 ; } } } @@ -3907,6 +3926,7 @@ void nodeLinkClass::set_mtcAlive ( struct nodeLinkClass::node * node_ptr, int in alog ("%s %s mtcAlive received", node_ptr->hostname.c_str(), get_iface_name_str(interface)); + node_ptr->mtcAlive_clstr_count++ ; node_ptr->mtcAlive_clstr = true ; } } @@ -3917,6 +3937,7 @@ void nodeLinkClass::set_mtcAlive ( struct nodeLinkClass::node * node_ptr, int in alog ("%s %s mtcAlive received", node_ptr->hostname.c_str(), get_iface_name_str(interface)); + node_ptr->mtcAlive_mgmnt_count++ ; node_ptr->mtcAlive_mgmnt = true ; } } diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h index 0bb96dc1..367bda87 100755 --- a/mtce/src/common/nodeClass.h +++ b/mtce/src/common/nodeClass.h @@ -1,7 +1,7 @@ #ifndef __INCLUDE_NODECLASS_H__ #define __INCLUDE_NODECLASS_H__ /* - * Copyright (c) 2013-2016 Wind River Systems, Inc. + * Copyright (c) 2013-2016, 2023 Wind River Systems, Inc. * * SPDX-License-Identifier: Apache-2.0 * @@ -315,9 +315,15 @@ private: int mtcAlive_hits ; int mtcAlive_purge ; + int mtcAlive_mgmnt_count ; /* count the mgmnt network mtcAlive messages */ + int mtcAlive_clstr_count ; /* count the clstr network mtcAlive messages */ bool mtcAlive_mgmnt ; /* set true when mtcAlive is rx'd from mgmnt network */ bool mtcAlive_clstr ; /* set true when mtcAlive is rx'd from clstr network */ + /* used to log time leading up to reset */ + int bmc_reset_pending_log_throttle ; + time_debug_type reset_delay_start_time ; + /* Both of these booleans are set true upon receipt of a mtcAlive message. 
*/ bool mtcAlive_online ; /* this is consumed by online and offline handler */ bool mtcAlive_offline ; /* this is consumed by reset progression handler */ @@ -854,6 +860,7 @@ private: int calc_reset_prog_timeout ( struct nodeLinkClass::node * node_ptr, int retries ); /* These interfaces will start and stop the offline FSM if not already active */ + int offline_timeout_secs ( void ); void start_offline_handler ( struct nodeLinkClass::node * node_ptr ); void stop_offline_handler ( struct nodeLinkClass::node * node_ptr ); @@ -1760,7 +1767,7 @@ public: struct mtc_timer mtcTimer_dor ; unsigned int get_cmd_resp ( string & hostname ); - void set_cmd_resp ( string & hostname, mtc_message_type & msg ); + void set_cmd_resp ( string & hostname, mtc_message_type & msg, int iface ); void set_uptime ( string & hostname, unsigned int uptime, bool force ); unsigned int get_uptime ( string & hostname ); @@ -1848,6 +1855,11 @@ public: * upon which feature will activate */ int mnfa_threshold ; + /* seconds to wait before issuing a bmc reset of a failed node + * that does not ACK reboot requests. The delay gives + * time for crashdumps to complete. */ + int bmc_reset_delay ; + /* collectd event handler */ int collectd_notify_handler ( string & hostname, string & resource, diff --git a/mtce/src/maintenance/mtcCmdHdlr.cpp b/mtce/src/maintenance/mtcCmdHdlr.cpp index c841c28c..9b12c97c 100644 --- a/mtce/src/maintenance/mtcCmdHdlr.cpp +++ b/mtce/src/maintenance/mtcCmdHdlr.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2017 Wind River Systems, Inc. + * Copyright (c) 2013-2017, 2023 Wind River Systems, Inc. 
* * SPDX-License-Identifier: Apache-2.0 * @@ -342,7 +342,6 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr ) * *************************************************************************/ case MTC_CMD_STAGE__RESET_PROGRESSION_START: { - node_ptr->cmd_retries = 0 ; if ( node_ptr->cmd.task == true ) { /* Management Reboot Failed */ @@ -384,7 +383,7 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr ) MTC_CMD_REBOOT, CLSTR_INTERFACE )) != PASS ) { - wlog ("%s 'reboot' request failed (%s) (rc:%d)\n", + wlog ("%s reboot request failed (%s) (rc:%d)\n", node_ptr->hostname.c_str(), get_iface_name_str(CLSTR_INTERFACE), rc); } @@ -400,16 +399,20 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr ) mtcTimer_reset ( node_ptr->mtcCmd_timer ); mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, MTC_CMD_RSP_TIMEOUT ); - ilog ("%s waiting for REBOOT ACK\n", node_ptr->hostname.c_str() ); + ilog ("%s waiting for reboot ACK\n", node_ptr->hostname.c_str() ); } else { + /* This means that the mtcAgent can't send commands. + * Very unlikely case. Fail the operation. 
+ */ if ( node_ptr->cmd.task == true ) { /* Reboot Failed */ mtcInvApi_update_task ( node_ptr, MTC_TASK_REBOOT_FAIL ); } - node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__RESET ; + node_ptr->mtcCmd_work_fifo_ptr->status = FAIL_SOCKET_SENDTO ; + node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ; } break ; } @@ -420,21 +423,60 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr ) { if ( node_ptr->mtcCmd_timer.ring == true ) { - if ( node_ptr->cmd.task == true ) + if (( node_ptr->cmd.task == true ) && ( node_ptr->cmd_retries == 0 )) { + /* no need to repost task on retries */ mtcInvApi_update_task ( node_ptr, MTC_TASK_REBOOT_FAIL ); } - wlog ("%s REBOOT ACK Timeout\n", node_ptr->hostname.c_str()); - node_ptr->mtcCmd_timer.ring = false ; - node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__RESET ; + /* progress to RESET if we have tried + * RESET_PROG_MAX_REBOOTS_B4_RESET times already */ + if ( ++node_ptr->cmd_retries >= RESET_PROG_MAX_REBOOTS_B4_RESET ) + { + wlog ("%s reboot ACK timeout ; max reboot retries reached", + node_ptr->hostname.c_str()); + if ( node_ptr->bmc_provisioned ) + { + int reset_delay = bmc_reset_delay - (RESET_PROG_MAX_REBOOTS_B4_RESET * MTC_CMD_RSP_TIMEOUT); + node_ptr->bmc_reset_pending_log_throttle = 0 ; + gettime ( node_ptr->reset_delay_start_time ); + + /* Clear the counts so we can tell if we have been getting mtcAlive + * messages from the remote host during the reset delay window */ + node_ptr->mtcAlive_mgmnt_count = 0 ; + node_ptr->mtcAlive_clstr_count = 0 ; + + wlog ("%s ... 
bmc reset in %d secs", node_ptr->hostname.c_str(), reset_delay); + mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, reset_delay ); + node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__RESET ; + } + else + { + ilog ("%s bmc not provisioned ; search for offline", node_ptr->hostname.c_str()); + mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, offline_timeout_secs()); + node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__OFFLINE_CHECK ; + } + } + else + { + int retry_delay = MTC_CMD_RSP_TIMEOUT ; + wlog ("%s reboot ACK timeout ; reboot retry (%d of %d) in %d secs", + node_ptr->hostname.c_str(), + node_ptr->cmd_retries, + RESET_PROG_MAX_REBOOTS_B4_RESET-1, + retry_delay); + mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, retry_delay ); + } } } else { /* declare successful reboot */ - plog ("%s REBOOT Request Succeeded\n", node_ptr->hostname.c_str()); + plog ("%s reboot request succeeded (%s %s)", + node_ptr->hostname.c_str(), + node_ptr->reboot_cmd_ack_mgmnt ? get_iface_name_str(MGMNT_INTERFACE) : "", + node_ptr->reboot_cmd_ack_clstr ? 
get_iface_name_str(CLSTR_INTERFACE) : ""); if ( node_ptr->cmd.task == true ) { @@ -447,21 +489,31 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr ) mtcTimer_reset ( node_ptr->mtcCmd_timer ); /* progress to RESET if we have tried 5 times already */ - if ( node_ptr->cmd_retries >= RESET_PROG_MAX_REBOOTS_B4_RESET ) + if ( ++node_ptr->cmd_retries >= RESET_PROG_MAX_REBOOTS_B4_RESET ) { - elog ("%s still not offline ; trying reset\n", node_ptr->hostname.c_str()); + int reset_delay = bmc_reset_delay - (RESET_PROG_MAX_REBOOTS_B4_RESET * MTC_CMD_RSP_TIMEOUT) ; + node_ptr->bmc_reset_pending_log_throttle = 0 ; + gettime ( node_ptr->reset_delay_start_time ); + + /* Clear the counts so we can tell if we have been getting mtcAlive + * messages from the remote host during the reset delay window */ + node_ptr->mtcAlive_mgmnt_count = 0 ; + node_ptr->mtcAlive_clstr_count = 0 ; + + wlog ("%s max reboot retries reached ; still not offline ; reset in %3d secs", + node_ptr->hostname.c_str(), reset_delay); + mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, reset_delay ); node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__RESET ; } else { - int delay = (((offline_period*offline_threshold)/1000)*3); ilog ("%s searching for offline ; next reboot attempt in %d seconds\n", - node_ptr->hostname.c_str(), delay); + node_ptr->hostname.c_str(), offline_timeout_secs()); /* After the host is reset we need to wait for it to stop sending mtcAlive messages * Delay the time fo the offline handler to run to completion at least once before * timing out and retrying the reset again */ - mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, delay ); + mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, offline_timeout_secs()); /* Wait for the host to go offline */ node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__OFFLINE_CHECK ; @@ -471,94 +523,155 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr ) } case MTC_CMD_STAGE__RESET: 
{ - if (( node_ptr->bmc_provisioned == true ) && ( node_ptr->bmc_accessible == true )) + if ( node_ptr->bmc_provisioned == true ) { - plog ("%s Performing RESET over Board Management Interface\n", node_ptr->hostname.c_str()); - if ( node_ptr->cmd.task == true ) + if ( node_ptr->mtcCmd_timer.ring == true ) { - mtcInvApi_update_task ( node_ptr, MTC_TASK_RESET_REQUEST); - } + if ( node_ptr->bmc_accessible == true ) + { + plog ("%s issuing reset over bmc", node_ptr->hostname.c_str()); + if ( node_ptr->cmd.task == true ) + { + mtcInvApi_update_task ( node_ptr, MTC_TASK_RESET_REQUEST); + } - /* bmc power control reset by bmc */ - rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_RESET ); - - if ( rc == PASS ) - { - dlog ("%s Board Management Interface RESET Requested\n", node_ptr->hostname.c_str()); - - mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, MTC_BMC_REQUEST_DELAY ); - node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__RESET_ACK; - break ; + /* bmc power control reset by bmc */ + rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_RESET ); + if ( rc == PASS ) + { + ilog ("%s bmc reset requested", node_ptr->hostname.c_str()); + mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, MTC_BMC_REQUEST_DELAY ); + node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__RESET_ACK; + break ; + } + else + { + node_ptr->mtcCmd_work_fifo_ptr->status = rc ; + wlog ("%s bmc reset command request failed (%d)", node_ptr->hostname.c_str(), rc ); + } + } + else + { + wlog ("%s bmc not accessible ; unable to reset", node_ptr->hostname.c_str()); + } } else { - node_ptr->mtcCmd_work_fifo_ptr->status = rc ; - wlog ("%s 'reset' command request failed (%d)\n", node_ptr->hostname.c_str(), rc ); + /* To handle potentially large bmc_reset_delay values that could + * be longer than a boot time this check cancels the reset once the + * node goes online. Maybe the reset did get through or the node + * rebooted quite fast. 
+ * + * However, don't allow momentary heartbeat loss recovery handling + * or the failure of just one (mgmnt or clstr) networks to mistakenly + * cancel the reset. Prevent the cancel if + * - the node uptime is high and + * - not receiving mtcAlive both mgmnt and clstr networks. + * + * Note: online does not mean both networks are receiving mtcAlive, + * Currently just mgmnt needs to see mtcAlive for the node to + * go online. + * TODO: Fix this in the future so both are required. + * It came from the days when the cluster-host was named the + * infrastructure network where at that time it was optional. + * Cluster-host is no longer optional. */ + if (( node_ptr->availStatus == MTC_AVAIL_STATUS__ONLINE ) && + ( node_ptr->uptime < MTC_MINS_5 ) && + ( node_ptr->mtcAlive_mgmnt_count ) && + ( node_ptr->mtcAlive_clstr_count )) + { + mtcTimer_reset ( node_ptr->mtcCmd_timer ); + ilog ("%s cancelling reset ; host is online ; delay:%d uptime:%d mtcAlive:%d:%d ", + node_ptr->hostname.c_str(), + bmc_reset_delay, + node_ptr->uptime, + node_ptr->mtcAlive_mgmnt_count, + node_ptr->mtcAlive_clstr_count); + node_ptr->mtcCmd_work_fifo_ptr->status = PASS ; + node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ; + } + else + { + time_debug_type now_time ; + time_delta_type diff_time ; + int reset_delay = bmc_reset_delay - (RESET_PROG_MAX_REBOOTS_B4_RESET * MTC_CMD_RSP_TIMEOUT) ; + gettime ( now_time ); + timedelta ( node_ptr->reset_delay_start_time, now_time, diff_time ); + if ( reset_delay > diff_time.secs ) + { + #define BMC_RESET_PENDING_LOG_THROTTLE (1000) + wlog_throttled ( node_ptr->bmc_reset_pending_log_throttle, + BMC_RESET_PENDING_LOG_THROTTLE, + "%s reset in %3ld secs ; delay:%d uptime:%d mtcAlive:%d:%d", + node_ptr->hostname.c_str(), + reset_delay-diff_time.secs, + bmc_reset_delay, + node_ptr->uptime, + node_ptr->mtcAlive_mgmnt_count, + node_ptr->mtcAlive_clstr_count); + } + } + break ; /* waiting path */ } } - else + else if ( node_ptr->bmc_provisioned == false ) 
{ - if ( node_ptr->bmc_provisioned == false ) - { - wlog ("%s Board Management Interface not provisioned\n", node_ptr->hostname.c_str()); - } - else if ( node_ptr->bmc_accessible == false ) - { - wlog ("%s Board Management Interface not accessible\n", node_ptr->hostname.c_str()); - } - } - int delay = (((offline_period*offline_threshold)/1000)*3); - mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, delay ); + wlog ("%s bmc not provisioned", node_ptr->hostname.c_str()); + } + + /* if we get here then either + * - the bmc is not provisioned, + * - the bmc is not accessible after the bmc_reset_delay + * - the reset send command failed + * So we need to just jump to the offline check which will + * retry the reboot/reset if the host still does not go + * offline after calculated delay + */ + mtcTimer_reset ( node_ptr->mtcCmd_timer ); + mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, offline_timeout_secs()); node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__OFFLINE_CHECK ; break ; } case MTC_CMD_STAGE__RESET_ACK: { - if ( node_ptr->mtcCmd_timer.ring == true ) - { - int delay = (((offline_period*offline_threshold)/1000)*3); - - /* bmc power control reset by bmc */ - rc = bmc_command_recv ( node_ptr ); - if ( rc == RETRY ) + if ( node_ptr->mtcCmd_timer.ring == true ) + { + /* bmc power control reset by bmc */ + rc = bmc_command_recv ( node_ptr ); + if ( rc == RETRY ) + { + mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, MTC_BMC_REQUEST_DELAY ); + break ; + } + else if ( rc ) + { + elog ("%s bmc reset request failed [rc:%d]\n", node_ptr->hostname.c_str(), rc); + if ( node_ptr->cmd.task == true ) { - mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, MTC_BMC_REQUEST_DELAY ); - break ; + mtcInvApi_update_task ( node_ptr, MTC_TASK_RESET_FAIL); + } + node_ptr->mtcCmd_work_fifo_ptr->status = rc ; + } + else + { + plog ("%s bmc reset request succeeded\n", node_ptr->hostname.c_str()); + + if (( node_ptr->adminAction != 
MTC_ADMIN_ACTION__RESET ) && + ( node_ptr->adminAction != MTC_ADMIN_ACTION__REBOOT )) + { + mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__COMMAND_AUTO_RESET ); } - if ( rc ) - { - elog ("%s Board Management Interface RESET Unsuccessful\n", node_ptr->hostname.c_str()); - if ( node_ptr->cmd.task == true ) - { - mtcInvApi_update_task ( node_ptr, MTC_TASK_RESET_FAIL); - } - mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, delay ); - node_ptr->mtcCmd_work_fifo_ptr->status = rc ; - node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__OFFLINE_CHECK ; - } - else - { - plog ("%s Board Management Interface RESET Command Succeeded\n", node_ptr->hostname.c_str()); - - if (( node_ptr->adminAction != MTC_ADMIN_ACTION__RESET ) && - ( node_ptr->adminAction != MTC_ADMIN_ACTION__REBOOT )) - { - mtcAlarm_log ( node_ptr->hostname, MTC_LOG_ID__COMMAND_AUTO_RESET ); - } - - set_uptime ( node_ptr, 0 , false ); - - if ( node_ptr->cmd.task == true ) - { - mtcInvApi_update_task ( node_ptr, MTC_TASK_RESETTING ); - } - mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, delay ); - node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__OFFLINE_CHECK ; - ilog ("%s waiting for host to go offline (%d secs) before retrying reset\n", - node_ptr->hostname.c_str(), - delay); + set_uptime ( node_ptr, 0 , false ); + if ( node_ptr->cmd.task == true ) + { + mtcInvApi_update_task ( node_ptr, MTC_TASK_RESETTING ); + } } + ilog ("%s waiting for host to go offline ; %d secs before retrying reboot/reset", + node_ptr->hostname.c_str(), offline_timeout_secs()); + node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__OFFLINE_CHECK ; + mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, offline_timeout_secs()); } break ; } @@ -570,7 +683,7 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr ) clear_service_readies ( node_ptr ); - qlog ("%s Reset Progression Complete ; host is offline (after %d retries)\n", + qlog ("%s reset progression complete ; host is offline 
(after %d retries)\n", node_ptr->hostname.c_str(), node_ptr->cmd_retries ); node_ptr->mtcCmd_work_fifo_ptr->status = PASS ; @@ -581,7 +694,7 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr ) { if ( ++node_ptr->cmd_retries < RESET_PROG_MAX_REBOOTS_B4_RETRY ) { - ilog ("%s REBOOT (retry %d of %d)\n", + ilog ("%s reboot (retry %d of %d)\n", node_ptr->hostname.c_str(), node_ptr->cmd_retries, RESET_PROG_MAX_REBOOTS_B4_RETRY ); @@ -602,13 +715,13 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr ) /* Complete command if we reach max retries */ if ( ++node_ptr->mtcCmd_work_fifo_ptr->parm2 > node_ptr->mtcCmd_work_fifo_ptr->parm1 ) { - plog ("%s Reset Progression Done\n", node_ptr->hostname.c_str()); + plog ("%s reset progression done\n", node_ptr->hostname.c_str()); node_ptr->mtcCmd_work_fifo_ptr->status = FAIL_RETRY ; node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__DONE ; } else { - wlog ("%s Reset Progression Retry\n", node_ptr->hostname.c_str()); + wlog ("%s reset progression retry\n", node_ptr->hostname.c_str()); node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__RESET_PROGRESSION_START ; } diff --git a/mtce/src/maintenance/mtcCtrlMsg.cpp b/mtce/src/maintenance/mtcCtrlMsg.cpp index 5a7be7e9..b2c9d716 100755 --- a/mtce/src/maintenance/mtcCtrlMsg.cpp +++ b/mtce/src/maintenance/mtcCtrlMsg.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2018 Wind River Systems, Inc. + * Copyright (c) 2013-2018, 2023 Wind River Systems, Inc. 
* * SPDX-License-Identifier: Apache-2.0 * @@ -243,7 +243,7 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr, /* Check for response messages */ else if ( strstr ( &msg.hdr[0], get_cmd_rsp_msg_header() ) ) { - obj_ptr->set_cmd_resp ( hostname , msg ) ; + obj_ptr->set_cmd_resp ( hostname , msg, iface ) ; if ( msg.num > 0 ) { /* log if not locked message, not start host services result diff --git a/mtce/src/maintenance/mtcNodeCtrl.cpp b/mtce/src/maintenance/mtcNodeCtrl.cpp index a52b67e3..b7b6e4f9 100644 --- a/mtce/src/maintenance/mtcNodeCtrl.cpp +++ b/mtce/src/maintenance/mtcNodeCtrl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013, 2016 Wind River Systems, Inc. + * Copyright (c) 2013, 2016, 2023 Wind River Systems, Inc. * * SPDX-License-Identifier: Apache-2.0 * @@ -371,6 +371,11 @@ static int mtc_config_handler ( void * user, config_ptr->mask |= CONFIG_AGENT_API_RETRIES ; mtcInv.api_retries = config_ptr->api_retries ; } + else if (MATCH("agent", "bmc_reset_delay")) + { + config_ptr->bmc_reset_delay = atoi(value); + mtcInv.bmc_reset_delay = config_ptr->bmc_reset_delay ; + } else if (MATCH("timeouts", "failsafe_shutdown_delay")) { config_ptr->failsafe_shutdown_delay = atoi(value); @@ -682,6 +687,7 @@ int daemon_configure ( void ) ilog ("TokenRefresh: %3d secs\n" , mtcInv.token_refresh_rate); ilog ("API Retries : %3d secs\n" , mtcInv.api_retries); + ilog ("Reset Delay : %3d secs\n" , mtcInv.bmc_reset_delay); /* Verify loaded config against an expected mask * as an ini file fault detection method */ diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index c0315fe0..bf3569f9 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2020 Wind River Systems, Inc. + * Copyright (c) 2013-2020, 2023 Wind River Systems, Inc. 
* * SPDX-License-Identifier: Apache-2.0 * @@ -70,6 +70,9 @@ using namespace std; * * Purpose : Calculate the overall reset progression timeout * + * Note : Needs to take into account the bmc_reset_delay + * for nodes that have the bmc provisioned. + * * ***********************************************************/ int nodeLinkClass::calc_reset_prog_timeout ( struct nodeLinkClass::node * node_ptr, int retries ) @@ -84,6 +87,9 @@ int nodeLinkClass::calc_reset_prog_timeout ( struct nodeLinkClass::node * node_p /* add a small buffer */ to += (MTC_ENABLED_TIMER*4) ; + /* factor in the bmc reset delay */ + to += nodeLinkClass::bmc_reset_delay ; + /* factor in the number of retries */ to *= (retries+1) ; @@ -970,7 +976,8 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) mtcCmd_init ( node_ptr->cmd ); node_ptr->cmd.stage = MTC_CMD_STAGE__START ; node_ptr->cmd.cmd = MTC_OPER__RESET_PROGRESSION ; - node_ptr->cmd.parm1 = 0 ; /* retries */ + node_ptr->cmd_retries = 0 ; /* init fsm retries count */ + node_ptr->cmd.parm1 = 0 ; /* set progression retries */ node_ptr->cmd.task = true ; /* send task updates */ node_ptr->mtcCmd_work_fifo.push_front(node_ptr->cmd); @@ -1922,34 +1929,35 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) * NOT in Dead Office Recovery (DOR) mode. */ if ( node_ptr->dor_recovery_mode == false ) { + ilog ("%s issuing one time graceful recovery reboot over management network\n", + node_ptr->hostname.c_str()); + node_ptr->reboot_cmd_ack_mgmnt = false ; + node_ptr->reboot_cmd_ack_clstr = false ; + send_mtc_cmd ( node_ptr->hostname, MTC_CMD_REBOOT, MGMNT_INTERFACE ) ; + /* If the cluster-host network is provisioned then try * and issue a reset over it to expedite the recovery * for the case where the management heartbeat has * failed but the cluster-host has not. 
* Keeping it simple by just issuing the command and not looping on it */ - if (( node_ptr->clstr_ip.length () > 5 ) && - ( node_ptr->heartbeat_failed[MGMNT_IFACE] == true ) && - ( node_ptr->heartbeat_failed[CLSTR_IFACE] == false )) + if ( node_ptr->clstr_ip.length () > 5 ) { - ilog ("%s issuing one time graceful recovery reboot over cluster-host network\n", node_ptr->hostname.c_str()); + ilog ("%s issuing one time graceful recovery reboot over cluster-host network\n", + node_ptr->hostname.c_str()); send_mtc_cmd ( node_ptr->hostname, MTC_CMD_REBOOT, CLSTR_INTERFACE ) ; } - if ((node_ptr->bmc_provisioned) && (node_ptr->bmc_accessible)) + if ( node_ptr->bmc_provisioned ) { - ilog ("%s issuing one time board management graceful recovery reset\n", node_ptr->hostname.c_str()); - - rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_RESET ); - if ( rc ) - { - wlog ("%s board management reset failed\n", node_ptr->hostname.c_str()); - } - else - { - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); - recoveryStageChange ( node_ptr, MTC_RECOVERY__RESET_RECV_WAIT ); - break ; - } + ilog ("%s posting one time board management graceful recovery reset", + node_ptr->hostname.c_str()); + ilog ("%s ... node may be rebooting or running kdump", + node_ptr->hostname.c_str()); + ilog ("%s ... give kdump time to complete ; reset in %d secs", + node_ptr->hostname.c_str(), bmc_reset_delay ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, bmc_reset_delay ); + recoveryStageChange ( node_ptr, MTC_RECOVERY__RESET_SEND_WAIT ); + break ; } else { @@ -1982,6 +1990,69 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) recoveryStageChange ( node_ptr, MTC_RECOVERY__MTCALIVE_WAIT ); break ; } + case MTC_RECOVERY__RESET_SEND_WAIT: + { + bool reset_aborted = false ; + + /* Abort the reset if we got an acknowledgment from + * either the mgmnt or clstr reboot requests. 
+ */ + if ( node_ptr->reboot_cmd_ack_mgmnt ) + { + reset_aborted = true ; + ilog ("%s backup bmc reset aborted due to management network reboot request ACK", + node_ptr->hostname.c_str()); + } + else if ( node_ptr->reboot_cmd_ack_clstr ) + { + reset_aborted = true ; + ilog ("%s backup bmc reset aborted due to cluster-host network reboot request ACK", + node_ptr->hostname.c_str()); + + } + else if ( mtcTimer_expired ( node_ptr->mtcTimer )) + { + if ( node_ptr->bmc_accessible ) + { + rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_RESET ); + if ( rc ) + { + wlog ("%s board management reset failed\n", node_ptr->hostname.c_str()); + wlog ("%s ... aborting one time reset", node_ptr->hostname.c_str()); + reset_aborted = true ; + } + else + { + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY ); + recoveryStageChange ( node_ptr, MTC_RECOVERY__RESET_RECV_WAIT ); + break ; + } + } + else + { + reset_aborted = true ; + wlog ("%s bmc is not accessible ; aborting one time reset", + node_ptr->hostname.c_str()); + } + } + if ( reset_aborted ) + { + int timeout = node_ptr->mtcalive_timeout ; + /* start the timer that waits for MTCALIVE */ + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, timeout ); + + plog ("%s %s (%d secs)%s(uptime was %d) \n", + node_ptr->hostname.c_str(), + MTC_TASK_RECOVERY_WAIT, + timeout, + node_ptr->dor_recovery_mode ? 
" (DOR) " : " " , + node_ptr->uptime_save ); + + clear_service_readies ( node_ptr ); + recoveryStageChange ( node_ptr, MTC_RECOVERY__MTCALIVE_WAIT ); + } + break ; + } case MTC_RECOVERY__RESET_RECV_WAIT: { @@ -2926,6 +2997,7 @@ int nodeLinkClass::disable_handler ( struct nodeLinkClass::node * node_ptr ) ilog ("%s Issuing Force-Lock Reset\n", node_ptr->hostname.c_str()); mtcCmd_init ( node_ptr->cmd ); + node_ptr->cmd_retries = 0 ; node_ptr->cmd.stage = MTC_CMD_STAGE__START ; node_ptr->cmd.cmd = MTC_OPER__RESET_PROGRESSION ; node_ptr->cmd.parm1 = 2 ; /* 2 retries */ @@ -3258,11 +3330,13 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr ) if (( node_ptr->mtcAlive_mgmnt || node_ptr->mtcAlive_clstr ) && node_ptr->offline_search_count ) { node_ptr->mtcAlive_online = true ; - ilog ("%s still seeing mtcAlive (%d) (%c:%c) ; reset offline_search_count=%d of %d\n", + ilog ("%s still seeing mtcAlive (%d) (Mgmt:%c:%d Clstr:%c:%d) ; restart offline_search_count=%d of %d\n", node_ptr->hostname.c_str(), node_ptr->mtcAlive_count, node_ptr->mtcAlive_mgmnt ? 'Y' : 'n', + node_ptr->mtcAlive_mgmnt_count, node_ptr->mtcAlive_clstr ? 'Y' : 'n', + node_ptr->mtcAlive_clstr_count, node_ptr->offline_search_count, offline_threshold ); node_ptr->offline_search_count = 0 ; /* reset the count */ @@ -3343,11 +3417,13 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr ) **/ node_ptr->mtcAlive_online = true ; - ilog ("%s still seeing mtcAlive (%d) (%c:%c) ; reset offline_search_count=%d of %d\n", + ilog ("%s still seeing mtcAlive (%d) (Mgmt:%c:%d Clstr:%c:%d) ; restart offline_search_count=%d of %d\n", node_ptr->hostname.c_str(), node_ptr->mtcAlive_count, node_ptr->mtcAlive_mgmnt ? 'Y' : 'n', + node_ptr->mtcAlive_mgmnt_count, node_ptr->mtcAlive_clstr ? 
'Y' : 'n', + node_ptr->mtcAlive_clstr_count, node_ptr->offline_search_count, offline_threshold ); node_ptr->offline_search_count = 0 ; /* reset the search count */ @@ -4723,6 +4799,7 @@ int nodeLinkClass::reboot_handler ( struct nodeLinkClass::node * node_ptr ) case MTC_RESETPROG__REBOOT: { #define REBOOT_RETRIES (0) + node_ptr->cmd_retries = 0 ; node_ptr->mtcCmd_work_fifo.clear(); mtcCmd_init ( node_ptr->cmd ); node_ptr->cmd.stage = MTC_CMD_STAGE__START ; diff --git a/mtce/src/maintenance/mtcVimApi.cpp b/mtce/src/maintenance/mtcVimApi.cpp index 761d8e3b..a5169d14 100644 --- a/mtce/src/maintenance/mtcVimApi.cpp +++ b/mtce/src/maintenance/mtcVimApi.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013, 2016 Wind River Systems, Inc. + * Copyright (c) 2013, 2016, 2023 Wind River Systems, Inc. * * SPDX-License-Identifier: Apache-2.0 * @@ -248,7 +248,7 @@ int nodeLinkClass::mtcVimApi_state_change ( struct nodeLinkClass::node * node_pt if (( request == VIM_HOST_FAILED ) || ( request == VIM_DPORT_FAILED )) { - elog ("%s %s\n", node_ptr->hostname.c_str(), node_ptr->httpReq.payload.c_str()); + wlog ("%s %s\n", node_ptr->hostname.c_str(), node_ptr->httpReq.payload.c_str()); } else { diff --git a/mtce/src/scripts/mtc.conf b/mtce/src/scripts/mtc.conf index df4db4fa..d029fc4c 100644 --- a/mtce/src/scripts/mtc.conf +++ b/mtce/src/scripts/mtc.conf @@ -1,4 +1,4 @@ -; CGTS Maintenance Service config file +; StarlingX Maintenance Service config file [agent] ; Agent Configuration scheduling_priority = 1 ; Range of 1 .. 99 is acceptable ; @@ -69,7 +69,12 @@ ar_goenable_interval = 30 ar_hostservices_interval = 30 ar_heartbeat_interval = 600 -api_retries = 10 ; number of API retries b4 failure +api_retries = 10 ; number of API retries b4 failure + +bmc_reset_delay = 300 ; seconds to wait before issuing a bmc + ; reset of a failed node that does not + ; ACK reboot requests. The delay gives + ; time for crashdumps to complete. [client] ; Client Configuration