From 9bf231a2866c0ff737064755d0106198d4df7d7d Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Fri, 3 Jan 2020 09:34:37 -0500 Subject: [PATCH] Fix BMC access loss handling Recent refactoring of the BMC handler FSM introduced a code change that prevents the BMC Access alarm from being raised after initial BMC accessibility was established and is then lost. This update ensures BMC access alarm management is working properly. This update also implements ping failure debounce so that a single ping failure does not trigger full reconnection handling. Instead that now requires 3 ping failures in a row. This has the effect of adding a minute to ping failure action handling before the usual 2 minute BMC access failure alarm is raised. ping failure logging is reduced/improved. Test Plan: for both hwmond and mtcAgent PASS: Verify BMC access alarm due to bad provisioning (un, pw, ip, type) PASS: Verify BMC ping failure debounce handling, recovery and logging PASS: Verify BMC ping persistent failure handling PASS: Verify BMC ping periodic miss handling PASS: Verify BMC ping and access failure recovery timing PASS: Verify BMC ping failure and recovery handling over BMC link pull/plug PASS: Verify BMC sensor monitoring stops/resumes over ping failure/recovery Regression: PASS: Verify IPv6 System Install using provisioned BMCs (wp8-12) PASS: Verify BMC power-off request handling with BMC ping failing & recovering PASS: Verify BMC power-on request handling with BMC ping failing & recovering PASS: Verify BMC reset request handling with BMC ping failing & recovering PASS: Verify BMC sensor group read failure handling & recovery PASS: Verify sensor monitoring after ping failure handling & recovery Change-Id: I74870816930ef6cdb11f987424ffed300ff8affe Closes-Bug: 1858110 Signed-off-by: Eric MacDonald --- mtce-common/src/common/msgClass.cpp | 12 +++- mtce-common/src/common/msgClass.h | 2 +- mtce-common/src/common/pingUtil.cpp | 97 +++++++++++++++------------ mtce-common/src/common/pingUtil.h | 7 +- mtce/centos/build_srpm.data | 2 +- mtce/src/common/nodeClass.cpp | 4 +- mtce/src/hwmon/hwmonClass.cpp | 3 + mtce/src/hwmon/hwmonFsm.cpp | 18 +---- mtce/src/hwmon/hwmonHdlr.cpp | 4 +- mtce/src/maintenance/mtcNodeHdlrs.cpp | 25 +------ 10 files changed, 82 insertions(+), 92 deletions(-) diff --git a/mtce-common/src/common/msgClass.cpp b/mtce-common/src/common/msgClass.cpp index af284c25..2caf7185 100644 --- a/mtce-common/src/common/msgClass.cpp +++ b/mtce-common/src/common/msgClass.cpp @@ -1041,19 +1041,25 @@ int msgClassRx::initSocket(bool allow_any, bool is_multicast) * @param IP protocol * @param name of interface to bind to, or null otherwise */ -msgClassTx::msgClassTx(const char* address, int port, int proto, const char* interface) +msgClassTx::msgClassTx(const char* address, int port, int proto, const char* interface, bool quiet) { this->dst_addr = new msgClassAddr(address, port, proto); this->src_addr = new msgClassAddr(address, port, proto); if(interface) { - ilog ("Creating %s socket on port %d with address: %s\n", interface, port, address); + if ( !quiet) + { + ilog ("Creating %s socket on port %d with address: %s\n", interface, port, address); + } this->interface = new char[strlen(interface)+1]; snprintf(this->interface, strlen(interface)+1, "%s", interface); } else { - ilog ("Creating socket on port %d with address: %s\n", port, address); + if ( !quiet) + { + ilog ("Creating socket on port %d with address: %s\n", port, address); + } this->interface = NULL; } this->return_status = initSocket(); diff --git a/mtce-common/src/common/msgClass.h b/mtce-common/src/common/msgClass.h index cbeee29f..abcc88a7 100644 --- a/mtce-common/src/common/msgClass.h +++ b/mtce-common/src/common/msgClass.h @@ -207,7 +207,7 @@ private: class msgClassTx : public msgClassSock { public: - msgClassTx(const char* address, int port, int proto, const char* interface=NULL); + msgClassTx(const char* address, int port, int proto, const char* interface=NULL, bool quiet=false); msgClassTx(const msgClassAddr& addr, const char* interface=NULL); private: int initSocket(); diff --git a/mtce-common/src/common/pingUtil.cpp b/mtce-common/src/common/pingUtil.cpp index 01edd64d..308ebb02 100644 --- a/mtce-common/src/common/pingUtil.cpp +++ b/mtce-common/src/common/pingUtil.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017 Wind River Systems, Inc. + * Copyright (c) 2015-2017,2020 Wind River Systems, Inc. * * SPDX-License-Identifier: Apache-2.0 * @@ -89,6 +89,7 @@ int pingUtil_init ( string hostname, ping_info_type & ping_info, const char * ip ping_info.sequence = getpid() ; ping_info.recv_retries = 0 ; ping_info.send_retries = 0 ; + ping_info.fail_debounce = 0 ; ping_info.requested= false ; ping_info.received = false ; ping_info.recv_flush_highwater=2; @@ -97,7 +98,7 @@ int pingUtil_init ( string hostname, ping_info_type & ping_info, const char * ip ping_info.monitoring = false ; ping_info.stage = PINGUTIL_MONITOR_STAGE__OPEN ; - ping_info.sock = new msgClassTx(ip_address, 0, IPPROTO_RAW, NULL ); + ping_info.sock = new msgClassTx(ip_address, 0, IPPROTO_RAW, NULL, true); /* Validate the socket setup */ if ( ping_info.sock == NULL ) @@ -194,7 +195,6 @@ int pingUtil_send ( ping_info_type & ping_info ) } ping_info.sequence++ ; - ping_info.recv_retries = 0; if ( ping_info.ipv6_mode == false ) { @@ -238,8 +238,6 @@ int pingUtil_send ( ping_info_type & ping_info ) bytes = ping_info.sock->write( (const char*)&ping6_tx, sizeof(ping6_tx_message_type)); } - ping_info.recv_retries = 0; - if ( bytes <= 0 ) { wlog ("%s ping %s send failed (rc:%d) (%d:%m)\n", ping_info.hostname.c_str(), ping_info.ip.c_str(), bytes, errno ); @@ -508,10 +506,19 @@ int pingUtil_recv ( ping_info_type & ping_info, if ( ping_info.monitoring == false ) { /* ... only want the log when we are first connecting */ - ilog ("%s ping to %s ok ; (try %d)", + if ( ping_info.recv_retries ) + { + ilog ("%s ping %s ok ; (try %d)", + ping_info.hostname.c_str(), + ping_info.ip.c_str(), + ping_info.recv_retries+1); + } + else + { + ilog ("%s ping %s ok", ping_info.hostname.c_str(), - ping_info.ip.c_str(), - ping_info.recv_retries+1); + ping_info.ip.c_str()); + } } else { @@ -575,6 +582,7 @@ void pingUtil_fini ( ping_info_type & ping_info ) ping_info.recv_retries = 0; ping_info.send_retries = 0; + ping_info.fail_debounce = 0; ping_info.sequence = 0; ping_info.identity = 0; @@ -585,10 +593,7 @@ void pingUtil_fini ( ping_info_type & ping_info ) void pingUtil_restart ( ping_info_type & ping_info ) { - ilog ("%s ping monitor restart", ping_info.hostname.c_str()); - ping_info.ok = false ; - ping_info.send_retries = 0 ; - ping_info.monitoring = false ; + dlog ("%s ping monitor restart", ping_info.hostname.c_str()); pingUtil_fini (ping_info); pingUtil_init (ping_info.hostname, ping_info, ping_info.ip.data()); @@ -668,7 +673,7 @@ int pingUtil_acc_monitor ( ping_info_type & ping_info ) if ( ++ping_info.send_retries > PING_MAX_SEND_RETRIES ) { - elog ("%s ping to %s failed\n", + dlog ("%s ping %s failed ; max send retries\n", ping_info.hostname.c_str(), ping_info.ip.c_str()); @@ -676,7 +681,9 @@ int pingUtil_acc_monitor ( ping_info_type & ping_info ) } else if ( pingUtil_send ( ping_info ) ) { - elog ("%s failed to send bmc ping\n", ping_info.hostname.c_str()); + wlog ("%s ping %s send failed\n", + ping_info.hostname.c_str(), + ping_info.ip.c_str()); ping_info.stage = PINGUTIL_MONITOR_STAGE__FAIL ; } else @@ -694,6 +701,10 @@ int pingUtil_acc_monitor ( ping_info_type & ping_info ) mtcTimer_reset ( ping_info.timer ); } mtcTimer_start_msec ( ping_info.timer, ping_info.timer_handler, PING_WAIT_TIMER_MSEC ); + + /* send was ok so clear its retry counter */ + ping_info.send_retries = 0 ; + ping_info.stage = PINGUTIL_MONITOR_STAGE__RECV ; } } @@ -711,25 +722,17 @@ int pingUtil_acc_monitor ( ping_info_type & ping_info ) { if ( ++ping_info.recv_retries > (PING_MAX_RECV_RETRIES) ) { - /* only print this log once on the resend attempt */ - if ( ping_info.send_retries >= PING_MAX_SEND_RETRIES ) - { - mlog ("%s ping recv from %s missed ; identity:%04x sequence:%04x (try %d of %d)\n", - ping_info.hostname.c_str(), - ping_info.ip.c_str(), - ping_info.identity, - ping_info.sequence, - ping_info.recv_retries-1, - PING_MAX_RECV_RETRIES); - } - ping_info.stage = PINGUTIL_MONITOR_STAGE__SEND ; - break ; + ping_info.recv_retries = 0; + dlog ("%s ping %s failed ; max recv retries\n", + ping_info.hostname.c_str(), + ping_info.ip.c_str()); + ping_info.stage = PINGUTIL_MONITOR_STAGE__FAIL ; } else { - blog1 ("%s retrying ping\n", ping_info.hostname.c_str()); + blog1 ("%s retrying ping\n", ping_info.hostname.c_str()); + mtcTimer_start_msec ( ping_info.timer, ping_info.timer_handler, PING_RETRY_DELAY_MSECS ); } - mtcTimer_start_msec ( ping_info.timer, ping_info.timer_handler, PING_RETRY_DELAY_MSECS ); } else { @@ -746,13 +749,7 @@ int pingUtil_acc_monitor ( ping_info_type & ping_info ) ping_info.send_retries = 0 ; ping_info.recv_retries = 0 ; - -#ifdef WANT_FIT_TESTING - if ( daemon_want_fit ( FIT_CODE__FAST_PING_AUDIT_HOST, ping_info.hostname ) == true ) - interval = 3 ; - if ( daemon_want_fit ( FIT_CODE__FAST_PING_AUDIT_ALL ) == true ) - interval = 3 ; -#endif + ping_info.fail_debounce = 0 ; mtcTimer_start ( ping_info.timer, ping_info.timer_handler, interval ); ping_info.stage = PINGUTIL_MONITOR_STAGE__WAIT ; } @@ -766,14 +763,30 @@ int pingUtil_acc_monitor ( ping_info_type & ping_info ) } case PINGUTIL_MONITOR_STAGE__FAIL: { - ping_info.ok = false ; - ping_info.send_retries = 0 ; - ping_info.monitoring = false ; - pingUtil_fini (ping_info); - pingUtil_init (ping_info.hostname, ping_info, ping_info.ip.data()); + // Assume socket reinit unless thresholding */ + bool reinit = true ; + if ( ping_info.ok == true ) + { + if ( ++ping_info.fail_debounce < PING_FAIL_DEBOUNCE_THLD ) + { + reinit = false ; + wlog("%s ping %s miss (%d of %d)", + ping_info.hostname.c_str(), + ping_info.ip.c_str(), + ping_info.fail_debounce, + PING_FAIL_DEBOUNCE_THLD ); + } + } + if ( reinit == true ) + { + elog("%s ping %s fail", ping_info.hostname.c_str(), + ping_info.ip.c_str()); + pingUtil_fini (ping_info); + pingUtil_init (ping_info.hostname, ping_info, ping_info.ip.data()); + } mtcTimer_reset ( ping_info.timer ); - mtcTimer_start ( ping_info.timer, ping_info.timer_handler, PING_MONITOR_INTERVAL ); + mtcTimer_start ( ping_info.timer, ping_info.timer_handler, PING_FAIL_RETRY_DELAY ); ping_info.stage = PINGUTIL_MONITOR_STAGE__WAIT; break ; } diff --git a/mtce-common/src/common/pingUtil.h b/mtce-common/src/common/pingUtil.h index 045ffb7e..7a9e4f22 100644 --- a/mtce-common/src/common/pingUtil.h +++ b/mtce-common/src/common/pingUtil.h @@ -2,7 +2,7 @@ #define __INCLUDE_PINGUTIL_H__ /* - * Copyright (c) 2015-2017 Wind River Systems, Inc. + * Copyright (c) 2015-2017,2020 Wind River Systems, Inc. * * SPDX-License-Identifier: Apache-2.0 * @@ -47,8 +47,8 @@ using namespace std; #define PING_WAIT_TIMER_MSEC (200) #define PING_RETRY_DELAY_MSECS (200) #define PING_MONITOR_INTERVAL (60) -#define PING_MISS_RETRY_DELAY (5) - +#define PING_FAIL_RETRY_DELAY (30) +#define PING_FAIL_DEBOUNCE_THLD (3) #define PING_MESSAGE_LEN (80) typedef enum @@ -73,6 +73,7 @@ typedef struct unsigned short sequence ; int send_retries ; int recv_retries ; + int fail_debounce ; bool ipv6_mode ; bool received ; bool requested ; diff --git a/mtce/centos/build_srpm.data b/mtce/centos/build_srpm.data index 7bbb93a8..5bc6d613 100644 --- a/mtce/centos/build_srpm.data +++ b/mtce/centos/build_srpm.data @@ -1,3 +1,3 @@ SRC_DIR="src" -TIS_PATCH_VER=156 +TIS_PATCH_VER=157 BUILD_IS_SLOW=5 diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index d3775089..ae43fe64 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2016 Wind River Systems, Inc. + * Copyright (c) 2013-2020 Wind River Systems, Inc. * * SPDX-License-Identifier: Apache-2.0 * @@ -4322,7 +4322,9 @@ void nodeLinkClass::bmc_access_data_init ( struct nodeLinkClass::node * node_ptr node_ptr->bm_ping_info.stage = PINGUTIL_MONITOR_STAGE__OPEN ; mtcTimer_reset ( node_ptr->bm_ping_info.timer ); node_ptr->bm_ping_info.timer_handler = &mtcTimer_handler ; + node_ptr->bm_ping_info.ip = node_ptr->bm_ip ; + node_ptr->bm_ping_info.hostname = node_ptr->hostname ; node_ptr->bmc_protocol_learning = false ; diff --git a/mtce/src/hwmon/hwmonClass.cpp b/mtce/src/hwmon/hwmonClass.cpp index ab131c4f..7ccef7b5 100644 --- a/mtce/src/hwmon/hwmonClass.cpp +++ b/mtce/src/hwmon/hwmonClass.cpp @@ -85,6 +85,9 @@ void hwmonHostClass::bmc_data_init ( struct hwmonHostClass::hwmon_host * host_pt host_ptr->accessible = false; host_ptr->degraded = false ; + host_ptr->ping_info.ip = host_ptr->bm_ip ; + host_ptr->ping_info.hostname = host_ptr->hostname ; + hwmon_del_groups ( host_ptr ); hwmon_del_sensors ( host_ptr ); diff --git a/mtce/src/hwmon/hwmonFsm.cpp b/mtce/src/hwmon/hwmonFsm.cpp index 0971a944..dcecacaa 100644 --- a/mtce/src/hwmon/hwmonFsm.cpp +++ b/mtce/src/hwmon/hwmonFsm.cpp @@ -96,25 +96,9 @@ void hwmonHostClass::hwmon_fsm ( void ) pingUtil_acc_monitor ( host_ptr->ping_info ); - if ( host_ptr->ping_info.ok == false ) - { - /* Auto correct key ping information ; should never occur but if it does ... */ - if (( host_ptr->ping_info.hostname.empty()) || - ( hostUtil_is_valid_ip_addr(host_ptr->ping_info.ip ) == false )) - { - slog ("%s host ping info missing ; (%d:%d)\n", - host_ptr->hostname.c_str(), - host_ptr->ping_info.hostname.empty(), - host_ptr->ping_info.ip.empty()); - - host_ptr->ping_info.hostname = host_ptr->hostname ; - host_ptr->ping_info.ip = host_ptr->bm_ip ; - } - } - /* Check to see if sensor monitoring for this host is * disabled or the bm password has not yet been learned */ - else if (( host_ptr->monitor == false ) || ( host_ptr->bm_pw.empty())) + if (( host_ptr->monitor == false ) || ( host_ptr->bm_pw.empty())) { /* ... make sure the thread sits in the * idle state while disabled or there diff --git a/mtce/src/hwmon/hwmonHdlr.cpp b/mtce/src/hwmon/hwmonHdlr.cpp index 4eae4eb8..f06506f2 100644 --- a/mtce/src/hwmon/hwmonHdlr.cpp +++ b/mtce/src/hwmon/hwmonHdlr.cpp @@ -1165,12 +1165,12 @@ int hwmonHostClass::bmc_sensor_monitor ( struct hwmonHostClass::hwmon_host * hos { if ( host_ptr->bmc_thread_info.status == FAIL_SYSTEM_CALL ) { - elog ("%s '%s' system call failed (retry %d of %d) (%s)", + elog ("%s '%s' system call failed (retry %d of %d)", host_ptr->bmc_thread_info.log_prefix, bmcUtil_getCmd_str( host_ptr->bmc_thread_info.command).c_str(), host_ptr->bmc_thread_ctrl.retries, - MAX_THREAD_RETRIES, host_ptr->thread_extra_info.bm_pw.c_str()); + MAX_THREAD_RETRIES); } else if (( host_ptr->bmc_thread_ctrl.retries == (MAX_THREAD_RETRIES-1)) || ( host_ptr->bmc_thread_ctrl.retries == 1 )) diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index 07777c85..16951d11 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2018 Wind River Systems, Inc. + * Copyright (c) 2013-2020 Wind River Systems, Inc. * * SPDX-License-Identifier: Apache-2.0 * @@ -3901,7 +3901,7 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr ) if ( node_ptr->bmc_accessible == false ) { - wlog ("%s Power Off request rejected ; BMC not accessible ; retry in %d seconds \n", + wlog ("%s Reset request rejected ; BMC not accessible ; retry in %d seconds \n", node_ptr->hostname.c_str(), MTC_POWER_ACTION_RETRY_DELAY); @@ -6185,29 +6185,10 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr ) pingUtil_acc_monitor ( node_ptr->bm_ping_info ); } - /***************************************************************** - * Manage bmc creds refresh - ****************************************************************/ - if ( node_ptr->bm_ping_info.ok == false ) - { - /* Auto correct key ping information ; - * should never occur but if it does ... */ - if (( node_ptr->bm_ping_info.hostname.empty()) || - ( node_ptr->bm_ping_info.ip.empty())) - { - node_ptr->bm_ping_info.hostname = node_ptr->hostname ; - node_ptr->bm_ping_info.ip = node_ptr->bm_ip ; - } - if ( ! node_ptr->bm_pw.empty() ) - { - node_ptr->bm_pw.clear(); - } - } - /***************************************************************** * Manage getting the bm password but only when ping is ok ****************************************************************/ - else if ( node_ptr->bm_pw.empty() ) + if ( node_ptr->bm_pw.empty() ) { barbicanSecret_type * secret = secretUtil_manage_secret( node_ptr->secretEvent, node_ptr->hostname,