From 649e94c8da682b94fd130fa5389ecaf496c96678 Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Wed, 3 Apr 2024 18:13:08 +0000 Subject: [PATCH] Add pxeboot mtcAlive messaging alarm handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This update adds alarm handling to the recently introduced pxeboot network mtcAlive messaging, see the first Depends-On review below. A new 200.003 maintenance alarm is introduced with the second Depends-On update below. This new alarm is MINOR but also Management Affecting because the pxeboot network is required for node installation. This update enhances the new pxeboot_mtcAlive_monitor FSM for the purpose of detecting pxeboot mtcAlive message loss, alarming and then clearing the alarm once pxeboot mtcAlive messaging resumes. The new alarm assertion and clear are debounced: - alarm is asserted if message loss persists to the accumulation of 12 missed messages or after 2 minutes of complete message loss. - alarm is cleared after decrementing the message missed counter to zero or 1 minute of loss-less messaging. Upgrades are supported with the addition of a features list to the mtcClient ready event. All new mtcClients that support pxeboot network messaging now publish pxeboot mtcAlive support through this new features list. This is rendered in the logs like this: mtcClient ready ; with pxeboot mtcAlive support The mtcAgent does not expect/monitor pxeboot mtcAlive messages from hosts that don't publish the feature support. Test Plan: PASS: Verify mtcAlive period is 5 seconds. PASS: Verify pxeboot mtcAlive monitor period is 10 seconds. PASS: Verify mtcAgent sends mtcClient a mtcAlive request on every mtcAlive monitor miss. PASS: Verify pxeboot mtcAlive alarm is not raised while a node is locked. Alarm attributes: PASS: Verify severity is minor. PASS: Verify alarm is cleared while node is locked. PASS: Verify alarm can be suppressed while unlocked. PASS: Verify asserted alarm is management affecting. 
PASS: Verify alarm-show output format including cause and repair action text. Process Restart Handling: PASS: Verify alarm is maintained over a mtcAgent process restart. PASS: Verify pxeboot monitoring resumes with or without asserted alarm immediately following a mtcAgent process restart. PASS: Verify mtcClient learns and starts pxeboot mtcAlive messaging immediately following mtcClient process restart for locked or unlocked nodes. Alarm Debounce Handling: PASS: Verify alarm assertion only after 2 minutes of mtcAlive loss. PASS: Verify alarm clear after 1 minute of mtcAlive recovery. PASS: Verify assertion and recovery debounce logging. PASS: Verify alarm management miss and loss controls handle all boundary conditions exercised by a 12 hr soak with randomized period between message loss and recovery. Host Action Handling: PASS: Verify mtcAlive alarm is not raised over a Host Unlock Enable. PASS: Verify mtcAlive alarm is not raised over a Host Graceful Recovery. PASS: Verify mtcAlive alarm is not raised over a Host Power Off/On. PASS: Verify mtcAlive alarm is not raised over a Host Reboot/Reset. PASS: Verify mtcAlive alarm is not raised over a Host Reinstall. PASS: Verify pxeboot mtcAlive is factored into Host Offline Handling. PASS: Verify pxeboot alarm handling for node that does not send pxeboot mtcAlive after unlock. Stuck Alarm Avoidance Handling: PASS: Verify typical alarm assertion and clear handling. PASS: Verify alarm is maintained or cleared over node reboot if the messaging issue persists or resolves over the reboot recovery. PASS: Verify mtcAlive alarm is maintained over a Swact and cleared if the messaging is ok on the newly active controller. PASS: Verify mtcAlive alarm assertion recovery case over uncontrolled Swact due to active controller reboot. PASS: Verify alarm is cleared over a spontaneous reboot if pxeboot messaging recovers over that reboot. 
Upgrades Case: PASS: Verify pxeboot mtcAlive monitoring only occurs on mtcClients that actually support pxeboot network mtcAlive monitoring. PASS: Verify mtcClient new features list parsing, which enables pxeboot mtcAlive monitoring for that node. PASS: Verify pxeboot mtcAlive messaging monitoring is not enabled towards nodes whose mtcClient does not publish pxeboot mtcAlive messaging feature support. PROG: Verify AIO DX upgrade from 22.12 to current master branch. Focus on pxeboot messaging over the upgrade process. Depends-On: https://review.opendev.org/c/starlingx/metal/+/912654 Depends-On: https://review.opendev.org/c/starlingx/fault/+/914660 Story: 2010940 Task: 49542 Change-Id: I1b51ad9ebcf010f5dee9a86c0295be3da6e2f9b1 Signed-off-by: Eric MacDonald --- mtce-common/src/common/alarmUtil.h | 3 +- mtce-common/src/common/fitCodes.h | 1 + mtce-common/src/common/nodeBase.h | 5 + mtce/src/alarm/alarm.h | 57 +------ mtce/src/common/nodeClass.cpp | 188 +++++++++++++++++++--- mtce/src/common/nodeClass.h | 33 +++- mtce/src/maintenance/mtcAlarm.cpp | 31 +++- mtce/src/maintenance/mtcAlarm.h | 3 +- mtce/src/maintenance/mtcCompMsg.cpp | 53 ++++++- mtce/src/maintenance/mtcCtrlMsg.cpp | 17 +- mtce/src/maintenance/mtcNodeComp.cpp | 80 ++++++++-- mtce/src/maintenance/mtcNodeComp.h | 5 + mtce/src/maintenance/mtcNodeFsm.cpp | 14 +- mtce/src/maintenance/mtcNodeHdlrs.cpp | 219 +++++++++++++++++++------- 14 files changed, 549 insertions(+), 160 deletions(-) diff --git a/mtce-common/src/common/alarmUtil.h b/mtce-common/src/common/alarmUtil.h index c2f7faed..49430740 100644 --- a/mtce-common/src/common/alarmUtil.h +++ b/mtce-common/src/common/alarmUtil.h @@ -2,7 +2,7 @@ #define __ALARMUTIL_H__ /* - * Copyright (c) 2013, 2016 Wind River Systems, Inc. + * Copyright (c) 2013, 2016, 2024 Wind River Systems, Inc. 
* * SPDX-License-Identifier: Apache-2.0 * @@ -31,6 +31,7 @@ #define SWERR_ALARM_ID ((const char *)"200.000") /* Do No Use */ #define LOCK_ALARM_ID ((const char *)"200.001") +#define MTCALIVE_ALARM_ID ((const char *)"200.003") #define ENABLE_ALARM_ID ((const char *)"200.004") #define MGMNT_HB_ALARM_ID ((const char *)"200.005") #define PMOND_ALARM_ID ((const char *)"200.006") diff --git a/mtce-common/src/common/fitCodes.h b/mtce-common/src/common/fitCodes.h index 91009e42..d5a0480c 100644 --- a/mtce-common/src/common/fitCodes.h +++ b/mtce-common/src/common/fitCodes.h @@ -120,6 +120,7 @@ #define FIT_CODE__FORCE_LOCK_HOST (32) #define FIT_CODE__UNLOCK_HOST (33) #define FIT_CODE__FAIL_SWACT (34) +#define FIT_CODE__FAIL_PXEBOOT_MTCALIVE (35) #define FIT_CODE__FM_SET_ALARM (40) #define FIT_CODE__FM_GET_ALARM (41) diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h index 5a14eeb9..3fc4e867 100755 --- a/mtce-common/src/common/nodeBase.h +++ b/mtce-common/src/common/nodeBase.h @@ -223,6 +223,10 @@ typedef enum #define MTC_JSON_SEVERITY "severity" #define MTC_JSON_SENSOR "sensor" #define MTC_JSON_PROCESS "process" +#define MTC_JSON_FEATURES "features" + +/* Used by the mtcCLient with the MTC_JSON_FEATURES label above */ +#define MTC_PXEBOOT_MTCALIVE "pxeboot_mtcAlive" /* Mtce Info Keys */ #define MTCE_INFO_KEY__BMC_PROTOCOL "bmc_protocol" @@ -614,6 +618,7 @@ typedef struct #define MTC_SERVICE_PMOND (0xB00BF00D) #define MTC_SERVICE_HWMOND (0xF00BF00D) #define MTC_SERVICE_HEARTBEAT (0xBABEF00D) +#define MTC_SERVICE_MTCCLIENT (0xABCDF00D) /** process to process loopback command */ #define MTC_EVENT_LOOPBACK (0x01010101) diff --git a/mtce/src/alarm/alarm.h b/mtce/src/alarm/alarm.h index a9895ac4..f20e640a 100644 --- a/mtce/src/alarm/alarm.h +++ b/mtce/src/alarm/alarm.h @@ -2,7 +2,7 @@ #define __INCLUDE_ALARM_H__ /* - * Copyright (c) 2016-2017 Wind River Systems, Inc. + * Copyright (c) 2016-2017, 2024 Wind River Systems, Inc. 
* * SPDX-License-Identifier: Apache-2.0 * @@ -14,38 +14,17 @@ */ #include "nodeBase.h" -#include "nodeUtil.h" /* for ... common utilities */ - +#include "nodeUtil.h" /* for ... common utilities */ +#include "alarmUtil.h" /* for ... common alarm definitions */ #include "msgClass.h" /* for ... msgClassSock type definition */ /* external APIs */ #include "fmAPI.h" -#define ENTITY_PREFIX ((const char *)"host=") - #define MAX_ALARM_REQ_PER_MSG (4) #define MAX_ALARM_REQ_MSG_SIZE (500) #define MAX_ALARM_REQ_SIZE (MAX_ALARM_REQ_PER_MSG*MAX_ALARM_REQ_MSG_SIZE) -#define SWERR_ALARM_ID ((const char *)"200.000") /* Do No Use */ -#define LOCK_ALARM_ID ((const char *)"200.001") -#define ENABLE_ALARM_ID ((const char *)"200.004") -#define MGMNT_HB_ALARM_ID ((const char *)"200.005") -#define PMOND_ALARM_ID ((const char *)"200.006") -#define SENSOR_ALARM_ID ((const char *)"200.007") /* Sensor read alarm ; i.e. the sensor read value bad */ -#define CLSTR_HB_ALARM_ID ((const char *)"200.009") -#define BM_ALARM_ID ((const char *)"200.010") -#define CONFIG_ALARM_ID ((const char *)"200.011") -#define CH_COMP_ALARM_ID ((const char *)"200.013") /* Combo Host Compute Failure - on last Controller */ -#define SENSORCFG_ALARM_ID ((const char *)"200.014") /* Sensor configuration alarm ; i.e. could not add */ -#define SENSORGROUP_ALARM_ID ((const char *)"200.015") /* Sensor Group Read Error */ -#define LUKS_ALARM_ID ((const char *)"200.016") /* LUKS volume failure alarm */ - -#define EVENT_LOG_ID ((const char *)"200.020") -#define COMMAND_LOG_ID ((const char *)"200.021") -#define STATECHANGE_LOG_ID ((const char *)"200.022") -#define SERVICESTATUS_LOG_ID ((const char *)"200.023") /* log used to report service failure events against */ - /** Heartbeat Alarm Abstract Reference IDs */ typedef enum @@ -85,17 +64,6 @@ int alarm_log ( string hostname, const char * id_ptr, string entity ); #else -typedef struct -{ - SFmAlarmDataT alarm ; - string name ; - string instc_prefix ; /* Instance prefix i.e. 
"=sensor." or "=process." */ - string critl_reason ; - string minor_reason ; - string major_reason ; - string clear_reason ; -} alarmUtil_type ; - typedef struct { string alarmid ; @@ -133,10 +101,6 @@ echo "${STR}" | socat - ${PROTOCOL}:${ADDRESS}:${port} void alarmData_init ( void ); alarmUtil_type * alarmData_getAlarm_ptr ( string alarm_id_str ); -/* in alarmUtil.cpp */ -// EFmAlarmSeverityT mtcAlarm_state ( string hostname, alarm_id_enum id ); - - /* in alarmHdlr.cpp */ int alarmHdlr_request_handler ( char * msg_ptr ); @@ -144,21 +108,6 @@ void alarmMgr_queue_clear ( void ); void alarmMgr_queue_alarm (queue_entry_type entry); void alarmMgr_service_queue(void); -/* Clear all alarms against this host */ -void alarmUtil_clear_all ( string hostname ); - -/** - * Query the specified alarm severity level. - * Severity levels are specified in fmAPI.h - **/ -EFmAlarmSeverityT alarmUtil_query ( string hostname, - string identity, - string instance ); - -int alarmUtil_query_identity ( string identity, - SFmAlarmDataT * alarm_list_ptr, - unsigned int alarms_max ); - int alarmUtil_clear ( string hostname, string alarm_id, string entity ); int alarmUtil_critical ( string hostname, string alarm_id, string entity, FMTimeT & timestamp ); int alarmUtil_major ( string hostname, string alarm_id, string entity, FMTimeT & timestamp ); diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index 4d5e7d47..f9e2fe36 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -19,7 +19,7 @@ #include /* for ENODEV, EFAULT and ENXIO */ #include /* for close and usleep */ #include - +#include /* for ... json_tokener_parse and other json utils */ using namespace std; #ifdef __AREA__ @@ -564,6 +564,11 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname ) ptr->mtcAlive_clstr = false ; ptr->mtcAlive_pxeboot = false ; + /* Assume the node's mtcClient does not support pxeboot mtcAlive + * messaging until that mtcClient reports that it does. 
+ * This bool blocks the pxeboot_mtcAlive_monitor fsm. */ + ptr->pxeboot_mtcAlive_supported = false ; + /* These counts are incremented in the set_mtcAlive member * function and cleared in the reset progression handler. */ ptr->mtcAlive_mgmnt_count = 0 ; @@ -574,8 +579,9 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname ) for (int i = 0 ; i < MTCALIVE_INTERFACES_MAX ; i++) { ptr->mtcAlive_sequence[i] = + ptr->mtcAlive_loss_count[i] = ptr->mtcAlive_sequence_save[i] = - ptr->mtcAlive_sequence_miss[i] = + ptr->mtcAlive_miss_count[i] = ptr->mtcAlive_log_throttle [i] = 0 ; } ptr->pxeboot_mtcAlive_not_seen_log_throttle = 0 ; @@ -1886,6 +1892,66 @@ int nodeLinkClass::alarm_compute_clear ( struct nodeLinkClass::node * node_ptr, return (PASS); } +/***************************************************************************** + * + * Name : alarm_mtcAlive_failure, alarm_mtcAlive_clear + * + * Purpose : Generate a log and assert minor or clear alarm for mtcAlive + * messaging failures. + * + * Assumptions: Degrade not supported for minor alarms. + * + * Limitations: Only pxeboot messaging is currently supported. + * Need to introduce an network mtcAlive alarm array to track which + * ones are active if/when it comes time to support alarming of + * mtcAlive failures on other networks. 
+ * + *****************************************************************************/ +int nodeLinkClass::alarm_mtcAlive_failure ( struct nodeLinkClass::node * node_ptr, int network ) +{ + int rc = PASS ; + if ( network != PXEBOOT_INTERFACE ) + { + slog ("%s mtcAlive alarm not supported for %s network", + node_ptr->hostname.c_str(), + get_iface_name_str(network)); + rc = FAIL_INVALID_OPERATION ; + } + else if ( node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] != FM_ALARM_SEVERITY_MINOR ) + { + wlog ("%s minor %s mtcAlive messaging failure", + node_ptr->hostname.c_str(), + get_iface_name_str(network)); + + mtcAlarm_minor ( node_ptr->hostname, MTC_ALARM_ID__MTCALIVE ); + node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] = FM_ALARM_SEVERITY_MINOR ; + } + return (rc); +} + +int nodeLinkClass::alarm_mtcAlive_clear ( struct nodeLinkClass::node * node_ptr, int network ) +{ + int rc = PASS ; + if ( network != PXEBOOT_INTERFACE ) + { + slog ("%s mtcAlive alarm not supported for %s network", + node_ptr->hostname.c_str(), + get_iface_name_str(network)); + rc = FAIL_INVALID_OPERATION ; + } + else if ( node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] != FM_ALARM_SEVERITY_CLEAR ) + { + ilog ("%s minor %s mtcAlive messaging failure", + node_ptr->hostname.c_str(), + get_iface_name_str(network)); + + mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__MTCALIVE ); + node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] = FM_ALARM_SEVERITY_CLEAR ; + node_ptr->mtcAlive_loss_count[network] = 0 ; + } + return (rc); +} + /** Host Operational State Change public member function */ int nodeLinkClass::oper_subf_state_change ( string hostname, string newOperState ) { @@ -3086,6 +3152,7 @@ void nodeLinkClass::clear_service_readies ( struct nodeLinkClass::node * node_pt ilog ("%s clearing service ready events\n", node_ptr->hostname.c_str()); node_ptr->hbsClient_ready = false ; node_ptr->pmond_ready = false ; + node_ptr->mtcClient_ready = false ; } } } @@ -3952,6 +4019,25 @@ int nodeLinkClass::set_clstr_hostaddr ( string & hostname, 
string & ip ) return ( rc ); } +int nodeLinkClass::set_pxeboot_mtcAlive_support ( string hostname, bool state ) +{ + int rc = FAIL_HOSTNAME_LOOKUP ; + nodeLinkClass::node* node_ptr = nodeLinkClass::getNode ( hostname ); + if ( node_ptr != NULL ) + { + if ( state != node_ptr->pxeboot_mtcAlive_supported ) + mtcAliveStageChange (node_ptr, MTC_MTCALIVE__START); + + dlog ("%s pxeboot mtcAlive %s supported", + node_ptr->hostname.c_str(), + state ? "is" : "is not"); + + node_ptr->pxeboot_mtcAlive_supported = state ; + rc = PASS ; + } + return ( rc ); +} + string nodeLinkClass::get_hostname ( string hostaddr ) { if (( hostaddr == LOOPBACK_IPV6 ) || @@ -4197,7 +4283,7 @@ void nodeLinkClass::set_mtcAlive ( struct nodeLinkClass::node * node_ptr, unsign } if ( state_change ) { - ilog ("%s mtcAlive received from %s network with uptime:%d ; seq:%d", + ilog ("%s mtcAlive received from %s network with uptime:%d ; seq:%d ; state change", node_ptr->hostname.c_str(), get_iface_name_str(iface), node_ptr->uptime, @@ -4208,22 +4294,32 @@ void nodeLinkClass::set_mtcAlive ( struct nodeLinkClass::node * node_ptr, unsign { if ( sequence < node_ptr->mtcAlive_sequence[iface]+1 ) { - wlog ("%s mtcAlive received from %s network with uptime:%d ; out-of-sequence ; expect:%d detect:%d ; correcting", - node_ptr->hostname.c_str(), - get_iface_name_str(iface), - node_ptr->uptime, - node_ptr->mtcAlive_sequence[iface]+1, - sequence); + // Don't warn log for mtcClient restart cases. + // ... indicated by a very low sequence number. 
+ if ( sequence > 2 ) + { + wlog ("%s mtcAlive received from %s network with uptime:%d ; out-of-sequence ; expect:%d detect:%d ; correcting", + node_ptr->hostname.c_str(), + get_iface_name_str(iface), + node_ptr->uptime, + node_ptr->mtcAlive_sequence[iface]+1, + sequence); + } } else { - wlog ("%s mtcAlive received from %s network with uptime:%d ; missed %d mtcalive msgs ; expect:%d detect:%d ; correcting", - node_ptr->hostname.c_str(), - get_iface_name_str(iface), - node_ptr->uptime, - sequence-(node_ptr->mtcAlive_sequence[iface]+1), - node_ptr->mtcAlive_sequence[iface]+1, - sequence); + // Don't warn log for mtcAgent restart cases. + // ... indicated by expecting 1 and detecting a large number. + if ( node_ptr->mtcAlive_sequence[iface] > 0 ) + { + wlog ("%s mtcAlive received from %s network with uptime:%d ; missed %d mtcAlive msgs ; expect:%d detect:%d ; correcting", + node_ptr->hostname.c_str(), + get_iface_name_str(iface), + node_ptr->uptime, + sequence-(node_ptr->mtcAlive_sequence[iface]+1), + node_ptr->mtcAlive_sequence[iface]+1, + sequence); + } } } else @@ -5659,7 +5755,8 @@ void nodeLinkClass::manage_heartbeat_minor ( string hostname, iface_enum iface, /** Interface to declare that a key service on the * specified host is up, running and ready */ int nodeLinkClass::declare_service_ready ( string & hostname, - unsigned int service ) + unsigned int service, + string feature_list ) { nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname ); if ( node_ptr == NULL ) @@ -5667,6 +5764,58 @@ int nodeLinkClass::declare_service_ready ( string & hostname, wlog ("%s Unknown Host\n", hostname.c_str()); return FAIL_UNKNOWN_HOSTNAME ; } + else if ( service == MTC_SERVICE_MTCCLIENT ) + { + + if ( ! feature_list.empty() ) + { + /* features is expected to be a list - ["feature 0", "feature 1", ..."] */ + struct json_object *json_obj = json_tokener_parse(feature_list.data()); + if ( json_obj != NULL ) + { + /* how many featiures are present ? 
*/ + int features = json_object_array_length(json_obj); + + dlog ("%s %s offers %d features", hostname.c_str(), MTC_SERVICE_MTCCLIENT_NAME, features); + + for ( int f = 0 ; f < features ; f++ ) + { + /* get the first element at index 0 */ + struct json_object *feature_obj_ptr = json_object_array_get_idx(json_obj, f); + + /* convert each element to a string */ + string feature = json_object_get_string(feature_obj_ptr); + dlog ("mtcClient feature %d: %s", f, feature.c_str()); + if ( feature == MTC_PXEBOOT_MTCALIVE ) + { + dlog ("%s %s supports pxeboot mtcAlive", hostname.c_str(), MTC_SERVICE_MTCCLIENT_NAME); + set_pxeboot_mtcAlive_support (hostname, true); + } + } + /* free the json object */ + json_object_put(json_obj); + } + else + { + dlog ("%s json object is NULL", hostname.c_str()); + } + } + else + { + dlog ("%s feature list is empty", hostname.c_str()); + } + + if ( node_ptr->mtcClient_ready == false ) + { + ilog ("%s %s ready %s", hostname.c_str(), + MTC_SERVICE_MTCCLIENT_NAME, + node_ptr->pxeboot_mtcAlive_supported ? "; with pxeboot mtcAlive support" : ""); + node_ptr->mtcClient_ready = true ; + } + if ( node_ptr->pxeboot_mtcAlive_supported ) + send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MTCALIVE, PXEBOOT_INTERFACE ); + return (PASS); + } else if ( service == MTC_SERVICE_PMOND ) { node_ptr->pmond_ready = true ; @@ -9580,7 +9729,7 @@ void nodeLinkClass::mem_log_mtcalive_pxeboot ( struct nodeLinkClass::node * node this->pxeboot_network_provisioned ? 'Y' : 'N', node_ptr->mtcAlive_pxeboot ? 'Y' : 'N', node_ptr->mtcAlive_timer.ring ? 
'Y' : 'N', - node_ptr->mtcAlive_sequence_miss [PXEBOOT_INTERFACE], + node_ptr->mtcAlive_miss_count [PXEBOOT_INTERFACE], node_ptr->mtcAlive_sequence [PXEBOOT_INTERFACE], node_ptr->mtcAlive_sequence_save [PXEBOOT_INTERFACE]); @@ -9590,9 +9739,10 @@ void nodeLinkClass::mem_log_mtcalive_pxeboot ( struct nodeLinkClass::node * node void nodeLinkClass::mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr ) { char str[MAX_MEM_LOG_DATA] ; - snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAlarm List:%s%s%s%s%s\n", + snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAlarm List:%s%s%s%s%s%s\n", node_ptr->hostname.c_str(), node_ptr->alarms[MTC_ALARM_ID__LOCK ] ? " Locked" : " .", + node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] ? " mtcAlive" : " .", node_ptr->alarms[MTC_ALARM_ID__CONFIG ] ? " Config" : " .", node_ptr->alarms[MTC_ALARM_ID__ENABLE ] ? " Enable" : " .", node_ptr->alarms[MTC_ALARM_ID__CH_COMP ] ? " Compute" : " .", diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h index 9536f00f..3f5a7825 100755 --- a/mtce/src/common/nodeClass.h +++ b/mtce/src/common/nodeClass.h @@ -335,9 +335,22 @@ private: /* tracks the sequence number of the last mtcAlive message */ unsigned int mtcAlive_sequence [MTCALIVE_INTERFACES_MAX] ; unsigned int mtcAlive_sequence_save[MTCALIVE_INTERFACES_MAX] ; - unsigned int mtcAlive_sequence_miss[MTCALIVE_INTERFACES_MAX] ; + unsigned int mtcAlive_miss_count [MTCALIVE_INTERFACES_MAX] ; unsigned int mtcAlive_log_throttle [MTCALIVE_INTERFACES_MAX] ; + /* mtcAlive miss, loss, alarm and log throttle definitions */ + #define PXEBOOT_MTCALIVE_MONITOR_RATE_SECS (MTC_ALIVE_TIMER*2) // monitor every 10 seconds + #define PXEBOOT_MTCALIVE_LOSS_THRESHOLD (6) // 6 misses or 1 minute if back-2-back + #define PXEBOOT_MTCALIVE_LOSS_ALARM_THRESHOLD (2) // 2 losses before recovery is 2 minutes + #define PXEBOOT_MTCALIVE_NOT_SEEN_LOG_THROTTLE (60) // not seen log every 10 minutes + #define PXEBOOT_MTCALIVE_LOSS_LOG_THROTTLE (60) // loss log every 10 minutes + + /* used 
to debounce mtcAlive loss alarm */ + unsigned int mtcAlive_loss_count [MTCALIVE_INTERFACES_MAX] ; + + /* indicates boolean support for pxeboot mtcAlive messaging */ + bool pxeboot_mtcAlive_supported ; + /* pxeboot mtcAlive monitor log throttles */ int pxeboot_mtcAlive_not_seen_log_throttle ; int pxeboot_mtcAlive_loss_log_throttle ; @@ -661,6 +674,9 @@ private: /** Host degraded due to loss of Process Monitor running flag */ bool pmon_degraded ; + /* Maintenance Client Ready */ + bool mtcClient_ready ; + /** Process Monitor Ready flag and degrade list */ bool pmond_ready ; @@ -1139,6 +1155,9 @@ private: int alarm_compute_clear ( struct nodeLinkClass::node * node_ptr, bool force ); int alarm_compute_failure ( struct nodeLinkClass::node * node_ptr , EFmAlarmSeverityT sev ); + int alarm_mtcAlive_clear ( struct nodeLinkClass::node * node_ptr, int network ); + int alarm_mtcAlive_failure( struct nodeLinkClass::node * node_ptr, int network ); + void clear_subf_failed_bools ( struct nodeLinkClass::node * node_ptr ); void clear_main_failed_bools ( struct nodeLinkClass::node * node_ptr ); void clear_hostservices_ctls ( struct nodeLinkClass::node * node_ptr ); @@ -1496,6 +1515,9 @@ public: /* set the pxeboot network address for any hostname */ int set_pxeboot_hostaddr ( string hostname, string ip ); + /* set the state of this node's pxeboot mtcAlive support */ + int set_pxeboot_mtcAlive_support ( string hostname, bool state ); + /** get hostname for any hostname */ string get_hostname ( string hostaddr ); @@ -1732,6 +1754,13 @@ public: * interface is on the 'lo' (localhost) interface. */ bool pxeboot_network_provisioned ; + /** A boolean that is used to indicate whether this node supports + * pxeboot mtcAlive messaging. + * This is needed to support upgrades to nodes that don't support + * this feature prior to their upgrade. 
+ * Assuming controllers are upgraded first.*/ + bool pxeboot_mtcAlive_supported ; + /** A debug bool hat allows cluster-host heartbeat failures to only * cause host degrade rather than failure */ bool clstr_degrade_only ; @@ -1925,7 +1954,7 @@ public: /** Interface to declare that a key service on the * specified host is up, running and ready */ - int declare_service_ready ( string & hostname, unsigned int service ); + int declare_service_ready ( string & hostname, unsigned int service, string features="" ); /** Process Monitor 'Clear' Event handler. * diff --git a/mtce/src/maintenance/mtcAlarm.cpp b/mtce/src/maintenance/mtcAlarm.cpp index be705d75..ce536cf1 100644 --- a/mtce/src/maintenance/mtcAlarm.cpp +++ b/mtce/src/maintenance/mtcAlarm.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017 Wind River Systems, Inc. + * Copyright (c) 2015-2017, 2024 Wind River Systems, Inc. * * SPDX-License-Identifier: Apache-2.0 * @@ -64,6 +64,32 @@ void mtcAlarm_init ( void ) snprintf( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, "Administratively unlock Host to bring it back in-service."); + /** pxeboot mtcAlive Alarm **************************************************/ + + ptr = &alarm_list[MTC_ALARM_ID__MTCALIVE]; + memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT))); + snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", LOCK_ALARM_ID); + + ptr->name = "mtcAlive" ; + ptr->instc_prefix = "" ; + + ptr->critl_reason = + ptr->major_reason = + ptr->minor_reason = "pxeboot network communication failure"; + ptr->clear_reason = "pxeboot network communication recovered"; + + ptr->alarm.alarm_type = FM_ALARM_COMM ; + ptr->alarm.probable_cause = FM_ALARM_CAUSE_UNKNOWN; + ptr->alarm.inhibit_alarms = FM_FALSE; + ptr->alarm.service_affecting = FM_FALSE; + ptr->alarm.suppression = FM_TRUE ; + + ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */ + ptr->alarm.alarm_state = FM_ALARM_STATE_CLEAR ; /* Dynamic */ + + snprintf( ptr->alarm.proposed_repair_action, 
FM_MAX_BUFFER_LENGTH, + "Administratively Lock and Unlock host to recover. If problem persists, contact next level of support."); + /** Enable Alarm ************************************************************/ ptr = &alarm_list[MTC_ALARM_ID__ENABLE]; @@ -339,6 +365,7 @@ string _getIdentity ( mtc_alarm_id_enum id ) switch ( id ) { case MTC_ALARM_ID__LOCK: return (LOCK_ALARM_ID); + case MTC_ALARM_ID__MTCALIVE: return (MTCALIVE_ALARM_ID); case MTC_ALARM_ID__CONFIG: return (CONFIG_ALARM_ID); case MTC_ALARM_ID__ENABLE: return (ENABLE_ALARM_ID); case MTC_ALARM_ID__BM: return (BM_ALARM_ID); @@ -348,7 +375,7 @@ string _getIdentity ( mtc_alarm_id_enum id ) case MTC_LOG_ID__COMMAND: return (COMMAND_LOG_ID); case MTC_LOG_ID__STATECHANGE: return (STATECHANGE_LOG_ID); case MTC_LOG_ID__CONFIG: return (CONFIG_LOG_ID); - default: return ("200.000"); + default: return (SWERR_ALARM_ID); } } diff --git a/mtce/src/maintenance/mtcAlarm.h b/mtce/src/maintenance/mtcAlarm.h index e0ddf87e..abf63bd0 100644 --- a/mtce/src/maintenance/mtcAlarm.h +++ b/mtce/src/maintenance/mtcAlarm.h @@ -2,7 +2,7 @@ #define __MTCALARM_H__ /* - * Copyright (c) 2015-2017 Wind River Systems, Inc. + * Copyright (c) 2015-2017, 2024 Wind River Systems, Inc. 
* * SPDX-License-Identifier: Apache-2.0 * @@ -26,6 +26,7 @@ using namespace std; typedef enum { MTC_ALARM_ID__LOCK, + MTC_ALARM_ID__MTCALIVE, MTC_ALARM_ID__CONFIG, MTC_ALARM_ID__ENABLE, MTC_ALARM_ID__BM, diff --git a/mtce/src/maintenance/mtcCompMsg.cpp b/mtce/src/maintenance/mtcCompMsg.cpp index 3ec8b93b..c6f987dd 100755 --- a/mtce/src/maintenance/mtcCompMsg.cpp +++ b/mtce/src/maintenance/mtcCompMsg.cpp @@ -247,13 +247,20 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) rc = PASS ; if ( msg.cmd == MTC_REQ_MTCALIVE ) { - ilog ("mtcAlive request received from %s network", iface_name_ptr); + alog1 ("mtcAlive request received from %s network", iface_name_ptr); if ( interface == PXEBOOT_INTERFACE ) { alog2 ("pxeboot mtcAlive buffer: %s", &msg.buf[0]); load_pxebootInfo_msg(msg); + +#ifdef WANT_FIT_TESTING + if ( ! daemon_want_fit ( FIT_CODE__FAIL_PXEBOOT_MTCALIVE ) ) +#endif + { + send_mtcAlive_msg ( sock_ptr, ctrl_ptr->who_i_am, interface ); + } } - return ( send_mtcAlive_msg ( sock_ptr, get_who_i_am(), interface )); + return (rc); } else if ( msg.cmd == MTC_MSG_INFO ) { @@ -749,6 +756,7 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) int mtce_send_event ( mtc_socket_type * sock_ptr, unsigned int cmd , const char * mtce_name_ptr ) { mtc_message_type event ; + ctrl_type *ctrl_ptr = get_ctrl_ptr(); int rc = PASS ; int bytes = 0 ; @@ -772,13 +780,21 @@ int mtce_send_event ( mtc_socket_type * sock_ptr, unsigned int cmd , const char event_info.append(MTC_JSON_SERVICE); event_info.append("\":\""); event_info.append(MTC_SERVICE_MTCCLIENT_NAME ); - event_info.append("\"}"); + + event_info.append("\",\"active_controller_pxeboot_address\":\""); + event_info.append(ctrl_ptr->pxeboot_addr_active_controller); + + event_info.append("\",\""); + event_info.append(MTC_JSON_FEATURES); + event_info.append("\":[\""); + event_info.append(MTC_PXEBOOT_MTCALIVE); + event_info.append("\"]}"); size_t len = event_info.length()+1 ; snprintf ( 
&event.hdr[0], MSG_HEADER_SIZE, "%s", get_mtce_event_header()); snprintf ( &event.buf[0], len, "%s", event_info.data()); bytes = ((sizeof(mtc_message_type))-(BUF_SIZE-len)); - ilog ("%s %s ready", get_hostname().c_str(), MTC_SERVICE_MTCCLIENT_NAME); + dlog ("%s %s ready", get_hostname().c_str(), MTC_SERVICE_MTCCLIENT_NAME); } else if (( cmd == MTC_EVENT_AVS_CLEAR ) || ( cmd == MTC_EVENT_AVS_MAJOR ) || @@ -849,10 +865,37 @@ int mtce_send_event ( mtc_socket_type * sock_ptr, unsigned int cmd , const char } else { - elog ("cannot send to null or failed socket (%s)", + elog ("cannot send to null or failed management network socket (%s)", get_interface_name_str (MGMNT_INTERFACE) ); rc = FAIL_SOCKET_SENDTO ; } + + // Only the events sent on the pxeboot network are: + // - ready event + if (( cmd == MTC_EVENT_MONITOR_READY ) && + ( sock_ptr->pxeboot_tx_socket > 0 ) && + ( !ctrl_ptr->pxeboot_addr_active_controller.empty())) + { + int flags = 0 ; // no tx flags + struct sockaddr_in hostAddr; + memset(&hostAddr, 0, sizeof(hostAddr)); + print_mtc_message ( ctrl_ptr->pxeboot_addr_active_controller.data(), MTC_CMD_TX, event, get_interface_name_str(PXEBOOT_INTERFACE), false); + hostAddr.sin_addr.s_addr = inet_addr(ctrl_ptr->pxeboot_addr_active_controller.data()); + hostAddr.sin_family = AF_INET; + hostAddr.sin_port = htons(sock_ptr->mtc_tx_pxeboot_port); + + ssize_t bytes_sent = sendto(sock_ptr->pxeboot_tx_socket, &event.hdr[0], bytes, flags, + (const struct sockaddr*)&hostAddr, sizeof(hostAddr)); + if (bytes_sent <= 0) + { + elog ("failed to send %s to %s:%d on %s network (rc:%ld) (%d:%m)", + get_mtcNodeCommand_str(event.cmd), + ctrl_ptr->pxeboot_addr_active_controller.c_str(), + hostAddr.sin_port, + get_interface_name_str(PXEBOOT_INTERFACE), + bytes_sent, errno); + } + } return rc ; } diff --git a/mtce/src/maintenance/mtcCtrlMsg.cpp b/mtce/src/maintenance/mtcCtrlMsg.cpp index 96f8ef8b..353fd130 100755 --- a/mtce/src/maintenance/mtcCtrlMsg.cpp +++ 
b/mtce/src/maintenance/mtcCtrlMsg.cpp @@ -443,6 +443,7 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr, string service = "" ; string sensor = "" ; string process = "" ; + hostname = "unknown" ; int rc1 = FAIL ; @@ -493,7 +494,17 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr, } else if ( service == MTC_SERVICE_MTCCLIENT_NAME ) { - ilog ("%s %s ready", hostname.c_str(), MTC_SERVICE_MTCCLIENT_NAME); + string features= "" ; + if ( jsonUtil_get_key_val(&msg.buf[0], MTC_JSON_FEATURES, features ) == PASS ) + { + dlog ("%s %s features: %s", hostname.c_str(), service.c_str(), features.c_str()); + } + else + { + ilog ("%s %s not offering feature list ; node may have upgrade pending", + hostname.c_str(), MTC_SERVICE_MTCCLIENT_NAME); + } + obj_ptr->declare_service_ready ( hostname, MTC_SERVICE_MTCCLIENT, features ); /* if this ready event is from the mtcClient of a * controller that has valid bmc access info then @@ -697,7 +708,9 @@ int send_mtc_cmd ( string & hostname, int cmd , int interface, string json_dict * controller's pxeboot ip addresses so it knows where to send. 
*/ obj_ptr->pxebootInfo_loader(); data = "{\"pxebootInfo\":{" ; - data.append ("\"address\":\""); + data.append ("\""); + data.append (CONTROLLER); + data.append ("\":\""); data.append (obj_ptr->my_pxeboot_ip); data.append ("\",\""); data.append (CONTROLLER_0); diff --git a/mtce/src/maintenance/mtcNodeComp.cpp b/mtce/src/maintenance/mtcNodeComp.cpp index 3c25d760..f5bad581 100644 --- a/mtce/src/maintenance/mtcNodeComp.cpp +++ b/mtce/src/maintenance/mtcNodeComp.cpp @@ -337,6 +337,7 @@ int daemon_configure ( void ) rc = PASS ; } + daemon_load_fit(); return (rc); } @@ -1267,6 +1268,9 @@ int daemon_init ( string iface, string nodetype_str ) ctrl.clstr_iface_provisioned = false ; ctrl.pxeboot_iface_provisioned = false ; ctrl.peer_ctrlr_reset.sync = false ; + ctrl.pxeboot_addr_c0 = "" ; + ctrl.pxeboot_addr_c1 = "" ; + ctrl.pxeboot_addr_active_controller = "" ; /* convert node type to integer */ ctrl.nodetype = get_host_function_mask ( nodetype_str ) ; @@ -1295,9 +1299,12 @@ int daemon_init ( string iface, string nodetype_str ) } else { - // Ready to do pxeboot messaging + ilog ("Mgmnt iface : %s", ctrl.mgmnt_iface.c_str()); + + // Not on LO, assume pxeboot provisioning starting with it being + // equal to the management interface, until otherwise updated due + // to bonding or vlan modes. 
ctrl.pxeboot_iface = ctrl.mgmnt_iface ; - ilog ("Pxeboot iface %s", ctrl.pxeboot_iface.c_str()); ctrl.pxeboot_iface_provisioned = true ; } } @@ -1437,7 +1444,9 @@ void daemon_service_run ( void ) ctrl.peer_ctrlr_reset.audit_period ); } + /* Send the mtcClient ready event and clear the periodic event counter */ mtce_send_event ( sock_ptr, MTC_EVENT_MONITOR_READY, NULL ); + ctrl.ready_event_counter = 0 ; /* lets go select so that the sock does not go crazy */ dlog ("%s running main loop with %d msecs socket timeout\n", @@ -1646,7 +1655,7 @@ void daemon_service_run ( void ) } } } - if ( ctrl.timer.ring == true ) + if ( mtcTimer_expired ( ctrl.timer ) ) { bool socket_reinit = true ; @@ -1745,7 +1754,13 @@ void daemon_service_run ( void ) string who_i_am = _self_identify ( ctrl.nodetype_str ); } alog1 ("sending mtcAlive on all provisioned mtcAlive networks"); - send_mtcAlive_msg ( sock_ptr, ctrl.who_i_am, PXEBOOT_INTERFACE ); + +#ifdef WANT_FIT_TESTING + if ( ! daemon_want_fit ( FIT_CODE__FAIL_PXEBOOT_MTCALIVE ) ) +#endif + { + send_mtcAlive_msg ( sock_ptr, ctrl.who_i_am, PXEBOOT_INTERFACE ); + } send_mtcAlive_msg ( sock_ptr, ctrl.who_i_am, MGMNT_INTERFACE ); if (( ctrl.clstr_iface_provisioned == true ) && ( mtc_sock.mtc_client_clstr_rx_socket != NULL ) && @@ -1801,6 +1816,22 @@ void daemon_service_run ( void ) _close_amon_sock (); } } + + // Purpose: mtcClient ready event audit + // + // Send the ready event every minute just in case the first + // process startup event was missed by the mtcAgent or + // the mtcAgent was restarted. + // + // Needed to ensure that pxeboot mtcAlive messaging monitoring + // gets started over a mtcAgent process restart.
+ if ( ++ctrl.ready_event_counter >= (MTC_MINS_1/MTC_ALIVE_TIMER) ) + { + + dlog ("sending mtcClient ready event"); + mtce_send_event ( sock_ptr, MTC_EVENT_MONITOR_READY, NULL ); + ctrl.ready_event_counter = 0 ; + } } /* service controller specific audits */ @@ -2388,8 +2419,9 @@ void load_mtcInfo_msg ( mtc_message_type & msg ) * Address can be empty of an unprovisioned controller. * * { "pxebootInfo":{ - * "controller-0":"169.254.202.2", - * "controller-1":"169.254.202.3" + * "controller" : "169.254.202.2", + * "controller-0" : "169.254.202.2", + * "controller-1" : "169.254.202.3" * } * } * @@ -2398,18 +2430,43 @@ void load_mtcInfo_msg ( mtc_message_type & msg ) ***************************************************************************/ void load_pxebootInfo_msg ( mtc_message_type & msg ) { - struct json_object *_obj = json_tokener_parse( &msg.buf[0] ); - if ( _obj ) + struct json_object *json_obj = json_tokener_parse( &msg.buf[0] ); + if ( json_obj ) { const char dict_label [] = "pxebootInfo" ; struct json_object *info_obj = (struct json_object *)(NULL); - json_bool json_rc = json_object_object_get_ex( _obj, + json_bool json_rc = json_object_object_get_ex( json_obj, &dict_label[0], &info_obj ); if ( ( json_rc == true ) && ( info_obj ) ) { + jlog ("%s: %s ", &dict_label[0], json_object_get_string(info_obj)); struct json_object *ctrl_obj = (struct json_object *)(NULL); + json_rc = json_object_object_get_ex( info_obj, CONTROLLER, &ctrl_obj ); + if (( json_rc == true ) && ( ctrl_obj )) + { + string active_controller = json_object_get_string(ctrl_obj); + if ( ctrl.pxeboot_addr_active_controller != active_controller ) + { + string prefix = "controller pxeboot address" ; + if ( ctrl.pxeboot_addr_active_controller.empty() ) + { + ilog ("%s: %s", + prefix.c_str(), + active_controller.c_str()); + } + else + { + ilog ("%s: %s ; was %s", + prefix.c_str(), + active_controller.c_str(), + ctrl.pxeboot_addr_active_controller.c_str()); + } + ctrl.pxeboot_addr_active_controller = 
active_controller ; + } + } + // now get the individual controller addresses string pxeboot_addr_cx[CONTROLLERS] = {CONTROLLER_0, CONTROLLER_1}; for (int c = 0 ; c < CONTROLLERS ; c++) { @@ -2423,8 +2480,7 @@ void load_pxebootInfo_msg ( mtc_message_type & msg ) // get the current pxeboot address for the in loop controller cur_pxeboot_addr = (controller == CONTROLLER_0) ? ctrl.pxeboot_addr_c0 : ctrl.pxeboot_addr_c1; - json_bool json_rc = - json_object_object_get_ex( info_obj, controller.data(), &ctrl_obj ); + json_rc = json_object_object_get_ex( info_obj, controller.data(), &ctrl_obj ); if (( json_rc == true ) && (ctrl_obj)) { jlog ("controller-x obj data: %s", json_object_get_string(ctrl_obj)); @@ -2477,7 +2533,7 @@ void load_pxebootInfo_msg ( mtc_message_type & msg ) elog("Failed to parse '%s' from mtcAlive request message: %s", &dict_label[0], &msg.buf[0]); } - json_object_put(_obj); + json_object_put(json_obj); } else { diff --git a/mtce/src/maintenance/mtcNodeComp.h b/mtce/src/maintenance/mtcNodeComp.h index a61a9317..01b8901d 100644 --- a/mtce/src/maintenance/mtcNodeComp.h +++ b/mtce/src/maintenance/mtcNodeComp.h @@ -102,6 +102,7 @@ typedef struct string pxeboot_addr ; string pxeboot_addr_c0 ; string pxeboot_addr_c1 ; + string pxeboot_addr_active_controller ; // Assume address is learned to start even though it's likely not. // This enabled the first not learned log followed by a learned @@ -147,6 +148,10 @@ typedef struct string mtcAgent_ip ; peer_ctrlr_reset_type peer_ctrlr_reset; + + /* throttles sending the periodic mtcClient ready event. 
*/ + int ready_event_counter ; + } ctrl_type ; ctrl_type * get_ctrl_ptr ( void ); diff --git a/mtce/src/maintenance/mtcNodeFsm.cpp b/mtce/src/maintenance/mtcNodeFsm.cpp index df9a8d54..74fc1a0d 100755 --- a/mtce/src/maintenance/mtcNodeFsm.cpp +++ b/mtce/src/maintenance/mtcNodeFsm.cpp @@ -90,14 +90,15 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr ) * with mtcAlive debouncing */ nodeLinkClass::online_handler ( node_ptr ); - - /* - * Always run the mtcAlive handler. - * + /* pxeboot_mtcAlive_monitor * - monitor host's mtcAlive messaging * - manage host's mtcAlive missing alarm - */ - nodeLinkClass::pxeboot_mtcAlive_monitor ( node_ptr ); + * + * Don't monitor pxeboot mtcAlive messaging while the pxeboot network is + * not provisioned or that node has not yet reported that it supports + * pxeboot mtcAlive messaging */ + if ( this->pxeboot_network_provisioned && node_ptr->pxeboot_mtcAlive_supported ) + nodeLinkClass::pxeboot_mtcAlive_monitor ( node_ptr ); if ( node_ptr->adminAction == MTC_ADMIN_ACTION__DELETE ) { @@ -106,7 +107,6 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr ) return (PASS); } - /* Run the config FSM if the configAction bool is set. 
* We keep this as a separate action unto itself so that * mtce can continue to service all other actions for the diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index 465c3750..7084dbab 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -3433,15 +3433,20 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr ) plog ("%s offline (external)\n", node_ptr->hostname.c_str()); node_ptr->offlineStage = MTC_OFFLINE__IDLE ; } - else if ( !node_ptr->mtcAlive_mgmnt && !node_ptr->mtcAlive_clstr ) + else if ( !node_ptr->mtcAlive_mgmnt && !node_ptr->mtcAlive_clstr && !node_ptr->mtcAlive_pxeboot ) { if ( ++node_ptr->offline_search_count > offline_threshold ) { node_ptr->mtcAlive_online = false ; + node_ptr->mtcClient_ready = false ; - // Clear all the mtcAlive_sequence numbers + + // Clear all the mtcAlive counts and sequence numbers + node_ptr->mtcAlive_mgmnt_count = 0 ; + node_ptr->mtcAlive_clstr_count = 0 ; + node_ptr->mtcAlive_pxeboot_count = 0 ; for (int i = 0 ; i < MTCALIVE_INTERFACES_MAX ; i++) - node_ptr->mtcAlive_sequence[i] = 0; + node_ptr->mtcAlive_sequence[i] = 0; plog ("%s going offline ; (threshold (%d msec * %d)\n", node_ptr->hostname.c_str(), @@ -3485,13 +3490,15 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr ) **/ node_ptr->mtcAlive_online = true ; - ilog ("%s still seeing mtcAlive (%d) (Mgmt:%c:%d Clstr:%c:%d) ; restart offline_search_count=%d of %d\n", + ilog ("%s still seeing mtcAlive (%d) (Mgmt:%c:%d Clstr:%c:%d Pxeboot:%c:%d) ; restart offline_search_count=%d of %d\n", node_ptr->hostname.c_str(), node_ptr->mtcAlive_count, node_ptr->mtcAlive_mgmnt ? 'Y' : 'n', node_ptr->mtcAlive_mgmnt_count, node_ptr->mtcAlive_clstr ? 'Y' : 'n', node_ptr->mtcAlive_clstr_count, + node_ptr->mtcAlive_pxeboot ? 
'Y' : 'n', + node_ptr->mtcAlive_pxeboot_count, node_ptr->offline_search_count, offline_threshold ); node_ptr->offline_search_count = 0 ; /* reset the search count */ @@ -6261,6 +6268,8 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__ENABLE); EFmAlarmSeverityT config_alarm_severity = mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__CONFIG); + EFmAlarmSeverityT mtcAlive_alarm_severity = + mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__MTCALIVE); /* Clear generic enable alarm over process restart. * Will get reasserted if the cause condition still exists */ @@ -6284,6 +6293,21 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) alarmUtil_getSev_str(config_alarm_severity).c_str()); } + /* The mtcAlive alarm is maintained if it exists. + * The pxeboot_mtcAlive_monitor will clear the alarm + * if it exists and the pxeboot mtcAlive messaging works. */ + if ( mtcAlive_alarm_severity != FM_ALARM_SEVERITY_CLEAR ) + { + node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] = mtcAlive_alarm_severity ; + ilog ("%s found mtcAlive alarm ; loaded %s", + node_ptr->hostname.c_str(), + alarmUtil_getSev_str(mtcAlive_alarm_severity).c_str()); + + // Load up the miss and loss counts used for recovery + node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE] = PXEBOOT_MTCALIVE_LOSS_ALARM_THRESHOLD ; + node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE] = PXEBOOT_MTCALIVE_LOSS_THRESHOLD ; + } + if ( is_controller(node_ptr) ) { this->controllers++ ; @@ -7571,29 +7595,46 @@ int nodeLinkClass::oos_test_handler ( struct nodeLinkClass::node * node_ptr ) // Returns : PASS // /////////////////////////////////////////////////////////////////////////////// -#define PXEBOOT_MTCALIVE_MONITOR_RATE_SECS (10) -#define PXEBOOT_MTCALIVE_LOSS_THRESHOLD (6) -#define PXEBOOT_MTCALIVE_NOT_SEEN_LOG_THROTTLE (6) -#define PXEBOOT_MTCALIVE_LOSS_LOG_THROTTLE (6) int nodeLinkClass::pxeboot_mtcAlive_monitor ( struct 
nodeLinkClass::node * node_ptr ) { - // ERIK: TODO: Comment out once verified flog ("%s pxeboot mtcAlive fsm stage: %s", node_ptr->hostname.c_str(), get_mtcAliveStages_str(node_ptr->mtcAliveStage).c_str()); - if ( !this->pxeboot_network_provisioned ) return PASS ; + + // Don't monitor pxeboot mtcAlive messaging while the node is + // locked or in the following administrative action states. + if (( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ) || + ( node_ptr->adminAction == MTC_ADMIN_ACTION__UNLOCK ) || + ( node_ptr->adminAction == MTC_ADMIN_ACTION__ENABLE ) || + ( node_ptr->adminAction == MTC_ADMIN_ACTION__RECOVER ) || + ( node_ptr->adminAction == MTC_ADMIN_ACTION__POWERCYCLE )) + { + // Clear the alarm if the node is locked + if (( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ) && + ( node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] != FM_ALARM_SEVERITY_CLEAR )) + alarm_mtcAlive_clear (node_ptr, PXEBOOT_INTERFACE); + // Switch to START if not already there + if ( node_ptr->mtcAliveStage != MTC_MTCALIVE__START ) + mtcAliveStageChange (node_ptr, MTC_MTCALIVE__START); + return PASS ; + } switch (node_ptr->mtcAliveStage) { + // Starts from scratch. Clears timer and counts but not alarm. case MTC_MTCALIVE__START: { alog2 ("%s mtcAlive start", node_ptr->hostname.c_str()); - mtcTimer_reset ( node_ptr->mtcAlive_timer ); + if ( ! mtcTimer_expired (node_ptr->mtcAlive_timer) ) + mtcTimer_reset (node_ptr->mtcAlive_timer); node_ptr->mtcAlive_sequence[PXEBOOT_INTERFACE] = 0 ; + node_ptr->mtcAlive_sequence_save[PXEBOOT_INTERFACE] = 0 ; mtcAliveStageChange (node_ptr, MTC_MTCALIVE__SEND); - break ; + return PASS ; } + // Reloads the controller's pxeboot info and sends it with a mtcAlive request + // telling the remote node to send mtcAlive to the active controller.
case MTC_MTCALIVE__SEND: { /* pxeboot info refresh audit */ @@ -7601,25 +7642,31 @@ int nodeLinkClass::pxeboot_mtcAlive_monitor ( struct nodeLinkClass::node * node_ pxebootInfo_loader (); alog2 ("%s mtcAlive send", node_ptr->hostname.c_str()); send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MTCALIVE, PXEBOOT_INTERFACE ); - node_ptr->mtcAlive_sequence_save[PXEBOOT_INTERFACE] = 0 ; - node_ptr->mtcAlive_sequence_miss[PXEBOOT_INTERFACE] = 0 ; mtcAliveStageChange (node_ptr, MTC_MTCALIVE__MONITOR); - break ; + return PASS ; } + // Start the Wait timer 2x longer than the expected mtcAlive cadence case MTC_MTCALIVE__MONITOR: { alog2 ("%s mtcAlive monitor", node_ptr->hostname.c_str()); mtcTimer_start ( node_ptr->mtcAlive_timer, mtcTimer_handler, PXEBOOT_MTCALIVE_MONITOR_RATE_SECS ); mtcAliveStageChange (node_ptr, MTC_MTCALIVE__WAIT); - break ; + return PASS ; } + // Wait for the timer to expire case MTC_MTCALIVE__WAIT: { if ( mtcTimer_expired ( node_ptr->mtcAlive_timer ) ) mtcAliveStageChange (node_ptr, MTC_MTCALIVE__CHECK); - break ; + return PASS ; } + // Check the mtcAlive sequence numbers and handle each possible case + // success - mtcAlive sequence number is greater than the last one - may clear alarm + // out-of-sequence - mtcAlive sequence number is less than the last one - may assert alarm + // miss - mtcAlive sequence number is equal to the last one - count misses + // loss - mtcAlive messaging miss count exceeded threshold - assert alarm + // not seen - waiting for first mtcAlive following reboot - request mtcAlive case MTC_MTCALIVE__CHECK: { if ( node_ptr->mtcAlive_sequence[PXEBOOT_INTERFACE] > node_ptr->mtcAlive_sequence_save[PXEBOOT_INTERFACE] ) @@ -7632,70 +7679,132 @@ int nodeLinkClass::pxeboot_mtcAlive_monitor ( struct nodeLinkClass::node * node_ node_ptr->mtcAlive_sequence_save[PXEBOOT_INTERFACE]); // Now that we received a message we can dec the missed count - if ( node_ptr->mtcAlive_sequence_miss[PXEBOOT_INTERFACE] ) - 
node_ptr->mtcAlive_sequence_miss[PXEBOOT_INTERFACE]-- ; - node_ptr->pxeboot_mtcAlive_not_seen_log_throttle = 0 ; - node_ptr->pxeboot_mtcAlive_loss_log_throttle = 0 ; + // and clear the alarm if it exists + if ( node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE] ) + { + // Set miss count to max if we have reached at least one loss but no alarm yet + if (( node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] == FM_ALARM_SEVERITY_CLEAR ) && + ( node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE] )) + { + node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE] = PXEBOOT_MTCALIVE_LOSS_THRESHOLD ; + node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE] = 0 ; + } + + ilog ("%s pxeboot mtcAlive miss count %d ; decrement %s; recovery", + node_ptr->hostname.c_str(), + node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE], + node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] ? "; alarm clear when 0 " : ""); + node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE]-- ; + } + else + { + // Clear alarm and start with a clean loss slate. The miss count is already zero + node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE] = 0 ; + alarm_mtcAlive_clear ( node_ptr, PXEBOOT_INTERFACE ); + } + + // Clear the log throttles now that we have received a message + if ( node_ptr->pxeboot_mtcAlive_not_seen_log_throttle || node_ptr->pxeboot_mtcAlive_loss_log_throttle ) + { + node_ptr->pxeboot_mtcAlive_not_seen_log_throttle = 0 ; + node_ptr->pxeboot_mtcAlive_loss_log_throttle = 0 ; + } + mtcAliveStageChange (node_ptr, MTC_MTCALIVE__MONITOR); } else if ( node_ptr->mtcAlive_sequence[PXEBOOT_INTERFACE] < node_ptr->mtcAlive_sequence_save[PXEBOOT_INTERFACE] ) { - // unexpected case - wlog ("%s mtcAlive out-of-sequence ; this:%d last:%d", + // mtcClient restart case + if ( ++node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE] < PXEBOOT_MTCALIVE_LOSS_THRESHOLD ) + { + // The mtcClient on this host may have been restarted + mtcAliveStageChange (node_ptr, MTC_MTCALIVE__SEND); + } + else + mtcAliveStageChange (node_ptr, MTC_MTCALIVE__FAIL); + + wlog ("%s pxeboot 
mtcAlive miss count %d ; loss count %d ; out-of-sequence ; this:%d last:%d", node_ptr->hostname.c_str(), + node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE], + node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE], node_ptr->mtcAlive_sequence[PXEBOOT_INTERFACE], node_ptr->mtcAlive_sequence_save[PXEBOOT_INTERFACE]); - node_ptr->mtcAlive_sequence_miss[PXEBOOT_INTERFACE]++ ; - mtcAliveStageChange (node_ptr, MTC_MTCALIVE__START); } - else if ( ++node_ptr->mtcAlive_sequence_miss[PXEBOOT_INTERFACE] < PXEBOOT_MTCALIVE_LOSS_THRESHOLD ) + else if ( ++node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE] < PXEBOOT_MTCALIVE_LOSS_THRESHOLD ) { // Missing pxeboot mtcAlive - alog ("%s pxeboot mtcAlive miss count %d ; sending request", + wlog ("%s pxeboot mtcAlive miss count %d ; loss count %d ; sending request", node_ptr->hostname.c_str(), - node_ptr->mtcAlive_sequence_miss[PXEBOOT_INTERFACE]); - send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MTCALIVE, PXEBOOT_INTERFACE ); - mtcAliveStageChange (node_ptr, MTC_MTCALIVE__MONITOR); - } - else if ( node_ptr->mtcAlive_pxeboot == true ) - { - wlog_throttled (node_ptr->pxeboot_mtcAlive_loss_log_throttle, - PXEBOOT_MTCALIVE_LOSS_LOG_THROTTLE, - "%s pxeboot mtcAlive loss ; missed: %d ; last: count:%d seq: %d ; sending request", - node_ptr->hostname.c_str(), - node_ptr->mtcAlive_sequence_miss[PXEBOOT_INTERFACE], - node_ptr->mtcAlive_pxeboot_count, - node_ptr->mtcAlive_sequence_save[PXEBOOT_INTERFACE]); + node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE], + node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE]); + // The mtcClient on this host may have been restarted mtcAliveStageChange (node_ptr, MTC_MTCALIVE__SEND); } else { - ilog_throttled (node_ptr->pxeboot_mtcAlive_not_seen_log_throttle, - PXEBOOT_MTCALIVE_NOT_SEEN_LOG_THROTTLE, - "%s pxeboot mtcAlive not seen yet ; sending request", - node_ptr->hostname.c_str()); - mtcAliveStageChange (node_ptr, MTC_MTCALIVE__SEND); + if ( node_ptr->mtcAlive_pxeboot == true ) + { + // If we get here it's a loss + 
wlog_throttled (node_ptr->pxeboot_mtcAlive_loss_log_throttle, + PXEBOOT_MTCALIVE_LOSS_LOG_THROTTLE, + "%s pxeboot mtcAlive lost ; missed: %d ; last: count:%d seq: %d ; sending request", + node_ptr->hostname.c_str(), + node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE], + node_ptr->mtcAlive_pxeboot_count, + node_ptr->mtcAlive_sequence_save[PXEBOOT_INTERFACE]); + } + else + { + // Otherwise still searching beyond threshold for the first mtcAlive after reboot or graceful recovery + ilog_throttled (node_ptr->pxeboot_mtcAlive_not_seen_log_throttle, + PXEBOOT_MTCALIVE_NOT_SEEN_LOG_THROTTLE, + "%s pxeboot mtcAlive not seen yet ; sending request", + node_ptr->hostname.c_str()); + } + mtcAliveStageChange (node_ptr, MTC_MTCALIVE__FAIL); } + + // Prevent the miss count from being larger than the loss, and therefore the alarm clear recovery, threshold. + if (node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE] > PXEBOOT_MTCALIVE_LOSS_THRESHOLD) + node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE] = PXEBOOT_MTCALIVE_LOSS_THRESHOLD; + node_ptr->mtcAlive_sequence_save[PXEBOOT_INTERFACE] = node_ptr->mtcAlive_sequence[PXEBOOT_INTERFACE] ; - - // TODO (emacdona): Need to handle loss case that manages raising the alarm - // Transition to MTC_MTCALIVE__FAIL - break ; } case MTC_MTCALIVE__FAIL: - { - wlog ("%s mtcAlive fail", node_ptr->hostname.c_str()); - mtcAliveStageChange (node_ptr, MTC_MTCALIVE__START); - break ; - } default: { - slog ("%s mtcAlive fsm default", node_ptr->hostname.c_str()); + alog2 ("%s mtcAlive fail", node_ptr->hostname.c_str()); + if ( node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] == FM_ALARM_SEVERITY_CLEAR ) + { + if ( ++node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE] < PXEBOOT_MTCALIVE_LOSS_ALARM_THRESHOLD ) + { + wlog ("%s pxeboot mtcAlive lost ; %d more loss before alarm assertion", + node_ptr->hostname.c_str(), + PXEBOOT_MTCALIVE_LOSS_ALARM_THRESHOLD - node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE] ); + + // Start the misses counter over again after each loss debounce 
+ node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE] = 0 ; + } + else + { + ilog ("%s pxeboot mtcAlive alarm assert (%d)", + node_ptr->hostname.c_str(), + node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE]); + alarm_mtcAlive_failure ( node_ptr, PXEBOOT_INTERFACE ); + } + } mtcAliveStageChange (node_ptr, MTC_MTCALIVE__START); break ; } } + if ( node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE] || node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE] ) + { + alog2 ("%s pxeboot mtcAlive: Miss: %d of %d , Loss: %d of %d", + node_ptr->hostname.c_str(), + node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE], PXEBOOT_MTCALIVE_LOSS_THRESHOLD, + node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE], PXEBOOT_MTCALIVE_LOSS_ALARM_THRESHOLD); + } return (PASS); }