diff --git a/mtce-common/src/common/alarmUtil.h b/mtce-common/src/common/alarmUtil.h index c2f7faed..d32cbde8 100644 --- a/mtce-common/src/common/alarmUtil.h +++ b/mtce-common/src/common/alarmUtil.h @@ -31,6 +31,7 @@ #define SWERR_ALARM_ID ((const char *)"200.000") /* Do No Use */ #define LOCK_ALARM_ID ((const char *)"200.001") +#define MTCALIVE_ALARM_ID ((const char *)"200.003") #define ENABLE_ALARM_ID ((const char *)"200.004") #define MGMNT_HB_ALARM_ID ((const char *)"200.005") #define PMOND_ALARM_ID ((const char *)"200.006") diff --git a/mtce-common/src/common/fitCodes.h b/mtce-common/src/common/fitCodes.h index 91009e42..d5a0480c 100644 --- a/mtce-common/src/common/fitCodes.h +++ b/mtce-common/src/common/fitCodes.h @@ -120,6 +120,7 @@ #define FIT_CODE__FORCE_LOCK_HOST (32) #define FIT_CODE__UNLOCK_HOST (33) #define FIT_CODE__FAIL_SWACT (34) +#define FIT_CODE__FAIL_PXEBOOT_MTCALIVE (35) #define FIT_CODE__FM_SET_ALARM (40) #define FIT_CODE__FM_GET_ALARM (41) diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h index 5a14eeb9..3fc4e867 100755 --- a/mtce-common/src/common/nodeBase.h +++ b/mtce-common/src/common/nodeBase.h @@ -223,6 +223,10 @@ typedef enum #define MTC_JSON_SEVERITY "severity" #define MTC_JSON_SENSOR "sensor" #define MTC_JSON_PROCESS "process" +#define MTC_JSON_FEATURES "features" + +/* Used by the mtcClient with the MTC_JSON_FEATURES label above */ +#define MTC_PXEBOOT_MTCALIVE "pxeboot_mtcAlive" /* Mtce Info Keys */ #define MTCE_INFO_KEY__BMC_PROTOCOL "bmc_protocol" @@ -614,6 +618,7 @@ typedef struct #define MTC_SERVICE_PMOND (0xB00BF00D) #define MTC_SERVICE_HWMOND (0xF00BF00D) #define MTC_SERVICE_HEARTBEAT (0xBABEF00D) +#define MTC_SERVICE_MTCCLIENT (0xABCDF00D) /** process to process loopback command */ #define MTC_EVENT_LOOPBACK (0x01010101) diff --git a/mtce/src/alarm/alarm.h b/mtce/src/alarm/alarm.h index a9895ac4..f20e640a 100644 --- a/mtce/src/alarm/alarm.h +++ b/mtce/src/alarm/alarm.h @@ -2,7 +2,7 @@ #define 
__INCLUDE_ALARM_H__ /* - * Copyright (c) 2016-2017 Wind River Systems, Inc. + * Copyright (c) 2016-2017, 2024 Wind River Systems, Inc. * * SPDX-License-Identifier: Apache-2.0 * @@ -14,38 +14,17 @@ */ #include "nodeBase.h" -#include "nodeUtil.h" /* for ... common utilities */ - +#include "nodeUtil.h" /* for ... common utilities */ +#include "alarmUtil.h" /* for ... common alarm definitions */ #include "msgClass.h" /* for ... msgClassSock type definition */ /* external APIs */ #include "fmAPI.h" -#define ENTITY_PREFIX ((const char *)"host=") - #define MAX_ALARM_REQ_PER_MSG (4) #define MAX_ALARM_REQ_MSG_SIZE (500) #define MAX_ALARM_REQ_SIZE (MAX_ALARM_REQ_PER_MSG*MAX_ALARM_REQ_MSG_SIZE) -#define SWERR_ALARM_ID ((const char *)"200.000") /* Do No Use */ -#define LOCK_ALARM_ID ((const char *)"200.001") -#define ENABLE_ALARM_ID ((const char *)"200.004") -#define MGMNT_HB_ALARM_ID ((const char *)"200.005") -#define PMOND_ALARM_ID ((const char *)"200.006") -#define SENSOR_ALARM_ID ((const char *)"200.007") /* Sensor read alarm ; i.e. the sensor read value bad */ -#define CLSTR_HB_ALARM_ID ((const char *)"200.009") -#define BM_ALARM_ID ((const char *)"200.010") -#define CONFIG_ALARM_ID ((const char *)"200.011") -#define CH_COMP_ALARM_ID ((const char *)"200.013") /* Combo Host Compute Failure - on last Controller */ -#define SENSORCFG_ALARM_ID ((const char *)"200.014") /* Sensor configuration alarm ; i.e. 
could not add */ -#define SENSORGROUP_ALARM_ID ((const char *)"200.015") /* Sensor Group Read Error */ -#define LUKS_ALARM_ID ((const char *)"200.016") /* LUKS volume failure alarm */ - -#define EVENT_LOG_ID ((const char *)"200.020") -#define COMMAND_LOG_ID ((const char *)"200.021") -#define STATECHANGE_LOG_ID ((const char *)"200.022") -#define SERVICESTATUS_LOG_ID ((const char *)"200.023") /* log used to report service failure events against */ - /** Heartbeat Alarm Abstract Reference IDs */ typedef enum @@ -85,17 +64,6 @@ int alarm_log ( string hostname, const char * id_ptr, string entity ); #else -typedef struct -{ - SFmAlarmDataT alarm ; - string name ; - string instc_prefix ; /* Instance prefix i.e. "=sensor." or "=process." */ - string critl_reason ; - string minor_reason ; - string major_reason ; - string clear_reason ; -} alarmUtil_type ; - typedef struct { string alarmid ; @@ -133,10 +101,6 @@ echo "${STR}" | socat - ${PROTOCOL}:${ADDRESS}:${port} void alarmData_init ( void ); alarmUtil_type * alarmData_getAlarm_ptr ( string alarm_id_str ); -/* in alarmUtil.cpp */ -// EFmAlarmSeverityT mtcAlarm_state ( string hostname, alarm_id_enum id ); - - /* in alarmHdlr.cpp */ int alarmHdlr_request_handler ( char * msg_ptr ); @@ -144,21 +108,6 @@ void alarmMgr_queue_clear ( void ); void alarmMgr_queue_alarm (queue_entry_type entry); void alarmMgr_service_queue(void); -/* Clear all alarms against this host */ -void alarmUtil_clear_all ( string hostname ); - -/** - * Query the specified alarm severity level. 
- * Severity levels are specified in fmAPI.h - **/ -EFmAlarmSeverityT alarmUtil_query ( string hostname, - string identity, - string instance ); - -int alarmUtil_query_identity ( string identity, - SFmAlarmDataT * alarm_list_ptr, - unsigned int alarms_max ); - int alarmUtil_clear ( string hostname, string alarm_id, string entity ); int alarmUtil_critical ( string hostname, string alarm_id, string entity, FMTimeT & timestamp ); int alarmUtil_major ( string hostname, string alarm_id, string entity, FMTimeT & timestamp ); diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index 4d5e7d47..f9e2fe36 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -19,7 +19,7 @@ #include /* for ENODEV, EFAULT and ENXIO */ #include /* for close and usleep */ #include - +#include /* for ... json_tokener_parse and other json utils */ using namespace std; #ifdef __AREA__ @@ -564,6 +564,11 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname ) ptr->mtcAlive_clstr = false ; ptr->mtcAlive_pxeboot = false ; + /* Assume the node's mtcClient does not support pxeboot mtcAlive + * messaging until that mtcClient reports that it does. + * This bool blocks the pxeboot_mtcAlive_monitor fsm. */ + ptr->pxeboot_mtcAlive_supported = false ; + /* These counts are incremented in the set_mtcAlive member * function and cleared in the reset progression handler. 
*/ ptr->mtcAlive_mgmnt_count = 0 ; @@ -574,8 +579,9 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname ) for (int i = 0 ; i < MTCALIVE_INTERFACES_MAX ; i++) { ptr->mtcAlive_sequence[i] = + ptr->mtcAlive_loss_count[i] = ptr->mtcAlive_sequence_save[i] = - ptr->mtcAlive_sequence_miss[i] = + ptr->mtcAlive_miss_count[i] = ptr->mtcAlive_log_throttle [i] = 0 ; } ptr->pxeboot_mtcAlive_not_seen_log_throttle = 0 ; @@ -1886,6 +1892,66 @@ int nodeLinkClass::alarm_compute_clear ( struct nodeLinkClass::node * node_ptr, return (PASS); } +/***************************************************************************** + * + * Name : alarm_mtcAlive_failure, alarm_mtcAlive_clear + * + * Purpose : Generate a log and assert minor or clear alarm for mtcAlive + * messaging failures. + * + * Assumptions: Degrade not supported for minor alarms. + * + * Limitations: Only pxeboot messaging is currently supported. + * Need to introduce a network mtcAlive alarm array to track which + * ones are active if/when it comes time to support alarming of + * mtcAlive failures on other networks. 
+ * + *****************************************************************************/ +int nodeLinkClass::alarm_mtcAlive_failure ( struct nodeLinkClass::node * node_ptr, int network ) +{ + int rc = PASS ; + if ( network != PXEBOOT_INTERFACE ) + { + slog ("%s mtcAlive alarm not supported for %s network", + node_ptr->hostname.c_str(), + get_iface_name_str(network)); + rc = FAIL_INVALID_OPERATION ; + } + else if ( node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] != FM_ALARM_SEVERITY_MINOR ) + { + wlog ("%s minor %s mtcAlive messaging failure", + node_ptr->hostname.c_str(), + get_iface_name_str(network)); + + mtcAlarm_minor ( node_ptr->hostname, MTC_ALARM_ID__MTCALIVE ); + node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] = FM_ALARM_SEVERITY_MINOR ; + } + return (rc); +} + +int nodeLinkClass::alarm_mtcAlive_clear ( struct nodeLinkClass::node * node_ptr, int network ) +{ + int rc = PASS ; + if ( network != PXEBOOT_INTERFACE ) + { + slog ("%s mtcAlive alarm not supported for %s network", + node_ptr->hostname.c_str(), + get_iface_name_str(network)); + rc = FAIL_INVALID_OPERATION ; + } + else if ( node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] != FM_ALARM_SEVERITY_CLEAR ) + { + ilog ("%s clear %s mtcAlive messaging failure", + node_ptr->hostname.c_str(), + get_iface_name_str(network)); + + mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__MTCALIVE ); + node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] = FM_ALARM_SEVERITY_CLEAR ; + node_ptr->mtcAlive_loss_count[network] = 0 ; + } + return (rc); +} + /** Host Operational State Change public member function */ int nodeLinkClass::oper_subf_state_change ( string hostname, string newOperState ) { @@ -3086,6 +3152,7 @@ void nodeLinkClass::clear_service_readies ( struct nodeLinkClass::node * node_pt ilog ("%s clearing service ready events\n", node_ptr->hostname.c_str()); node_ptr->hbsClient_ready = false ; node_ptr->pmond_ready = false ; + node_ptr->mtcClient_ready = false ; } } } @@ -3952,6 +4019,25 @@ int nodeLinkClass::set_clstr_hostaddr ( string & hostname, 
string & ip ) return ( rc ); } +int nodeLinkClass::set_pxeboot_mtcAlive_support ( string hostname, bool state ) +{ + int rc = FAIL_HOSTNAME_LOOKUP ; + nodeLinkClass::node* node_ptr = nodeLinkClass::getNode ( hostname ); + if ( node_ptr != NULL ) + { + if ( state != node_ptr->pxeboot_mtcAlive_supported ) + mtcAliveStageChange (node_ptr, MTC_MTCALIVE__START); + + dlog ("%s pxeboot mtcAlive %s supported", + node_ptr->hostname.c_str(), + state ? "is" : "is not"); + + node_ptr->pxeboot_mtcAlive_supported = state ; + rc = PASS ; + } + return ( rc ); +} + string nodeLinkClass::get_hostname ( string hostaddr ) { if (( hostaddr == LOOPBACK_IPV6 ) || @@ -4197,7 +4283,7 @@ void nodeLinkClass::set_mtcAlive ( struct nodeLinkClass::node * node_ptr, unsign } if ( state_change ) { - ilog ("%s mtcAlive received from %s network with uptime:%d ; seq:%d", + ilog ("%s mtcAlive received from %s network with uptime:%d ; seq:%d ; state change", node_ptr->hostname.c_str(), get_iface_name_str(iface), node_ptr->uptime, @@ -4208,22 +4294,32 @@ void nodeLinkClass::set_mtcAlive ( struct nodeLinkClass::node * node_ptr, unsign { if ( sequence < node_ptr->mtcAlive_sequence[iface]+1 ) { - wlog ("%s mtcAlive received from %s network with uptime:%d ; out-of-sequence ; expect:%d detect:%d ; correcting", - node_ptr->hostname.c_str(), - get_iface_name_str(iface), - node_ptr->uptime, - node_ptr->mtcAlive_sequence[iface]+1, - sequence); + // Don't warn log for mtcClient restart cases. + // ... indicated by a very low sequence number. 
+ if ( sequence > 2 ) + { + wlog ("%s mtcAlive received from %s network with uptime:%d ; out-of-sequence ; expect:%d detect:%d ; correcting", + node_ptr->hostname.c_str(), + get_iface_name_str(iface), + node_ptr->uptime, + node_ptr->mtcAlive_sequence[iface]+1, + sequence); + } } else { - wlog ("%s mtcAlive received from %s network with uptime:%d ; missed %d mtcalive msgs ; expect:%d detect:%d ; correcting", - node_ptr->hostname.c_str(), - get_iface_name_str(iface), - node_ptr->uptime, - sequence-(node_ptr->mtcAlive_sequence[iface]+1), - node_ptr->mtcAlive_sequence[iface]+1, - sequence); + // Don't warn log for mtcAgent restart cases. + // ... indicated by expecting 1 and detecting a large number. + if ( node_ptr->mtcAlive_sequence[iface] > 0 ) + { + wlog ("%s mtcAlive received from %s network with uptime:%d ; missed %d mtcAlive msgs ; expect:%d detect:%d ; correcting", + node_ptr->hostname.c_str(), + get_iface_name_str(iface), + node_ptr->uptime, + sequence-(node_ptr->mtcAlive_sequence[iface]+1), + node_ptr->mtcAlive_sequence[iface]+1, + sequence); + } } } else @@ -5659,7 +5755,8 @@ void nodeLinkClass::manage_heartbeat_minor ( string hostname, iface_enum iface, /** Interface to declare that a key service on the * specified host is up, running and ready */ int nodeLinkClass::declare_service_ready ( string & hostname, - unsigned int service ) + unsigned int service, + string feature_list ) { nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname ); if ( node_ptr == NULL ) @@ -5667,6 +5764,58 @@ int nodeLinkClass::declare_service_ready ( string & hostname, wlog ("%s Unknown Host\n", hostname.c_str()); return FAIL_UNKNOWN_HOSTNAME ; } + else if ( service == MTC_SERVICE_MTCCLIENT ) + { + + if ( ! feature_list.empty() ) + { + /* features is expected to be a list - ["feature 0", "feature 1", ...] */ + struct json_object *json_obj = json_tokener_parse(feature_list.data()); + if ( json_obj != NULL ) + { + /* how many features are present ? 
*/ + int features = json_object_is_type(json_obj, json_type_array) ? json_object_array_length(json_obj) : 0 ; + + dlog ("%s %s offers %d features", hostname.c_str(), MTC_SERVICE_MTCCLIENT_NAME, features); + + for ( int f = 0 ; f < features ; f++ ) + { + /* get the element at index f ; may be NULL for a json null entry */ + struct json_object *feature_obj_ptr = json_object_array_get_idx(json_obj, f); + + /* convert each element to a string ; never build a string from NULL */ + string feature = feature_obj_ptr ? json_object_get_string(feature_obj_ptr) : "" ; + dlog ("mtcClient feature %d: %s", f, feature.c_str()); + if ( feature == MTC_PXEBOOT_MTCALIVE ) + { + dlog ("%s %s supports pxeboot mtcAlive", hostname.c_str(), MTC_SERVICE_MTCCLIENT_NAME); + set_pxeboot_mtcAlive_support (hostname, true); + } + } + /* free the json object */ + json_object_put(json_obj); + } + else + { + dlog ("%s json object is NULL", hostname.c_str()); + } + } + else + { + dlog ("%s feature list is empty", hostname.c_str()); + } + + if ( node_ptr->mtcClient_ready == false ) + { + ilog ("%s %s ready %s", hostname.c_str(), + MTC_SERVICE_MTCCLIENT_NAME, + node_ptr->pxeboot_mtcAlive_supported ? "; with pxeboot mtcAlive support" : ""); + node_ptr->mtcClient_ready = true ; + } + if ( node_ptr->pxeboot_mtcAlive_supported ) + send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MTCALIVE, PXEBOOT_INTERFACE ); + return (PASS); + } else if ( service == MTC_SERVICE_PMOND ) { node_ptr->pmond_ready = true ; @@ -9580,7 +9729,7 @@ void nodeLinkClass::mem_log_mtcalive_pxeboot ( struct nodeLinkClass::node * node this->pxeboot_network_provisioned ? 'Y' : 'N', node_ptr->mtcAlive_pxeboot ? 'Y' : 'N', node_ptr->mtcAlive_timer.ring ? 
'Y' : 'N', - node_ptr->mtcAlive_sequence_miss [PXEBOOT_INTERFACE], + node_ptr->mtcAlive_miss_count [PXEBOOT_INTERFACE], node_ptr->mtcAlive_sequence [PXEBOOT_INTERFACE], node_ptr->mtcAlive_sequence_save [PXEBOOT_INTERFACE]); @@ -9590,9 +9739,10 @@ void nodeLinkClass::mem_log_mtcalive_pxeboot ( struct nodeLinkClass::node * node void nodeLinkClass::mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr ) { char str[MAX_MEM_LOG_DATA] ; - snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAlarm List:%s%s%s%s%s\n", + snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAlarm List:%s%s%s%s%s%s\n", node_ptr->hostname.c_str(), node_ptr->alarms[MTC_ALARM_ID__LOCK ] ? " Locked" : " .", + node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] ? " mtcAlive" : " .", node_ptr->alarms[MTC_ALARM_ID__CONFIG ] ? " Config" : " .", node_ptr->alarms[MTC_ALARM_ID__ENABLE ] ? " Enable" : " .", node_ptr->alarms[MTC_ALARM_ID__CH_COMP ] ? " Compute" : " .", diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h index 9536f00f..3f5a7825 100755 --- a/mtce/src/common/nodeClass.h +++ b/mtce/src/common/nodeClass.h @@ -335,9 +335,22 @@ private: /* tracks the sequence number of the last mtcAlive message */ unsigned int mtcAlive_sequence [MTCALIVE_INTERFACES_MAX] ; unsigned int mtcAlive_sequence_save[MTCALIVE_INTERFACES_MAX] ; - unsigned int mtcAlive_sequence_miss[MTCALIVE_INTERFACES_MAX] ; + unsigned int mtcAlive_miss_count [MTCALIVE_INTERFACES_MAX] ; unsigned int mtcAlive_log_throttle [MTCALIVE_INTERFACES_MAX] ; + /* mtcAlive miss, loss, alarm and log throttle definitions */ + #define PXEBOOT_MTCALIVE_MONITOR_RATE_SECS (MTC_ALIVE_TIMER*2) // monitor every 10 seconds + #define PXEBOOT_MTCALIVE_LOSS_THRESHOLD (6) // 6 misses or 1 minute if back-2-back + #define PXEBOOT_MTCALIVE_LOSS_ALARM_THRESHOLD (2) // 2 losses before recovery is 2 minutes + #define PXEBOOT_MTCALIVE_NOT_SEEN_LOG_THROTTLE (60) // not seen log every 10 minutes + #define PXEBOOT_MTCALIVE_LOSS_LOG_THROTTLE (60) // loss log every 10 minutes + + /* used 
to debounce mtcAlive loss alarm */ + unsigned int mtcAlive_loss_count [MTCALIVE_INTERFACES_MAX] ; + + /* indicates boolean support for pxeboot mtcAlive messaging */ + bool pxeboot_mtcAlive_supported ; + /* pxeboot mtcAlive monitor log throttles */ int pxeboot_mtcAlive_not_seen_log_throttle ; int pxeboot_mtcAlive_loss_log_throttle ; @@ -661,6 +674,9 @@ private: /** Host degraded due to loss of Process Monitor running flag */ bool pmon_degraded ; + /* Maintenance Client Ready */ + bool mtcClient_ready ; + /** Process Monitor Ready flag and degrade list */ bool pmond_ready ; @@ -1139,6 +1155,9 @@ private: int alarm_compute_clear ( struct nodeLinkClass::node * node_ptr, bool force ); int alarm_compute_failure ( struct nodeLinkClass::node * node_ptr , EFmAlarmSeverityT sev ); + int alarm_mtcAlive_clear ( struct nodeLinkClass::node * node_ptr, int network ); + int alarm_mtcAlive_failure( struct nodeLinkClass::node * node_ptr, int network ); + void clear_subf_failed_bools ( struct nodeLinkClass::node * node_ptr ); void clear_main_failed_bools ( struct nodeLinkClass::node * node_ptr ); void clear_hostservices_ctls ( struct nodeLinkClass::node * node_ptr ); @@ -1496,6 +1515,9 @@ public: /* set the pxeboot network address for any hostname */ int set_pxeboot_hostaddr ( string hostname, string ip ); + /* set the state of this node's pxeboot mtcAlive support */ + int set_pxeboot_mtcAlive_support ( string hostname, bool state ); + /** get hostname for any hostname */ string get_hostname ( string hostaddr ); @@ -1732,6 +1754,13 @@ public: * interface is on the 'lo' (localhost) interface. */ bool pxeboot_network_provisioned ; + /** A boolean that is used to indicate whether this node supports + * pxeboot mtcAlive messaging. + * This is needed to support upgrades to nodes that don't support + * this feature prior to their upgrade. 
+ * Assuming controllers are upgraded first.*/ + bool pxeboot_mtcAlive_supported ; + /** A debug bool hat allows cluster-host heartbeat failures to only * cause host degrade rather than failure */ bool clstr_degrade_only ; @@ -1925,7 +1954,7 @@ public: /** Interface to declare that a key service on the * specified host is up, running and ready */ - int declare_service_ready ( string & hostname, unsigned int service ); + int declare_service_ready ( string & hostname, unsigned int service, string features="" ); /** Process Monitor 'Clear' Event handler. * diff --git a/mtce/src/maintenance/mtcAlarm.cpp b/mtce/src/maintenance/mtcAlarm.cpp index be705d75..449a7e7c 100644 --- a/mtce/src/maintenance/mtcAlarm.cpp +++ b/mtce/src/maintenance/mtcAlarm.cpp @@ -64,6 +64,32 @@ void mtcAlarm_init ( void ) snprintf( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, "Administratively unlock Host to bring it back in-service."); + /** pxeboot mtcAlive Alarm **************************************************/ + + ptr = &alarm_list[MTC_ALARM_ID__MTCALIVE]; + memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT))); + snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", MTCALIVE_ALARM_ID); + + ptr->name = "mtcAlive" ; + ptr->instc_prefix = "" ; + + ptr->critl_reason = + ptr->major_reason = + ptr->minor_reason = "pxeboot network communication failure"; + ptr->clear_reason = "pxeboot network communication recovered"; + + ptr->alarm.alarm_type = FM_ALARM_COMM ; + ptr->alarm.probable_cause = FM_ALARM_CAUSE_UNKNOWN; + ptr->alarm.inhibit_alarms = FM_FALSE; + ptr->alarm.service_affecting = FM_FALSE; + ptr->alarm.suppression = FM_TRUE ; + + ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */ + ptr->alarm.alarm_state = FM_ALARM_STATE_CLEAR ; /* Dynamic */ + + snprintf( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, + "Administratively Lock and Unlock host to recover. 
If problem persists, contact next level of support."); + /** Enable Alarm ************************************************************/ ptr = &alarm_list[MTC_ALARM_ID__ENABLE]; @@ -339,6 +365,7 @@ string _getIdentity ( mtc_alarm_id_enum id ) switch ( id ) { case MTC_ALARM_ID__LOCK: return (LOCK_ALARM_ID); + case MTC_ALARM_ID__MTCALIVE: return (MTCALIVE_ALARM_ID); case MTC_ALARM_ID__CONFIG: return (CONFIG_ALARM_ID); case MTC_ALARM_ID__ENABLE: return (ENABLE_ALARM_ID); case MTC_ALARM_ID__BM: return (BM_ALARM_ID); @@ -348,7 +375,7 @@ string _getIdentity ( mtc_alarm_id_enum id ) case MTC_LOG_ID__COMMAND: return (COMMAND_LOG_ID); case MTC_LOG_ID__STATECHANGE: return (STATECHANGE_LOG_ID); case MTC_LOG_ID__CONFIG: return (CONFIG_LOG_ID); - default: return ("200.000"); + default: return (SWERR_ALARM_ID); } } diff --git a/mtce/src/maintenance/mtcAlarm.h b/mtce/src/maintenance/mtcAlarm.h index e0ddf87e..075d4df6 100644 --- a/mtce/src/maintenance/mtcAlarm.h +++ b/mtce/src/maintenance/mtcAlarm.h @@ -26,6 +26,7 @@ using namespace std; typedef enum { MTC_ALARM_ID__LOCK, + MTC_ALARM_ID__MTCALIVE, MTC_ALARM_ID__CONFIG, MTC_ALARM_ID__ENABLE, MTC_ALARM_ID__BM, diff --git a/mtce/src/maintenance/mtcCompMsg.cpp b/mtce/src/maintenance/mtcCompMsg.cpp index 3ec8b93b..c6f987dd 100755 --- a/mtce/src/maintenance/mtcCompMsg.cpp +++ b/mtce/src/maintenance/mtcCompMsg.cpp @@ -247,13 +247,20 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) rc = PASS ; if ( msg.cmd == MTC_REQ_MTCALIVE ) { - ilog ("mtcAlive request received from %s network", iface_name_ptr); + alog1 ("mtcAlive request received from %s network", iface_name_ptr); if ( interface == PXEBOOT_INTERFACE ) { alog2 ("pxeboot mtcAlive buffer: %s", &msg.buf[0]); load_pxebootInfo_msg(msg); + +#ifdef WANT_FIT_TESTING + if ( ! 
daemon_want_fit ( FIT_CODE__FAIL_PXEBOOT_MTCALIVE ) ) +#endif + { + send_mtcAlive_msg ( sock_ptr, ctrl_ptr->who_i_am, interface ); + } } - return ( send_mtcAlive_msg ( sock_ptr, get_who_i_am(), interface )); + return (rc); } else if ( msg.cmd == MTC_MSG_INFO ) { @@ -749,6 +756,7 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) int mtce_send_event ( mtc_socket_type * sock_ptr, unsigned int cmd , const char * mtce_name_ptr ) { mtc_message_type event ; + ctrl_type *ctrl_ptr = get_ctrl_ptr(); int rc = PASS ; int bytes = 0 ; @@ -772,13 +780,21 @@ int mtce_send_event ( mtc_socket_type * sock_ptr, unsigned int cmd , const char event_info.append(MTC_JSON_SERVICE); event_info.append("\":\""); event_info.append(MTC_SERVICE_MTCCLIENT_NAME ); - event_info.append("\"}"); + + event_info.append("\",\"active_controller_pxeboot_address\":\""); + event_info.append(ctrl_ptr->pxeboot_addr_active_controller); + + event_info.append("\",\""); + event_info.append(MTC_JSON_FEATURES); + event_info.append("\":[\""); + event_info.append(MTC_PXEBOOT_MTCALIVE); + event_info.append("\"]}"); size_t len = event_info.length()+1 ; snprintf ( &event.hdr[0], MSG_HEADER_SIZE, "%s", get_mtce_event_header()); snprintf ( &event.buf[0], len, "%s", event_info.data()); bytes = ((sizeof(mtc_message_type))-(BUF_SIZE-len)); - ilog ("%s %s ready", get_hostname().c_str(), MTC_SERVICE_MTCCLIENT_NAME); + dlog ("%s %s ready", get_hostname().c_str(), MTC_SERVICE_MTCCLIENT_NAME); } else if (( cmd == MTC_EVENT_AVS_CLEAR ) || ( cmd == MTC_EVENT_AVS_MAJOR ) || @@ -849,10 +865,37 @@ int mtce_send_event ( mtc_socket_type * sock_ptr, unsigned int cmd , const char } else { - elog ("cannot send to null or failed socket (%s)", + elog ("cannot send to null or failed management network socket (%s)", get_interface_name_str (MGMNT_INTERFACE) ); rc = FAIL_SOCKET_SENDTO ; } + + // Only the events sent on the pxeboot network are: + // - ready event + if (( cmd == MTC_EVENT_MONITOR_READY ) && + ( 
sock_ptr->pxeboot_tx_socket > 0 ) && + ( !ctrl_ptr->pxeboot_addr_active_controller.empty())) + { + int flags = 0 ; // no tx flags + struct sockaddr_in hostAddr; + memset(&hostAddr, 0, sizeof(hostAddr)); + print_mtc_message ( ctrl_ptr->pxeboot_addr_active_controller.data(), MTC_CMD_TX, event, get_interface_name_str(PXEBOOT_INTERFACE), false); + hostAddr.sin_addr.s_addr = inet_addr(ctrl_ptr->pxeboot_addr_active_controller.data()); + hostAddr.sin_family = AF_INET; + hostAddr.sin_port = htons(sock_ptr->mtc_tx_pxeboot_port); + + ssize_t bytes_sent = sendto(sock_ptr->pxeboot_tx_socket, &event.hdr[0], bytes, flags, + (const struct sockaddr*)&hostAddr, sizeof(hostAddr)); + if (bytes_sent <= 0) + { + elog ("failed to send %s to %s:%d on %s network (rc:%ld) (%d:%m)", + get_mtcNodeCommand_str(event.cmd), + ctrl_ptr->pxeboot_addr_active_controller.c_str(), + ntohs(hostAddr.sin_port), + get_interface_name_str(PXEBOOT_INTERFACE), + bytes_sent, errno); + } + } return rc ; } diff --git a/mtce/src/maintenance/mtcCtrlMsg.cpp b/mtce/src/maintenance/mtcCtrlMsg.cpp index 96f8ef8b..353fd130 100755 --- a/mtce/src/maintenance/mtcCtrlMsg.cpp +++ b/mtce/src/maintenance/mtcCtrlMsg.cpp @@ -443,6 +443,7 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr, string service = "" ; string sensor = "" ; string process = "" ; + hostname = "unknown" ; int rc1 = FAIL ; @@ -493,7 +494,17 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr, } else if ( service == MTC_SERVICE_MTCCLIENT_NAME ) { - ilog ("%s %s ready", hostname.c_str(), MTC_SERVICE_MTCCLIENT_NAME); + string features= "" ; + if ( jsonUtil_get_key_val(&msg.buf[0], MTC_JSON_FEATURES, features ) == PASS ) + { + dlog ("%s %s features: %s", hostname.c_str(), service.c_str(), features.c_str()); + } + else + { + ilog ("%s %s not offering feature list ; node may have upgrade pending", + hostname.c_str(), MTC_SERVICE_MTCCLIENT_NAME); + } + obj_ptr->declare_service_ready ( hostname, MTC_SERVICE_MTCCLIENT, features ); /* if this ready event is from the 
mtcClient of a * controller that has valid bmc access info then @@ -697,7 +708,9 @@ int send_mtc_cmd ( string & hostname, int cmd , int interface, string json_dict * controller's pxeboot ip addresses so it knows where to send. */ obj_ptr->pxebootInfo_loader(); data = "{\"pxebootInfo\":{" ; - data.append ("\"address\":\""); + data.append ("\""); + data.append (CONTROLLER); + data.append ("\":\""); data.append (obj_ptr->my_pxeboot_ip); data.append ("\",\""); data.append (CONTROLLER_0); diff --git a/mtce/src/maintenance/mtcNodeComp.cpp b/mtce/src/maintenance/mtcNodeComp.cpp index 3c25d760..f5bad581 100644 --- a/mtce/src/maintenance/mtcNodeComp.cpp +++ b/mtce/src/maintenance/mtcNodeComp.cpp @@ -337,6 +337,7 @@ int daemon_configure ( void ) rc = PASS ; } + daemon_load_fit(); return (rc); } @@ -1267,6 +1268,9 @@ int daemon_init ( string iface, string nodetype_str ) ctrl.clstr_iface_provisioned = false ; ctrl.pxeboot_iface_provisioned = false ; ctrl.peer_ctrlr_reset.sync = false ; + ctrl.pxeboot_addr_c0 = "" ; + ctrl.pxeboot_addr_c1 = "" ; + ctrl.pxeboot_addr_active_controller = "" ; /* convert node type to integer */ ctrl.nodetype = get_host_function_mask ( nodetype_str ) ; @@ -1295,9 +1299,12 @@ int daemon_init ( string iface, string nodetype_str ) } else { - // Ready to do pxeboot messaging + ilog ("Mgmnt iface : %s", ctrl.mgmnt_iface.c_str()); + + // Not on LO, assume pxeboot provisioning starting with it being + // equal to the management interface, until otherwise updated due + // to bonding or vlan modes. 
ctrl.pxeboot_iface = ctrl.mgmnt_iface ; - ilog ("Pxeboot iface %s", ctrl.pxeboot_iface.c_str()); ctrl.pxeboot_iface_provisioned = true ; } } @@ -1437,7 +1444,9 @@ void daemon_service_run ( void ) ctrl.peer_ctrlr_reset.audit_period ); } + /* Send the mtcClient ready event and clear the periodic event counter */ mtce_send_event ( sock_ptr, MTC_EVENT_MONITOR_READY, NULL ); + ctrl.ready_event_counter = 0 ; /* lets go select so that the sock does not go crazy */ dlog ("%s running main loop with %d msecs socket timeout\n", @@ -1646,7 +1655,7 @@ void daemon_service_run ( void ) } } } - if ( ctrl.timer.ring == true ) + if ( mtcTimer_expired ( ctrl.timer ) ) { bool socket_reinit = true ; @@ -1745,7 +1754,13 @@ void daemon_service_run ( void ) string who_i_am = _self_identify ( ctrl.nodetype_str ); } alog1 ("sending mtcAlive on all provisioned mtcAlive networks"); - send_mtcAlive_msg ( sock_ptr, ctrl.who_i_am, PXEBOOT_INTERFACE ); + +#ifdef WANT_FIT_TESTING + if ( ! daemon_want_fit ( FIT_CODE__FAIL_PXEBOOT_MTCALIVE ) ) +#endif + { + send_mtcAlive_msg ( sock_ptr, ctrl.who_i_am, PXEBOOT_INTERFACE ); + } send_mtcAlive_msg ( sock_ptr, ctrl.who_i_am, MGMNT_INTERFACE ); if (( ctrl.clstr_iface_provisioned == true ) && ( mtc_sock.mtc_client_clstr_rx_socket != NULL ) && @@ -1801,6 +1816,22 @@ void daemon_service_run ( void ) _close_amon_sock (); } } + + // Purpose: mtcClient ready event audit + // + // Send the ready event every minute just in case the first + // process startup event was missed by the mtcAgent or + // the mtcAgent was restarted. + // + // Needed to ensure that pxeboot mtcAlive messaging monitoring + // gets started over a mtcAgent process restart. 
+ if ( ++ctrl.ready_event_counter >= (MTC_MINS_1/MTC_ALIVE_TIMER) ) + { + + dlog ("sending mtcClient ready event"); + mtce_send_event ( sock_ptr, MTC_EVENT_MONITOR_READY, NULL ); + ctrl.ready_event_counter = 0 ; + } } /* service controller specific audits */ @@ -2388,8 +2419,9 @@ void load_mtcInfo_msg ( mtc_message_type & msg ) * Address can be empty of an unprovisioned controller. * * { "pxebootInfo":{ - * "controller-0":"169.254.202.2", - * "controller-1":"169.254.202.3" + * "controller" : "169.254.202.2", + * "controller-0" : "169.254.202.2", + * "controller-1" : "169.254.202.3" * } * } * @@ -2398,18 +2430,43 @@ void load_mtcInfo_msg ( mtc_message_type & msg ) ***************************************************************************/ void load_pxebootInfo_msg ( mtc_message_type & msg ) { - struct json_object *_obj = json_tokener_parse( &msg.buf[0] ); - if ( _obj ) + struct json_object *json_obj = json_tokener_parse( &msg.buf[0] ); + if ( json_obj ) { const char dict_label [] = "pxebootInfo" ; struct json_object *info_obj = (struct json_object *)(NULL); - json_bool json_rc = json_object_object_get_ex( _obj, + json_bool json_rc = json_object_object_get_ex( json_obj, &dict_label[0], &info_obj ); if ( ( json_rc == true ) && ( info_obj ) ) { + jlog ("%s: %s ", &dict_label[0], json_object_get_string(info_obj)); struct json_object *ctrl_obj = (struct json_object *)(NULL); + json_rc = json_object_object_get_ex( info_obj, CONTROLLER, &ctrl_obj ); + if (( json_rc == true ) && ( ctrl_obj )) + { + string active_controller = json_object_get_string(ctrl_obj); + if ( ctrl.pxeboot_addr_active_controller != active_controller ) + { + string prefix = "controller pxeboot address" ; + if ( ctrl.pxeboot_addr_active_controller.empty() ) + { + ilog ("%s: %s", + prefix.c_str(), + active_controller.c_str()); + } + else + { + ilog ("%s: %s ; was %s", + prefix.c_str(), + active_controller.c_str(), + ctrl.pxeboot_addr_active_controller.c_str()); + } + ctrl.pxeboot_addr_active_controller = 
active_controller ; + } + } + // now get the individual controller addresses string pxeboot_addr_cx[CONTROLLERS] = {CONTROLLER_0, CONTROLLER_1}; for (int c = 0 ; c < CONTROLLERS ; c++) { @@ -2423,8 +2480,7 @@ void load_pxebootInfo_msg ( mtc_message_type & msg ) // get the current pxeboot address for the in loop controller cur_pxeboot_addr = (controller == CONTROLLER_0) ? ctrl.pxeboot_addr_c0 : ctrl.pxeboot_addr_c1; - json_bool json_rc = - json_object_object_get_ex( info_obj, controller.data(), &ctrl_obj ); + json_rc = json_object_object_get_ex( info_obj, controller.data(), &ctrl_obj ); if (( json_rc == true ) && (ctrl_obj)) { jlog ("controller-x obj data: %s", json_object_get_string(ctrl_obj)); @@ -2477,7 +2533,7 @@ void load_pxebootInfo_msg ( mtc_message_type & msg ) elog("Failed to parse '%s' from mtcAlive request message: %s", &dict_label[0], &msg.buf[0]); } - json_object_put(_obj); + json_object_put(json_obj); } else { diff --git a/mtce/src/maintenance/mtcNodeComp.h b/mtce/src/maintenance/mtcNodeComp.h index a61a9317..01b8901d 100644 --- a/mtce/src/maintenance/mtcNodeComp.h +++ b/mtce/src/maintenance/mtcNodeComp.h @@ -102,6 +102,7 @@ typedef struct string pxeboot_addr ; string pxeboot_addr_c0 ; string pxeboot_addr_c1 ; + string pxeboot_addr_active_controller ; // Assume address is learned to start even though it's likely not. // This enabled the first not learned log followed by a learned @@ -147,6 +148,10 @@ typedef struct string mtcAgent_ip ; peer_ctrlr_reset_type peer_ctrlr_reset; + + /* throttles sending the periodic mtcClient ready event. 
*/ + int ready_event_counter ; + } ctrl_type ; ctrl_type * get_ctrl_ptr ( void ); diff --git a/mtce/src/maintenance/mtcNodeFsm.cpp b/mtce/src/maintenance/mtcNodeFsm.cpp index df9a8d54..74fc1a0d 100755 --- a/mtce/src/maintenance/mtcNodeFsm.cpp +++ b/mtce/src/maintenance/mtcNodeFsm.cpp @@ -90,14 +90,15 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr ) * with mtcAlive debouncing */ nodeLinkClass::online_handler ( node_ptr ); - - /* - * Always run the mtcAlive handler. - * + /* pxeboot_mtcAlive_monitor * - monitor host's mtcAlive messaging * - manage host's mtcAlive missing alarm - */ - nodeLinkClass::pxeboot_mtcAlive_monitor ( node_ptr ); + * + * Don't monitor pxeboot mtcAlive messaging while the pxeboot network is + * not provisioned or that node has not yet reported that it supports + * pxeboot mtcAlive messaging */ + if ( this->pxeboot_network_provisioned && node_ptr->pxeboot_mtcAlive_supported ) + nodeLinkClass::pxeboot_mtcAlive_monitor ( node_ptr ); if ( node_ptr->adminAction == MTC_ADMIN_ACTION__DELETE ) { @@ -106,7 +107,6 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr ) return (PASS); } - /* Run the config FSM if the configAction bool is set. 
* We keep this as a separate action unto itself so that * mtce can continue to service all other actions for the diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index 465c3750..7084dbab 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -3433,15 +3433,20 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr ) plog ("%s offline (external)\n", node_ptr->hostname.c_str()); node_ptr->offlineStage = MTC_OFFLINE__IDLE ; } - else if ( !node_ptr->mtcAlive_mgmnt && !node_ptr->mtcAlive_clstr ) + else if ( !node_ptr->mtcAlive_mgmnt && !node_ptr->mtcAlive_clstr && !node_ptr->mtcAlive_pxeboot ) { if ( ++node_ptr->offline_search_count > offline_threshold ) { node_ptr->mtcAlive_online = false ; + node_ptr->mtcClient_ready = false ; - // Clear all the mtcAlive_sequence numbers + + // Clear all the mtcAlive counts and sequence numbers + node_ptr->mtcAlive_mgmnt_count = 0 ; + node_ptr->mtcAlive_clstr_count = 0 ; + node_ptr->mtcAlive_pxeboot_count = 0 ; for (int i = 0 ; i < MTCALIVE_INTERFACES_MAX ; i++) - node_ptr->mtcAlive_sequence[i] = 0; + node_ptr->mtcAlive_sequence[i] = 0; plog ("%s going offline ; (threshold (%d msec * %d)\n", node_ptr->hostname.c_str(), @@ -3485,13 +3490,15 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr ) **/ node_ptr->mtcAlive_online = true ; - ilog ("%s still seeing mtcAlive (%d) (Mgmt:%c:%d Clstr:%c:%d) ; restart offline_search_count=%d of %d\n", + ilog ("%s still seeing mtcAlive (%d) (Mgmt:%c:%d Clstr:%c:%d Pxeboot:%c:%d) ; restart offline_search_count=%d of %d\n", node_ptr->hostname.c_str(), node_ptr->mtcAlive_count, node_ptr->mtcAlive_mgmnt ? 'Y' : 'n', node_ptr->mtcAlive_mgmnt_count, node_ptr->mtcAlive_clstr ? 'Y' : 'n', node_ptr->mtcAlive_clstr_count, + node_ptr->mtcAlive_pxeboot ? 
'Y' : 'n', + node_ptr->mtcAlive_pxeboot_count, node_ptr->offline_search_count, offline_threshold ); node_ptr->offline_search_count = 0 ; /* reset the search count */ @@ -6261,6 +6268,8 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__ENABLE); EFmAlarmSeverityT config_alarm_severity = mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__CONFIG); + EFmAlarmSeverityT mtcAlive_alarm_severity = + mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__MTCALIVE); /* Clear generic enable alarm over process restart. * Will get reasserted if the cause condition still exists */ @@ -6284,6 +6293,21 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) alarmUtil_getSev_str(config_alarm_severity).c_str()); } + /* The mtcAlive alarm is maintained if it exists. + * The pxeboot_mtcAlive_monitor will clear the alarm + * if it exists and the pxeboot mtcAlive messaging works. */ + if ( mtcAlive_alarm_severity != FM_ALARM_SEVERITY_CLEAR ) + { + node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] = mtcAlive_alarm_severity ; + ilog ("%s found mtcAlive alarm ; loaded %s", + node_ptr->hostname.c_str(), + alarmUtil_getSev_str(mtcAlive_alarm_severity).c_str()); + + // Load up the miss and loss counts used for recovery + node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE] = PXEBOOT_MTCALIVE_LOSS_ALARM_THRESHOLD ; + node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE] = PXEBOOT_MTCALIVE_LOSS_THRESHOLD ; + } + if ( is_controller(node_ptr) ) { this->controllers++ ; @@ -7571,29 +7595,46 @@ int nodeLinkClass::oos_test_handler ( struct nodeLinkClass::node * node_ptr ) // Returns : PASS // /////////////////////////////////////////////////////////////////////////////// -#define PXEBOOT_MTCALIVE_MONITOR_RATE_SECS (10) -#define PXEBOOT_MTCALIVE_LOSS_THRESHOLD (6) -#define PXEBOOT_MTCALIVE_NOT_SEEN_LOG_THROTTLE (6) -#define PXEBOOT_MTCALIVE_LOSS_LOG_THROTTLE (6) int nodeLinkClass::pxeboot_mtcAlive_monitor ( struct 
nodeLinkClass::node * node_ptr ) { - // ERIK: TODO: Comment out once verified flog ("%s pxeboot mtcAlive fsm stage: %s", node_ptr->hostname.c_str(), get_mtcAliveStages_str(node_ptr->mtcAliveStage).c_str()); - if ( !this->pxeboot_network_provisioned ) return PASS ; + + // Don't monitor pxeboot mtcAlive messaging while the node is + // locked or in the following administrative action states. + if (( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ) || + ( node_ptr->adminAction == MTC_ADMIN_ACTION__UNLOCK ) || + ( node_ptr->adminAction == MTC_ADMIN_ACTION__ENABLE ) || + ( node_ptr->adminAction == MTC_ADMIN_ACTION__RECOVER ) || + ( node_ptr->adminAction == MTC_ADMIN_ACTION__POWERCYCLE )) + { + // Clear the alarm if the node is locked + if (( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ) && + ( node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] != FM_ALARM_SEVERITY_CLEAR )) + alarm_mtcAlive_clear (node_ptr, PXEBOOT_INTERFACE); + // Switch to START if not already there + if ( node_ptr->mtcAliveStage != MTC_MTCALIVE__START ) + mtcAliveStageChange (node_ptr, MTC_MTCALIVE__START); + return PASS ; + } switch (node_ptr->mtcAliveStage) { + // Starts from scratch. Resets a running timer and the sequence numbers but not the alarm. case MTC_MTCALIVE__START: { alog2 ("%s mtcAlive start", node_ptr->hostname.c_str()); - mtcTimer_reset ( node_ptr->mtcAlive_timer ); + if ( ! mtcTimer_expired (node_ptr->mtcAlive_timer) ) + mtcTimer_reset (node_ptr->mtcAlive_timer); node_ptr->mtcAlive_sequence[PXEBOOT_INTERFACE] = 0 ; + node_ptr->mtcAlive_sequence_save[PXEBOOT_INTERFACE] = 0 ; mtcAliveStageChange (node_ptr, MTC_MTCALIVE__SEND); - break ; + return PASS ; } + // Reloads the controller's pxeboot info and sends it with a mtcAlive request + // telling the remote node to send mtcAlive to the active controller.
case MTC_MTCALIVE__SEND: { /* pxeboot info refresh audit */ @@ -7601,25 +7642,31 @@ int nodeLinkClass::pxeboot_mtcAlive_monitor ( struct nodeLinkClass::node * node_ pxebootInfo_loader (); alog2 ("%s mtcAlive send", node_ptr->hostname.c_str()); send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MTCALIVE, PXEBOOT_INTERFACE ); - node_ptr->mtcAlive_sequence_save[PXEBOOT_INTERFACE] = 0 ; - node_ptr->mtcAlive_sequence_miss[PXEBOOT_INTERFACE] = 0 ; mtcAliveStageChange (node_ptr, MTC_MTCALIVE__MONITOR); - break ; + return PASS ; } + // Start the Wait timer 2x longer than the expected mtcAlive cadence case MTC_MTCALIVE__MONITOR: { alog2 ("%s mtcAlive monitor", node_ptr->hostname.c_str()); mtcTimer_start ( node_ptr->mtcAlive_timer, mtcTimer_handler, PXEBOOT_MTCALIVE_MONITOR_RATE_SECS ); mtcAliveStageChange (node_ptr, MTC_MTCALIVE__WAIT); - break ; + return PASS ; } + // Wait for the timer to expire case MTC_MTCALIVE__WAIT: { if ( mtcTimer_expired ( node_ptr->mtcAlive_timer ) ) mtcAliveStageChange (node_ptr, MTC_MTCALIVE__CHECK); - break ; + return PASS ; } + // Check the mtcAlive sequence numbers and handle each possible case + // success - mtcAlive sequence number is greater than the last one - may clear alarm + // out-of-sequence - mtcAlive sequence number is less than the last one - may assert alarm + // miss - mtcAlive sequence number is equal to the last one - count misses + // loss - mtcAlive messaging miss count exceeded threshold - assert alarm + // not seen - waiting for first mtcAlive following reboot - request mtcAlive case MTC_MTCALIVE__CHECK: { if ( node_ptr->mtcAlive_sequence[PXEBOOT_INTERFACE] > node_ptr->mtcAlive_sequence_save[PXEBOOT_INTERFACE] ) @@ -7632,70 +7679,132 @@ int nodeLinkClass::pxeboot_mtcAlive_monitor ( struct nodeLinkClass::node * node_ node_ptr->mtcAlive_sequence_save[PXEBOOT_INTERFACE]); // Now that we received a message we can dec the missed count - if ( node_ptr->mtcAlive_sequence_miss[PXEBOOT_INTERFACE] ) - 
node_ptr->mtcAlive_sequence_miss[PXEBOOT_INTERFACE]-- ; - node_ptr->pxeboot_mtcAlive_not_seen_log_throttle = 0 ; - node_ptr->pxeboot_mtcAlive_loss_log_throttle = 0 ; + // and clear the alarm if it exists + if ( node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE] ) + { + // Set miss count to max if we have reached at least one loss but no alarm yet + if (( node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] == FM_ALARM_SEVERITY_CLEAR ) && + ( node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE] )) + { + node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE] = PXEBOOT_MTCALIVE_LOSS_THRESHOLD ; + node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE] = 0 ; + } + + ilog ("%s pxeboot mtcAlive miss count %d ; decrement %s; recovery", + node_ptr->hostname.c_str(), + node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE], + node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] ? "; alarm clear when 0 " : ""); + node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE]-- ; + } + else + { + // Clear alarm and start with a clean loss slate. The miss count is already zero + node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE] = 0 ; + alarm_mtcAlive_clear ( node_ptr, PXEBOOT_INTERFACE ); + } + + // Clear the log throttles now that we have received a message + if ( node_ptr->pxeboot_mtcAlive_not_seen_log_throttle || node_ptr->pxeboot_mtcAlive_loss_log_throttle ) + { + node_ptr->pxeboot_mtcAlive_not_seen_log_throttle = 0 ; + node_ptr->pxeboot_mtcAlive_loss_log_throttle = 0 ; + } + mtcAliveStageChange (node_ptr, MTC_MTCALIVE__MONITOR); } else if ( node_ptr->mtcAlive_sequence[PXEBOOT_INTERFACE] < node_ptr->mtcAlive_sequence_save[PXEBOOT_INTERFACE] ) { - // unexpected case - wlog ("%s mtcAlive out-of-sequence ; this:%d last:%d", + // mtcClient restart case + if ( ++node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE] < PXEBOOT_MTCALIVE_LOSS_THRESHOLD ) + { + // The mtcClient on this host may have been restarted + mtcAliveStageChange (node_ptr, MTC_MTCALIVE__SEND); + } + else + mtcAliveStageChange (node_ptr, MTC_MTCALIVE__FAIL); + + wlog ("%s pxeboot
mtcAlive miss count %d ; loss count %d ; out-of-sequence ; this:%d last:%d", node_ptr->hostname.c_str(), + node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE], + node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE], node_ptr->mtcAlive_sequence[PXEBOOT_INTERFACE], node_ptr->mtcAlive_sequence_save[PXEBOOT_INTERFACE]); - node_ptr->mtcAlive_sequence_miss[PXEBOOT_INTERFACE]++ ; - mtcAliveStageChange (node_ptr, MTC_MTCALIVE__START); } - else if ( ++node_ptr->mtcAlive_sequence_miss[PXEBOOT_INTERFACE] < PXEBOOT_MTCALIVE_LOSS_THRESHOLD ) + else if ( ++node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE] < PXEBOOT_MTCALIVE_LOSS_THRESHOLD ) { // Missing pxeboot mtcAlive - alog ("%s pxeboot mtcAlive miss count %d ; sending request", + wlog ("%s pxeboot mtcAlive miss count %d ; loss count %d ; sending request", node_ptr->hostname.c_str(), - node_ptr->mtcAlive_sequence_miss[PXEBOOT_INTERFACE]); - send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MTCALIVE, PXEBOOT_INTERFACE ); - mtcAliveStageChange (node_ptr, MTC_MTCALIVE__MONITOR); - } - else if ( node_ptr->mtcAlive_pxeboot == true ) - { - wlog_throttled (node_ptr->pxeboot_mtcAlive_loss_log_throttle, - PXEBOOT_MTCALIVE_LOSS_LOG_THROTTLE, - "%s pxeboot mtcAlive loss ; missed: %d ; last: count:%d seq: %d ; sending request", - node_ptr->hostname.c_str(), - node_ptr->mtcAlive_sequence_miss[PXEBOOT_INTERFACE], - node_ptr->mtcAlive_pxeboot_count, - node_ptr->mtcAlive_sequence_save[PXEBOOT_INTERFACE]); + node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE], + node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE]); + // The mtcClient on this host may have been restarted mtcAliveStageChange (node_ptr, MTC_MTCALIVE__SEND); } else { - ilog_throttled (node_ptr->pxeboot_mtcAlive_not_seen_log_throttle, - PXEBOOT_MTCALIVE_NOT_SEEN_LOG_THROTTLE, - "%s pxeboot mtcAlive not seen yet ; sending request", - node_ptr->hostname.c_str()); - mtcAliveStageChange (node_ptr, MTC_MTCALIVE__SEND); + if ( node_ptr->mtcAlive_pxeboot == true ) + { + // If we get here it's a loss +
wlog_throttled (node_ptr->pxeboot_mtcAlive_loss_log_throttle, + PXEBOOT_MTCALIVE_LOSS_LOG_THROTTLE, + "%s pxeboot mtcAlive lost ; missed: %d ; last: count:%d seq: %d ; sending request", + node_ptr->hostname.c_str(), + node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE], + node_ptr->mtcAlive_pxeboot_count, + node_ptr->mtcAlive_sequence_save[PXEBOOT_INTERFACE]); + } + else + { + // Otherwise still searching beyond threshold for the first mtcAlive after reboot or graceful recovery + ilog_throttled (node_ptr->pxeboot_mtcAlive_not_seen_log_throttle, + PXEBOOT_MTCALIVE_NOT_SEEN_LOG_THROTTLE, + "%s pxeboot mtcAlive not seen yet ; sending request", + node_ptr->hostname.c_str()); + } + mtcAliveStageChange (node_ptr, MTC_MTCALIVE__FAIL); } + + // Prevent the miss count from being larger than the loss, and therefore the alarm clear recovery, threshold. + if (node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE] > PXEBOOT_MTCALIVE_LOSS_THRESHOLD) + node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE] = PXEBOOT_MTCALIVE_LOSS_THRESHOLD; + node_ptr->mtcAlive_sequence_save[PXEBOOT_INTERFACE] = node_ptr->mtcAlive_sequence[PXEBOOT_INTERFACE] ; - - // TODO (emacdona): Need to handle loss case that manages raising the alarm - // Transition to MTC_MTCALIVE__FAIL - break ; } case MTC_MTCALIVE__FAIL: - { - wlog ("%s mtcAlive fail", node_ptr->hostname.c_str()); - mtcAliveStageChange (node_ptr, MTC_MTCALIVE__START); - break ; - } default: { - slog ("%s mtcAlive fsm default", node_ptr->hostname.c_str()); + alog2 ("%s mtcAlive fail", node_ptr->hostname.c_str()); + if ( node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] == FM_ALARM_SEVERITY_CLEAR ) + { + if ( ++node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE] < PXEBOOT_MTCALIVE_LOSS_ALARM_THRESHOLD ) + { + wlog ("%s pxeboot mtcAlive lost ; %d more loss before alarm assertion", + node_ptr->hostname.c_str(), + PXEBOOT_MTCALIVE_LOSS_ALARM_THRESHOLD - node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE] ); + + // Start the misses counter over again after each loss debounce
+ node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE] = 0 ; + } + else + { + ilog ("%s pxeboot mtcAlive alarm assert (%d)", + node_ptr->hostname.c_str(), + node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE]); + alarm_mtcAlive_failure ( node_ptr, PXEBOOT_INTERFACE ); + } + } mtcAliveStageChange (node_ptr, MTC_MTCALIVE__START); break ; } } + if ( node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE] || node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE] ) + { + alog2 ("%s pxeboot mtcAlive: Miss: %d of %d , Loss: %d of %d", + node_ptr->hostname.c_str(), + node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE], PXEBOOT_MTCALIVE_LOSS_THRESHOLD, + node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE], PXEBOOT_MTCALIVE_LOSS_ALARM_THRESHOLD); + } return (PASS); }