Add pxeboot mtcAlive messaging alarm handling

This update adds alarm handling to the recently introduced pxeboot
network mtcAlive messaging, see depends on review below.

A new 200.003 maintenance alarm is introduced with the second depends
on update below. This new alarm is MINOR but also Management Affecting
because the pxeboot network is required for node installation.

This update enhances the new pxeboot_mtcAlive_monitor FSM for the
purpose of detecting pxeboot mtcAlive message loss, alarming and
then clearing the alarm once pxeboot mtcAlive messaging resumes.

The new alarm assertion and clear is debounced:
 - alarm is asserted if message loss persists to the accumulation of
   12 missed messages or after 2 minutes of complete message loss.
 - alarm is cleared after decrementing the message missed counter to
   zero or 1 minute of loss-less messaging.

Upgrades are supported with the addition of a features list to the
mtcClient ready event. All new mtcClients that support pxeboot network
messaging now publish pxeboot mtcAlive support through this new
features list. This is rendered in the logs like this:

    <hostname> mtcClient ready ; with pxeboot mtcAlive support

The mtcAgent does not expect/monitor pxeboot mtcAlive messages from
hosts that don't publish the feature support.

Test Plan:

PASS: Verify mtcAlive period is 5 seconds.
PASS: Verify pxeboot mtcAlive monitor period is 10 seconds.
PASS: Verify mtcAgent sends mtcClient a mtcAlive request on every
      mtcAlive monitor miss.
PASS: Verify pxeboot mtcAlive alarm is not raised while a node is
      locked.

Alarm attributes:

PASS: Verify severity is minor.
PASS: Verify alarm is cleared while node is locked.
PASS: Verify alarm can be suppressed while unlocked.
PASS: Verify asserted alarm is management affecting.
PASS: Verify alarm-show output format including cause and repair
      action text.

Process Restart Handling:

PASS: Verify alarm is maintained over a mtcAgent process restart.
PASS: Verify pxeboot monitoring resumes with or without asserted alarm
      immediately following a mtcAgent process restart.
PASS: Verify mtcClient learns and starts pxeboot mtcAlive messaging
      immediately following mtcClient process restart for locked or
      unlocked nodes.

Alarm Debounce Handling:

PASS: Verify alarm assertion only after 2 minutes of mtcAlive loss.
PASS: Verify alarm clear after 1 minute of mtcAlive recovery.
PASS: Verify assertion and recovery debounce logging.
PASS: Verify alarm management miss and loss controls handle all
      boundary conditions exercised by a 12 hr soak with randomized
      period between message loss and recovery.

Host Action Handling:

PASS: Verify mtcAlive alarm is not raised over a Host Unlock Enable.
PASS: Verify mtcAlive alarm is not raised over a Host Graceful Recovery.
PASS: Verify mtcAlive alarm is not raised over a Host Power Off/On.
PASS: Verify mtcAlive alarm is not raised over a Host Reboot/Reset.
PASS: Verify mtcAlive alarm is not raised over a Host Reinstall.
PASS: Verify pxeboot mtcAlive is factored into Host Offline Handling.
PASS: Verify pxeboot alarm handling for node that does not send
      pxeboot mtcAlive after unlock.

Stuck Alarm Avoidance Handling:

PASS: Verify typical alarm assertion and clear handling.
PASS: Verify alarm is maintained or cleared over node reboot if the
      messaging issue persists or resolves over the reboot recovery.
PASS: Verify mtcAlive alarm is maintained over a Swact and cleared
      if the messaging is ok on the newly active controller.
PASS: Verify mtcAlive alarm assertion recovery case over uncontrolled
      Swact due to active controller reboot.
PASS: Verify alarm is cleared over a spontaneous reboot if pxeboot
      messaging recovers over that reboot.

Upgrades Case:

PASS: Verify pxeboot mtcAlive monitoring only occurs on mtcClients
      that actually support pxeboot network mtcAlive monitoring.

PASS: Verify mtcClient new features list parsing, which enables
      pxeboot mtcAlive monitoring for that node.

PASS: Verify pxeboot mtcAlive messaging monitoring is not enabled
      towards nodes whose mtcClient does not publish pxeboot mtcAlive
      messaging feature support.
PROG: Verify AIO DX upgrade from 22.12 to current master branch.
      Focus on pxeboot messaging over the upgrade process.

Depends-On: https://review.opendev.org/c/starlingx/metal/+/912654
Depends-On: https://review.opendev.org/c/starlingx/fault/+/914660
Story: 2010940
Task: 49542
Change-Id: I1b51ad9ebcf010f5dee9a86c0295be3da6e2f9b1
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2024-04-03 18:13:08 +00:00
parent 89766131af
commit 649e94c8da
14 changed files with 549 additions and 160 deletions

View File

@ -2,7 +2,7 @@
#define __ALARMUTIL_H__
/*
* Copyright (c) 2013, 2016 Wind River Systems, Inc.
* Copyright (c) 2013, 2016, 2024 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -31,6 +31,7 @@
#define SWERR_ALARM_ID ((const char *)"200.000") /* Do No Use */
#define LOCK_ALARM_ID ((const char *)"200.001")
#define MTCALIVE_ALARM_ID ((const char *)"200.003")
#define ENABLE_ALARM_ID ((const char *)"200.004")
#define MGMNT_HB_ALARM_ID ((const char *)"200.005")
#define PMOND_ALARM_ID ((const char *)"200.006")

View File

@ -120,6 +120,7 @@
#define FIT_CODE__FORCE_LOCK_HOST (32)
#define FIT_CODE__UNLOCK_HOST (33)
#define FIT_CODE__FAIL_SWACT (34)
#define FIT_CODE__FAIL_PXEBOOT_MTCALIVE (35)
#define FIT_CODE__FM_SET_ALARM (40)
#define FIT_CODE__FM_GET_ALARM (41)

View File

@ -223,6 +223,10 @@ typedef enum
#define MTC_JSON_SEVERITY "severity"
#define MTC_JSON_SENSOR "sensor"
#define MTC_JSON_PROCESS "process"
#define MTC_JSON_FEATURES "features"
/* Used by the mtcClient with the MTC_JSON_FEATURES label above */
#define MTC_PXEBOOT_MTCALIVE "pxeboot_mtcAlive"
/* Mtce Info Keys */
#define MTCE_INFO_KEY__BMC_PROTOCOL "bmc_protocol"
@ -614,6 +618,7 @@ typedef struct
#define MTC_SERVICE_PMOND (0xB00BF00D)
#define MTC_SERVICE_HWMOND (0xF00BF00D)
#define MTC_SERVICE_HEARTBEAT (0xBABEF00D)
#define MTC_SERVICE_MTCCLIENT (0xABCDF00D)
/** process to process loopback command */
#define MTC_EVENT_LOOPBACK (0x01010101)

View File

@ -2,7 +2,7 @@
#define __INCLUDE_ALARM_H__
/*
* Copyright (c) 2016-2017 Wind River Systems, Inc.
* Copyright (c) 2016-2017, 2024 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -14,38 +14,17 @@
*/
#include "nodeBase.h"
#include "nodeUtil.h" /* for ... common utilities */
#include "nodeUtil.h" /* for ... common utilities */
#include "alarmUtil.h" /* for ... common alarm definitions */
#include "msgClass.h" /* for ... msgClassSock type definition */
/* external APIs */
#include "fmAPI.h"
#define ENTITY_PREFIX ((const char *)"host=")
#define MAX_ALARM_REQ_PER_MSG (4)
#define MAX_ALARM_REQ_MSG_SIZE (500)
#define MAX_ALARM_REQ_SIZE (MAX_ALARM_REQ_PER_MSG*MAX_ALARM_REQ_MSG_SIZE)
#define SWERR_ALARM_ID ((const char *)"200.000") /* Do No Use */
#define LOCK_ALARM_ID ((const char *)"200.001")
#define ENABLE_ALARM_ID ((const char *)"200.004")
#define MGMNT_HB_ALARM_ID ((const char *)"200.005")
#define PMOND_ALARM_ID ((const char *)"200.006")
#define SENSOR_ALARM_ID ((const char *)"200.007") /* Sensor read alarm ; i.e. the sensor read value bad */
#define CLSTR_HB_ALARM_ID ((const char *)"200.009")
#define BM_ALARM_ID ((const char *)"200.010")
#define CONFIG_ALARM_ID ((const char *)"200.011")
#define CH_COMP_ALARM_ID ((const char *)"200.013") /* Combo Host Compute Failure - on last Controller */
#define SENSORCFG_ALARM_ID ((const char *)"200.014") /* Sensor configuration alarm ; i.e. could not add */
#define SENSORGROUP_ALARM_ID ((const char *)"200.015") /* Sensor Group Read Error */
#define LUKS_ALARM_ID ((const char *)"200.016") /* LUKS volume failure alarm */
#define EVENT_LOG_ID ((const char *)"200.020")
#define COMMAND_LOG_ID ((const char *)"200.021")
#define STATECHANGE_LOG_ID ((const char *)"200.022")
#define SERVICESTATUS_LOG_ID ((const char *)"200.023") /* log used to report service failure events against */
/** Heartbeat Alarm Abstract Reference IDs */
typedef enum
@ -85,17 +64,6 @@ int alarm_log ( string hostname, const char * id_ptr, string entity );
#else
typedef struct
{
SFmAlarmDataT alarm ;
string name ;
string instc_prefix ; /* Instance prefix i.e. "=sensor." or "=process." */
string critl_reason ;
string minor_reason ;
string major_reason ;
string clear_reason ;
} alarmUtil_type ;
typedef struct
{
string alarmid ;
@ -133,10 +101,6 @@ echo "${STR}" | socat - ${PROTOCOL}:${ADDRESS}:${port}
void alarmData_init ( void );
alarmUtil_type * alarmData_getAlarm_ptr ( string alarm_id_str );
/* in alarmUtil.cpp */
// EFmAlarmSeverityT mtcAlarm_state ( string hostname, alarm_id_enum id );
/* in alarmHdlr.cpp */
int alarmHdlr_request_handler ( char * msg_ptr );
@ -144,21 +108,6 @@ void alarmMgr_queue_clear ( void );
void alarmMgr_queue_alarm (queue_entry_type entry);
void alarmMgr_service_queue(void);
/* Clear all alarms against this host */
void alarmUtil_clear_all ( string hostname );
/**
* Query the specified alarm severity level.
* Severity levels are specified in fmAPI.h
**/
EFmAlarmSeverityT alarmUtil_query ( string hostname,
string identity,
string instance );
int alarmUtil_query_identity ( string identity,
SFmAlarmDataT * alarm_list_ptr,
unsigned int alarms_max );
int alarmUtil_clear ( string hostname, string alarm_id, string entity );
int alarmUtil_critical ( string hostname, string alarm_id, string entity, FMTimeT & timestamp );
int alarmUtil_major ( string hostname, string alarm_id, string entity, FMTimeT & timestamp );

View File

@ -19,7 +19,7 @@
#include <errno.h> /* for ENODEV, EFAULT and ENXIO */
#include <unistd.h> /* for close and usleep */
#include <algorithm>
#include <json-c/json.h> /* for ... json_tokener_parse and other json utils */
using namespace std;
#ifdef __AREA__
@ -564,6 +564,11 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
ptr->mtcAlive_clstr = false ;
ptr->mtcAlive_pxeboot = false ;
/* Assume the node's mtcClient does not support pxeboot mtcAlive
* messaging until that mtcClient reports that it does.
* This bool blocks the pxeboot_mtcAlive_monitor fsm. */
ptr->pxeboot_mtcAlive_supported = false ;
/* These counts are incremented in the set_mtcAlive member
* function and cleared in the reset progression handler. */
ptr->mtcAlive_mgmnt_count = 0 ;
@ -574,8 +579,9 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
for (int i = 0 ; i < MTCALIVE_INTERFACES_MAX ; i++)
{
ptr->mtcAlive_sequence[i] =
ptr->mtcAlive_loss_count[i] =
ptr->mtcAlive_sequence_save[i] =
ptr->mtcAlive_sequence_miss[i] =
ptr->mtcAlive_miss_count[i] =
ptr->mtcAlive_log_throttle [i] = 0 ;
}
ptr->pxeboot_mtcAlive_not_seen_log_throttle = 0 ;
@ -1886,6 +1892,66 @@ int nodeLinkClass::alarm_compute_clear ( struct nodeLinkClass::node * node_ptr,
return (PASS);
}
/*****************************************************************************
*
* Name : alarm_mtcAlive_failure, alarm_mtcAlive_clear
*
* Purpose : Generate a log and assert minor or clear alarm for mtcAlive
* messaging failures.
*
* Assumptions: Degrade not supported for minor alarms.
*
* Limitations: Only pxeboot messaging is currently supported.
* Need to introduce a network mtcAlive alarm array to track which
* ones are active if/when it comes time to support alarming of
* mtcAlive failures on other networks.
*
*****************************************************************************/
/* Assert the minor mtcAlive messaging alarm against the specified node.
 *
 * Only the pxeboot network is supported ; any other network is rejected
 * with a software error log and FAIL_INVALID_OPERATION.
 *
 * The alarm is only raised, and the cached severity only updated, when
 * the cached severity for this alarm is not already minor.
 *
 * Returns PASS unless the network is unsupported. */
int nodeLinkClass::alarm_mtcAlive_failure ( struct nodeLinkClass::node * node_ptr, int network )
{
    /* guard: reject networks this alarm does not cover */
    if ( network != PXEBOOT_INTERFACE )
    {
        slog ("%s mtcAlive alarm not supported for %s network",
                  node_ptr->hostname.c_str(),
                  get_iface_name_str(network));
        return (FAIL_INVALID_OPERATION);
    }

    /* nothing to do if the minor alarm is already recorded as asserted */
    if ( node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] == FM_ALARM_SEVERITY_MINOR )
    {
        return (PASS);
    }

    wlog ("%s minor %s mtcAlive messaging failure",
              node_ptr->hostname.c_str(),
              get_iface_name_str(network));
    mtcAlarm_minor ( node_ptr->hostname, MTC_ALARM_ID__MTCALIVE );
    node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] = FM_ALARM_SEVERITY_MINOR ;
    return (PASS);
}
/* Clear the mtcAlive messaging alarm against the specified node.
 *
 * Only the pxeboot network is supported ; any other network is rejected
 * with a software error log and FAIL_INVALID_OPERATION.
 *
 * The clear is only issued when the cached severity for this alarm is
 * not already clear. On clear, the loss counter used to debounce the
 * alarm for this network is also reset.
 *
 * Returns PASS unless the network is unsupported. */
int nodeLinkClass::alarm_mtcAlive_clear ( struct nodeLinkClass::node * node_ptr, int network )
{
    int rc = PASS ;
    if ( network != PXEBOOT_INTERFACE )
    {
        slog ("%s mtcAlive alarm not supported for %s network",
                  node_ptr->hostname.c_str(),
                  get_iface_name_str(network));
        rc = FAIL_INVALID_OPERATION ;
    }
    else if ( node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] != FM_ALARM_SEVERITY_CLEAR )
    {
        /* Log the clear as a clear ; the previous text was copied from
         * the assertion path and read like a new failure. */
        ilog ("%s %s mtcAlive messaging failure alarm clear",
                  node_ptr->hostname.c_str(),
                  get_iface_name_str(network));
        mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__MTCALIVE );
        node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] = FM_ALARM_SEVERITY_CLEAR ;

        /* restart the alarm debounce from zero for this network */
        node_ptr->mtcAlive_loss_count[network] = 0 ;
    }
    return (rc);
}
/** Host Operational State Change public member function */
int nodeLinkClass::oper_subf_state_change ( string hostname, string newOperState )
{
@ -3086,6 +3152,7 @@ void nodeLinkClass::clear_service_readies ( struct nodeLinkClass::node * node_pt
ilog ("%s clearing service ready events\n", node_ptr->hostname.c_str());
node_ptr->hbsClient_ready = false ;
node_ptr->pmond_ready = false ;
node_ptr->mtcClient_ready = false ;
}
}
}
@ -3952,6 +4019,25 @@ int nodeLinkClass::set_clstr_hostaddr ( string & hostname, string & ip )
return ( rc );
}
/* Record whether the named host's mtcClient supports pxeboot mtcAlive
 * messaging.
 *
 * When the requested state differs from the stored one, the node's
 * mtcAlive stage is changed to MTC_MTCALIVE__START before the new state
 * is stored.
 *
 * Returns PASS on success or FAIL_HOSTNAME_LOOKUP if the host is unknown. */
int nodeLinkClass::set_pxeboot_mtcAlive_support ( string hostname, bool state )
{
    nodeLinkClass::node* node_ptr = nodeLinkClass::getNode ( hostname );
    if ( node_ptr == NULL )
    {
        return ( FAIL_HOSTNAME_LOOKUP );
    }

    const bool support_changed = ( node_ptr->pxeboot_mtcAlive_supported != state );
    if ( support_changed )
    {
        mtcAliveStageChange (node_ptr, MTC_MTCALIVE__START);
    }

    dlog ("%s pxeboot mtcAlive %s supported",
              node_ptr->hostname.c_str(),
              state ? "is" : "is not");

    node_ptr->pxeboot_mtcAlive_supported = state ;
    return ( PASS );
}
string nodeLinkClass::get_hostname ( string hostaddr )
{
if (( hostaddr == LOOPBACK_IPV6 ) ||
@ -4197,7 +4283,7 @@ void nodeLinkClass::set_mtcAlive ( struct nodeLinkClass::node * node_ptr, unsign
}
if ( state_change )
{
ilog ("%s mtcAlive received from %s network with uptime:%d ; seq:%d",
ilog ("%s mtcAlive received from %s network with uptime:%d ; seq:%d ; state change",
node_ptr->hostname.c_str(),
get_iface_name_str(iface),
node_ptr->uptime,
@ -4208,22 +4294,32 @@ void nodeLinkClass::set_mtcAlive ( struct nodeLinkClass::node * node_ptr, unsign
{
if ( sequence < node_ptr->mtcAlive_sequence[iface]+1 )
{
wlog ("%s mtcAlive received from %s network with uptime:%d ; out-of-sequence ; expect:%d detect:%d ; correcting",
node_ptr->hostname.c_str(),
get_iface_name_str(iface),
node_ptr->uptime,
node_ptr->mtcAlive_sequence[iface]+1,
sequence);
// Don't warn log for mtcClient restart cases.
// ... indicated by a very low sequence number.
if ( sequence > 2 )
{
wlog ("%s mtcAlive received from %s network with uptime:%d ; out-of-sequence ; expect:%d detect:%d ; correcting",
node_ptr->hostname.c_str(),
get_iface_name_str(iface),
node_ptr->uptime,
node_ptr->mtcAlive_sequence[iface]+1,
sequence);
}
}
else
{
wlog ("%s mtcAlive received from %s network with uptime:%d ; missed %d mtcalive msgs ; expect:%d detect:%d ; correcting",
node_ptr->hostname.c_str(),
get_iface_name_str(iface),
node_ptr->uptime,
sequence-(node_ptr->mtcAlive_sequence[iface]+1),
node_ptr->mtcAlive_sequence[iface]+1,
sequence);
// Don't warn log for mtcAgent restart cases.
// ... indicated by expecting 1 and detecting a large number.
if ( node_ptr->mtcAlive_sequence[iface] > 0 )
{
wlog ("%s mtcAlive received from %s network with uptime:%d ; missed %d mtcAlive msgs ; expect:%d detect:%d ; correcting",
node_ptr->hostname.c_str(),
get_iface_name_str(iface),
node_ptr->uptime,
sequence-(node_ptr->mtcAlive_sequence[iface]+1),
node_ptr->mtcAlive_sequence[iface]+1,
sequence);
}
}
}
else
@ -5659,7 +5755,8 @@ void nodeLinkClass::manage_heartbeat_minor ( string hostname, iface_enum iface,
/** Interface to declare that a key service on the
* specified host is up, running and ready */
int nodeLinkClass::declare_service_ready ( string & hostname,
unsigned int service )
unsigned int service,
string feature_list )
{
nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr == NULL )
@ -5667,6 +5764,58 @@ int nodeLinkClass::declare_service_ready ( string & hostname,
wlog ("%s Unknown Host\n", hostname.c_str());
return FAIL_UNKNOWN_HOSTNAME ;
}
else if ( service == MTC_SERVICE_MTCCLIENT )
{
    /* mtcClient ready event handling.
     *
     * The optional feature list arrives in a message from the remote
     * mtcClient and is expected to be a json list of strings:
     *
     *     ["feature 0", "feature 1", ...]
     *
     * It is validated before being walked so that a malformed list
     * cannot take the array accessors or the string constructor down
     * an invalid path. */
    if ( ! feature_list.empty() )
    {
        struct json_object *json_obj = json_tokener_parse(feature_list.data());
        if ( json_obj == NULL )
        {
            dlog ("%s json object is NULL", hostname.c_str());
        }
        else if ( ! json_object_is_type ( json_obj, json_type_array ) )
        {
            /* array accessors must only be used on an array object */
            wlog ("%s %s features is not a list: %s",
                      hostname.c_str(),
                      MTC_SERVICE_MTCCLIENT_NAME,
                      feature_list.c_str());
            json_object_put(json_obj);
        }
        else
        {
            /* how many features are present ? */
            int features = json_object_array_length(json_obj);
            dlog ("%s %s offers %d features", hostname.c_str(), MTC_SERVICE_MTCCLIENT_NAME, features);
            for ( int f = 0 ; f < features ; f++ )
            {
                /* get the element at index f */
                struct json_object *feature_obj_ptr = json_object_array_get_idx(json_obj, f);

                /* skip null elements ; a std::string must never be
                 * constructed from a NULL pointer */
                const char * feature_ptr = feature_obj_ptr ? json_object_get_string(feature_obj_ptr) : NULL ;
                if ( feature_ptr == NULL )
                {
                    dlog ("mtcClient feature %d: null ; ignored", f);
                    continue ;
                }

                string feature = feature_ptr ;
                dlog ("mtcClient feature %d: %s", f, feature.c_str());
                if ( feature == MTC_PXEBOOT_MTCALIVE )
                {
                    dlog ("%s %s supports pxeboot mtcAlive", hostname.c_str(), MTC_SERVICE_MTCCLIENT_NAME);
                    set_pxeboot_mtcAlive_support (hostname, true);
                }
            }
            /* free the json object */
            json_object_put(json_obj);
        }
    }
    else
    {
        dlog ("%s feature list is empty", hostname.c_str());
    }

    /* only log the ready event on the not-ready to ready transition */
    if ( node_ptr->mtcClient_ready == false )
    {
        ilog ("%s %s ready %s", hostname.c_str(),
                  MTC_SERVICE_MTCCLIENT_NAME,
                  node_ptr->pxeboot_mtcAlive_supported ? "; with pxeboot mtcAlive support" : "");
        node_ptr->mtcClient_ready = true ;
    }

    /* request a pxeboot mtcAlive from clients that support it */
    if ( node_ptr->pxeboot_mtcAlive_supported )
    {
        send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MTCALIVE, PXEBOOT_INTERFACE );
    }
    return (PASS);
}
else if ( service == MTC_SERVICE_PMOND )
{
node_ptr->pmond_ready = true ;
@ -9580,7 +9729,7 @@ void nodeLinkClass::mem_log_mtcalive_pxeboot ( struct nodeLinkClass::node * node
this->pxeboot_network_provisioned ? 'Y' : 'N',
node_ptr->mtcAlive_pxeboot ? 'Y' : 'N',
node_ptr->mtcAlive_timer.ring ? 'Y' : 'N',
node_ptr->mtcAlive_sequence_miss [PXEBOOT_INTERFACE],
node_ptr->mtcAlive_miss_count [PXEBOOT_INTERFACE],
node_ptr->mtcAlive_sequence [PXEBOOT_INTERFACE],
node_ptr->mtcAlive_sequence_save [PXEBOOT_INTERFACE]);
@ -9590,9 +9739,10 @@ void nodeLinkClass::mem_log_mtcalive_pxeboot ( struct nodeLinkClass::node * node
void nodeLinkClass::mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAlarm List:%s%s%s%s%s\n",
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAlarm List:%s%s%s%s%s%s\n",
node_ptr->hostname.c_str(),
node_ptr->alarms[MTC_ALARM_ID__LOCK ] ? " Locked" : " .",
node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] ? " mtcAlive" : " .",
node_ptr->alarms[MTC_ALARM_ID__CONFIG ] ? " Config" : " .",
node_ptr->alarms[MTC_ALARM_ID__ENABLE ] ? " Enable" : " .",
node_ptr->alarms[MTC_ALARM_ID__CH_COMP ] ? " Compute" : " .",

View File

@ -335,9 +335,22 @@ private:
/* tracks the sequence number of the last <iface> mtcAlive message */
unsigned int mtcAlive_sequence [MTCALIVE_INTERFACES_MAX] ;
unsigned int mtcAlive_sequence_save[MTCALIVE_INTERFACES_MAX] ;
unsigned int mtcAlive_sequence_miss[MTCALIVE_INTERFACES_MAX] ;
unsigned int mtcAlive_miss_count [MTCALIVE_INTERFACES_MAX] ;
unsigned int mtcAlive_log_throttle [MTCALIVE_INTERFACES_MAX] ;
/* mtcAlive miss, loss, alarm and log throttle definitions */
#define PXEBOOT_MTCALIVE_MONITOR_RATE_SECS (MTC_ALIVE_TIMER*2) // monitor every 10 seconds
#define PXEBOOT_MTCALIVE_LOSS_THRESHOLD (6) // 6 misses or 1 minute if back-2-back
#define PXEBOOT_MTCALIVE_LOSS_ALARM_THRESHOLD (2) // 2 losses before recovery is 2 minutes
#define PXEBOOT_MTCALIVE_NOT_SEEN_LOG_THROTTLE (60) // not seen log every 10 minutes
#define PXEBOOT_MTCALIVE_LOSS_LOG_THROTTLE (60) // loss log every 10 minutes
/* used to debounce mtcAlive loss alarm */
unsigned int mtcAlive_loss_count [MTCALIVE_INTERFACES_MAX] ;
/* indicates boolean support for pxeboot mtcAlive messaging */
bool pxeboot_mtcAlive_supported ;
/* pxeboot mtcAlive monitor log throttles */
int pxeboot_mtcAlive_not_seen_log_throttle ;
int pxeboot_mtcAlive_loss_log_throttle ;
@ -661,6 +674,9 @@ private:
/** Host degraded due to loss of Process Monitor running flag */
bool pmon_degraded ;
/* Maintenance Client Ready */
bool mtcClient_ready ;
/** Process Monitor Ready flag and degrade list */
bool pmond_ready ;
@ -1139,6 +1155,9 @@ private:
int alarm_compute_clear ( struct nodeLinkClass::node * node_ptr, bool force );
int alarm_compute_failure ( struct nodeLinkClass::node * node_ptr , EFmAlarmSeverityT sev );
int alarm_mtcAlive_clear ( struct nodeLinkClass::node * node_ptr, int network );
int alarm_mtcAlive_failure( struct nodeLinkClass::node * node_ptr, int network );
void clear_subf_failed_bools ( struct nodeLinkClass::node * node_ptr );
void clear_main_failed_bools ( struct nodeLinkClass::node * node_ptr );
void clear_hostservices_ctls ( struct nodeLinkClass::node * node_ptr );
@ -1496,6 +1515,9 @@ public:
/* set the pxeboot network address for any hostname */
int set_pxeboot_hostaddr ( string hostname, string ip );
/* set the state of this node's pxeboot mtcAlive support */
int set_pxeboot_mtcAlive_support ( string hostname, bool state );
/** get hostname for any hostname */
string get_hostname ( string hostaddr );
@ -1732,6 +1754,13 @@ public:
* interface is on the 'lo' (localhost) interface. */
bool pxeboot_network_provisioned ;
/** A boolean that is used to indicate whether this node supports
* pxeboot mtcAlive messaging.
* This is needed to support upgrades to nodes that don't support
* this feature prior to their upgrade.
* Assuming controllers are upgraded first.*/
bool pxeboot_mtcAlive_supported ;
/** A debug bool that allows cluster-host heartbeat failures to only
* cause host degrade rather than failure */
bool clstr_degrade_only ;
@ -1925,7 +1954,7 @@ public:
/** Interface to declare that a key service on the
* specified host is up, running and ready */
int declare_service_ready ( string & hostname, unsigned int service );
int declare_service_ready ( string & hostname, unsigned int service, string features="" );
/** Process Monitor 'Clear' Event handler.
*

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2017 Wind River Systems, Inc.
* Copyright (c) 2015-2017, 2024 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -64,6 +64,32 @@ void mtcAlarm_init ( void )
snprintf( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH,
"Administratively unlock Host to bring it back in-service.");
/** pxeboot mtcAlive Alarm **************************************************/
ptr = &alarm_list[MTC_ALARM_ID__MTCALIVE];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));

/* This entry must carry the mtcAlive alarm id, not the lock alarm id ;
 * _getIdentity maps MTC_ALARM_ID__MTCALIVE to MTCALIVE_ALARM_ID. */
snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", MTCALIVE_ALARM_ID);
ptr->name = "mtcAlive" ;
ptr->instc_prefix = "" ;

/* the same reason text is used for every assertion severity */
ptr->critl_reason =
ptr->major_reason =
ptr->minor_reason = "pxeboot network communication failure";
ptr->clear_reason = "pxeboot network communication recovered";
ptr->alarm.alarm_type = FM_ALARM_COMM ;
ptr->alarm.probable_cause = FM_ALARM_CAUSE_UNKNOWN;
ptr->alarm.inhibit_alarms = FM_FALSE;
ptr->alarm.service_affecting = FM_FALSE;
ptr->alarm.suppression = FM_TRUE ;
ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
ptr->alarm.alarm_state = FM_ALARM_STATE_CLEAR ; /* Dynamic */
snprintf( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH,
"Administratively Lock and Unlock host to recover. If problem persists, contact next level of support.");
/** Enable Alarm ************************************************************/
ptr = &alarm_list[MTC_ALARM_ID__ENABLE];
@ -339,6 +365,7 @@ string _getIdentity ( mtc_alarm_id_enum id )
switch ( id )
{
case MTC_ALARM_ID__LOCK: return (LOCK_ALARM_ID);
case MTC_ALARM_ID__MTCALIVE: return (MTCALIVE_ALARM_ID);
case MTC_ALARM_ID__CONFIG: return (CONFIG_ALARM_ID);
case MTC_ALARM_ID__ENABLE: return (ENABLE_ALARM_ID);
case MTC_ALARM_ID__BM: return (BM_ALARM_ID);
@ -348,7 +375,7 @@ string _getIdentity ( mtc_alarm_id_enum id )
case MTC_LOG_ID__COMMAND: return (COMMAND_LOG_ID);
case MTC_LOG_ID__STATECHANGE: return (STATECHANGE_LOG_ID);
case MTC_LOG_ID__CONFIG: return (CONFIG_LOG_ID);
default: return ("200.000");
default: return (SWERR_ALARM_ID);
}
}

View File

@ -2,7 +2,7 @@
#define __MTCALARM_H__
/*
* Copyright (c) 2015-2017 Wind River Systems, Inc.
* Copyright (c) 2015-2017, 2024 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -26,6 +26,7 @@ using namespace std;
typedef enum
{
MTC_ALARM_ID__LOCK,
MTC_ALARM_ID__MTCALIVE,
MTC_ALARM_ID__CONFIG,
MTC_ALARM_ID__ENABLE,
MTC_ALARM_ID__BM,

View File

@ -247,13 +247,20 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface )
rc = PASS ;
if ( msg.cmd == MTC_REQ_MTCALIVE )
{
ilog ("mtcAlive request received from %s network", iface_name_ptr);
alog1 ("mtcAlive request received from %s network", iface_name_ptr);
if ( interface == PXEBOOT_INTERFACE )
{
alog2 ("pxeboot mtcAlive buffer: %s", &msg.buf[0]);
load_pxebootInfo_msg(msg);
#ifdef WANT_FIT_TESTING
if ( ! daemon_want_fit ( FIT_CODE__FAIL_PXEBOOT_MTCALIVE ) )
#endif
{
send_mtcAlive_msg ( sock_ptr, ctrl_ptr->who_i_am, interface );
}
}
return ( send_mtcAlive_msg ( sock_ptr, get_who_i_am(), interface ));
return (rc);
}
else if ( msg.cmd == MTC_MSG_INFO )
{
@ -749,6 +756,7 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface )
int mtce_send_event ( mtc_socket_type * sock_ptr, unsigned int cmd , const char * mtce_name_ptr )
{
mtc_message_type event ;
ctrl_type *ctrl_ptr = get_ctrl_ptr();
int rc = PASS ;
int bytes = 0 ;
@ -772,13 +780,21 @@ int mtce_send_event ( mtc_socket_type * sock_ptr, unsigned int cmd , const char
event_info.append(MTC_JSON_SERVICE);
event_info.append("\":\"");
event_info.append(MTC_SERVICE_MTCCLIENT_NAME );
event_info.append("\"}");
event_info.append("\",\"active_controller_pxeboot_address\":\"");
event_info.append(ctrl_ptr->pxeboot_addr_active_controller);
event_info.append("\",\"");
event_info.append(MTC_JSON_FEATURES);
event_info.append("\":[\"");
event_info.append(MTC_PXEBOOT_MTCALIVE);
event_info.append("\"]}");
size_t len = event_info.length()+1 ;
snprintf ( &event.hdr[0], MSG_HEADER_SIZE, "%s", get_mtce_event_header());
snprintf ( &event.buf[0], len, "%s", event_info.data());
bytes = ((sizeof(mtc_message_type))-(BUF_SIZE-len));
ilog ("%s %s ready", get_hostname().c_str(), MTC_SERVICE_MTCCLIENT_NAME);
dlog ("%s %s ready", get_hostname().c_str(), MTC_SERVICE_MTCCLIENT_NAME);
}
else if (( cmd == MTC_EVENT_AVS_CLEAR ) ||
( cmd == MTC_EVENT_AVS_MAJOR ) ||
@ -849,10 +865,37 @@ int mtce_send_event ( mtc_socket_type * sock_ptr, unsigned int cmd , const char
}
else
{
elog ("cannot send to null or failed socket (%s)",
elog ("cannot send to null or failed management network socket (%s)",
get_interface_name_str (MGMNT_INTERFACE) );
rc = FAIL_SOCKET_SENDTO ;
}
// The only event also sent on the pxeboot network is:
//  - ready event
//
// This is a best effort send ; a failure is logged but does not
// change the return code set by the management network send.
if (( cmd == MTC_EVENT_MONITOR_READY ) &&
    ( sock_ptr->pxeboot_tx_socket > 0 ) &&
    ( !ctrl_ptr->pxeboot_addr_active_controller.empty()))
{
    int flags = 0 ; // no tx flags
    struct sockaddr_in hostAddr;
    memset(&hostAddr, 0, sizeof(hostAddr));
    print_mtc_message ( ctrl_ptr->pxeboot_addr_active_controller.data(), MTC_CMD_TX, event, get_interface_name_str(PXEBOOT_INTERFACE), false);
    hostAddr.sin_addr.s_addr = inet_addr(ctrl_ptr->pxeboot_addr_active_controller.data());
    hostAddr.sin_family = AF_INET;
    hostAddr.sin_port = htons(sock_ptr->mtc_tx_pxeboot_port);
    ssize_t bytes_sent = sendto(sock_ptr->pxeboot_tx_socket, &event.hdr[0], bytes, flags,
                               (const struct sockaddr*)&hostAddr, sizeof(hostAddr));
    if (bytes_sent <= 0)
    {
        // sin_port is stored in network byte order ; convert it back
        // to host order so the log shows the real port number.
        elog ("failed to send %s to %s:%d on %s network (rc:%ld) (%d:%m)",
               get_mtcNodeCommand_str(event.cmd),
               ctrl_ptr->pxeboot_addr_active_controller.c_str(),
               ntohs(hostAddr.sin_port),
               get_interface_name_str(PXEBOOT_INTERFACE),
               (long)bytes_sent, errno);
    }
}
return rc ;
}

View File

@ -443,6 +443,7 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr,
string service = "" ;
string sensor = "" ;
string process = "" ;
hostname = "unknown" ;
int rc1 = FAIL ;
@ -493,7 +494,17 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr,
}
else if ( service == MTC_SERVICE_MTCCLIENT_NAME )
{
ilog ("%s %s ready", hostname.c_str(), MTC_SERVICE_MTCCLIENT_NAME);
string features= "" ;
if ( jsonUtil_get_key_val(&msg.buf[0], MTC_JSON_FEATURES, features ) == PASS )
{
dlog ("%s %s features: %s", hostname.c_str(), service.c_str(), features.c_str());
}
else
{
ilog ("%s %s not offering feature list ; node may have upgrade pending",
hostname.c_str(), MTC_SERVICE_MTCCLIENT_NAME);
}
obj_ptr->declare_service_ready ( hostname, MTC_SERVICE_MTCCLIENT, features );
/* if this ready event is from the mtcClient of a
* controller that has valid bmc access info then
@ -697,7 +708,9 @@ int send_mtc_cmd ( string & hostname, int cmd , int interface, string json_dict
* controller's pxeboot ip addresses so it knows where to send. */
obj_ptr->pxebootInfo_loader();
data = "{\"pxebootInfo\":{" ;
data.append ("\"address\":\"");
data.append ("\"");
data.append (CONTROLLER);
data.append ("\":\"");
data.append (obj_ptr->my_pxeboot_ip);
data.append ("\",\"");
data.append (CONTROLLER_0);

View File

@ -337,6 +337,7 @@ int daemon_configure ( void )
rc = PASS ;
}
daemon_load_fit();
return (rc);
}
@ -1267,6 +1268,9 @@ int daemon_init ( string iface, string nodetype_str )
ctrl.clstr_iface_provisioned = false ;
ctrl.pxeboot_iface_provisioned = false ;
ctrl.peer_ctrlr_reset.sync = false ;
ctrl.pxeboot_addr_c0 = "" ;
ctrl.pxeboot_addr_c1 = "" ;
ctrl.pxeboot_addr_active_controller = "" ;
/* convert node type to integer */
ctrl.nodetype = get_host_function_mask ( nodetype_str ) ;
@ -1295,9 +1299,12 @@ int daemon_init ( string iface, string nodetype_str )
}
else
{
// Ready to do pxeboot messaging
ilog ("Mgmnt iface : %s", ctrl.mgmnt_iface.c_str());
// Not on LO, assume pxeboot provisioning starting with it being
// equal to the management interface, until otherwise updated due
// to bonding or vlan modes.
ctrl.pxeboot_iface = ctrl.mgmnt_iface ;
ilog ("Pxeboot iface %s", ctrl.pxeboot_iface.c_str());
ctrl.pxeboot_iface_provisioned = true ;
}
}
@ -1437,7 +1444,9 @@ void daemon_service_run ( void )
ctrl.peer_ctrlr_reset.audit_period );
}
/* Send the mtcClient ready event and clear the periodic event counter */
mtce_send_event ( sock_ptr, MTC_EVENT_MONITOR_READY, NULL );
ctrl.ready_event_counter = 0 ;
/* lets go select so that the sock does not go crazy */
dlog ("%s running main loop with %d msecs socket timeout\n",
@ -1646,7 +1655,7 @@ void daemon_service_run ( void )
}
}
}
if ( ctrl.timer.ring == true )
if ( mtcTimer_expired ( ctrl.timer ) )
{
bool socket_reinit = true ;
@ -1745,7 +1754,13 @@ void daemon_service_run ( void )
string who_i_am = _self_identify ( ctrl.nodetype_str );
}
alog1 ("sending mtcAlive on all provisioned mtcAlive networks");
send_mtcAlive_msg ( sock_ptr, ctrl.who_i_am, PXEBOOT_INTERFACE );
#ifdef WANT_FIT_TESTING
if ( ! daemon_want_fit ( FIT_CODE__FAIL_PXEBOOT_MTCALIVE ) )
#endif
{
send_mtcAlive_msg ( sock_ptr, ctrl.who_i_am, PXEBOOT_INTERFACE );
}
send_mtcAlive_msg ( sock_ptr, ctrl.who_i_am, MGMNT_INTERFACE );
if (( ctrl.clstr_iface_provisioned == true ) &&
( mtc_sock.mtc_client_clstr_rx_socket != NULL ) &&
@ -1801,6 +1816,22 @@ void daemon_service_run ( void )
_close_amon_sock ();
}
}
// Purpose: mtcClient ready event audit
//
// Send the ready event every minute just in case the first
// process startup event was missed by the mtcAgent or
// the mtcAgent was restarted.
//
// Needed to ensure that pxeboot mtcAlive message monitoring
// gets started over a mtcAgent process restart.
if ( ++ctrl.ready_event_counter >= (MTC_MINS_1/MTC_ALIVE_TIMER) )
{
dlog ("sending mtcClient ready event");
mtce_send_event ( sock_ptr, MTC_EVENT_MONITOR_READY, NULL );
ctrl.ready_event_counter = 0 ;
}
}
/* service controller specific audits */
@ -2388,8 +2419,9 @@ void load_mtcInfo_msg ( mtc_message_type & msg )
* Address can be empty for an unprovisioned controller.
*
* { "pxebootInfo":{
* "controller-0":"169.254.202.2",
* "controller-1":"169.254.202.3"
* "controller" : "169.254.202.2",
* "controller-0" : "169.254.202.2",
* "controller-1" : "169.254.202.3"
* }
* }
*
@ -2398,18 +2430,43 @@ void load_mtcInfo_msg ( mtc_message_type & msg )
***************************************************************************/
void load_pxebootInfo_msg ( mtc_message_type & msg )
{
struct json_object *_obj = json_tokener_parse( &msg.buf[0] );
if ( _obj )
struct json_object *json_obj = json_tokener_parse( &msg.buf[0] );
if ( json_obj )
{
const char dict_label [] = "pxebootInfo" ;
struct json_object *info_obj = (struct json_object *)(NULL);
json_bool json_rc = json_object_object_get_ex( _obj,
json_bool json_rc = json_object_object_get_ex( json_obj,
&dict_label[0],
&info_obj );
if ( ( json_rc == true ) && ( info_obj ) )
{
jlog ("%s: %s ", &dict_label[0], json_object_get_string(info_obj));
struct json_object *ctrl_obj = (struct json_object *)(NULL);
json_rc = json_object_object_get_ex( info_obj, CONTROLLER, &ctrl_obj );
if (( json_rc == true ) && ( ctrl_obj ))
{
string active_controller = json_object_get_string(ctrl_obj);
if ( ctrl.pxeboot_addr_active_controller != active_controller )
{
string prefix = "controller pxeboot address" ;
if ( ctrl.pxeboot_addr_active_controller.empty() )
{
ilog ("%s: %s",
prefix.c_str(),
active_controller.c_str());
}
else
{
ilog ("%s: %s ; was %s",
prefix.c_str(),
active_controller.c_str(),
ctrl.pxeboot_addr_active_controller.c_str());
}
ctrl.pxeboot_addr_active_controller = active_controller ;
}
}
// now get the individual controller addresses
string pxeboot_addr_cx[CONTROLLERS] = {CONTROLLER_0, CONTROLLER_1};
for (int c = 0 ; c < CONTROLLERS ; c++)
{
@ -2423,8 +2480,7 @@ void load_pxebootInfo_msg ( mtc_message_type & msg )
// get the current pxeboot address for the in loop controller
cur_pxeboot_addr = (controller == CONTROLLER_0) ? ctrl.pxeboot_addr_c0 : ctrl.pxeboot_addr_c1;
json_bool json_rc =
json_object_object_get_ex( info_obj, controller.data(), &ctrl_obj );
json_rc = json_object_object_get_ex( info_obj, controller.data(), &ctrl_obj );
if (( json_rc == true ) && (ctrl_obj))
{
jlog ("controller-x obj data: %s", json_object_get_string(ctrl_obj));
@ -2477,7 +2533,7 @@ void load_pxebootInfo_msg ( mtc_message_type & msg )
elog("Failed to parse '%s' from mtcAlive request message: %s",
&dict_label[0], &msg.buf[0]);
}
json_object_put(_obj);
json_object_put(json_obj);
}
else
{

View File

@ -102,6 +102,7 @@ typedef struct
string pxeboot_addr ;
string pxeboot_addr_c0 ;
string pxeboot_addr_c1 ;
string pxeboot_addr_active_controller ;
// Assume address is learned to start even though it's likely not.
// This enabled the first not learned log followed by a learned
@ -147,6 +148,10 @@ typedef struct
string mtcAgent_ip ;
peer_ctrlr_reset_type peer_ctrlr_reset;
/* throttles sending the periodic mtcClient ready event. */
int ready_event_counter ;
} ctrl_type ;
ctrl_type * get_ctrl_ptr ( void );

View File

@ -90,14 +90,15 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
* with mtcAlive debouncing
*/
nodeLinkClass::online_handler ( node_ptr );
/*
* Always run the mtcAlive handler.
*
/* pxeboot_mtcAlive_monitor
* - monitor host's mtcAlive messaging
* - manage host's mtcAlive missing alarm
*/
nodeLinkClass::pxeboot_mtcAlive_monitor ( node_ptr );
*
* Don't monitor pxeboot mtcAlive messaging while the pxeboot network is
* not provisioned or the node has not yet reported that it supports
* pxeboot mtcAlive messaging */
if ( this->pxeboot_network_provisioned && node_ptr->pxeboot_mtcAlive_supported )
nodeLinkClass::pxeboot_mtcAlive_monitor ( node_ptr );
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__DELETE )
{
@ -106,7 +107,6 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
return (PASS);
}
/* Run the config FSM if the configAction bool is set.
* We keep this as a separate action unto itself so that
* mtce can continue to service all other actions for the

View File

@ -3433,15 +3433,20 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr )
plog ("%s offline (external)\n", node_ptr->hostname.c_str());
node_ptr->offlineStage = MTC_OFFLINE__IDLE ;
}
else if ( !node_ptr->mtcAlive_mgmnt && !node_ptr->mtcAlive_clstr )
else if ( !node_ptr->mtcAlive_mgmnt && !node_ptr->mtcAlive_clstr && !node_ptr->mtcAlive_pxeboot )
{
if ( ++node_ptr->offline_search_count > offline_threshold )
{
node_ptr->mtcAlive_online = false ;
node_ptr->mtcClient_ready = false ;
// Clear all the mtcAlive_sequence numbers
// Clear all the mtcAlive counts and sequence numbers
node_ptr->mtcAlive_mgmnt_count = 0 ;
node_ptr->mtcAlive_clstr_count = 0 ;
node_ptr->mtcAlive_pxeboot_count = 0 ;
for (int i = 0 ; i < MTCALIVE_INTERFACES_MAX ; i++)
node_ptr->mtcAlive_sequence[i] = 0;
node_ptr->mtcAlive_sequence[i] = 0;
plog ("%s going offline ; (threshold (%d msec * %d)\n",
node_ptr->hostname.c_str(),
@ -3485,13 +3490,15 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr )
**/
node_ptr->mtcAlive_online = true ;
ilog ("%s still seeing mtcAlive (%d) (Mgmt:%c:%d Clstr:%c:%d) ; restart offline_search_count=%d of %d\n",
ilog ("%s still seeing mtcAlive (%d) (Mgmt:%c:%d Clstr:%c:%d Pxeboot:%c:%d) ; restart offline_search_count=%d of %d\n",
node_ptr->hostname.c_str(),
node_ptr->mtcAlive_count,
node_ptr->mtcAlive_mgmnt ? 'Y' : 'n',
node_ptr->mtcAlive_mgmnt_count,
node_ptr->mtcAlive_clstr ? 'Y' : 'n',
node_ptr->mtcAlive_clstr_count,
node_ptr->mtcAlive_pxeboot ? 'Y' : 'n',
node_ptr->mtcAlive_pxeboot_count,
node_ptr->offline_search_count,
offline_threshold );
node_ptr->offline_search_count = 0 ; /* reset the search count */
@ -6261,6 +6268,8 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__ENABLE);
EFmAlarmSeverityT config_alarm_severity =
mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__CONFIG);
EFmAlarmSeverityT mtcAlive_alarm_severity =
mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__MTCALIVE);
/* Clear generic enable alarm over process restart.
* Will get reasserted if the cause condition still exists */
@ -6284,6 +6293,21 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
alarmUtil_getSev_str(config_alarm_severity).c_str());
}
/* The mtcAlive alarm is maintained if it exists.
* The pxeboot_mtcAlive_monitor will clear the alarm
* if it exists and the pxeboot mtcAlive messaging works. */
if ( mtcAlive_alarm_severity != FM_ALARM_SEVERITY_CLEAR )
{
node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] = mtcAlive_alarm_severity ;
ilog ("%s found mtcAlive alarm ; loaded %s",
node_ptr->hostname.c_str(),
alarmUtil_getSev_str(mtcAlive_alarm_severity).c_str());
// Load up the miss and loss counts used for recovery
node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE] = PXEBOOT_MTCALIVE_LOSS_ALARM_THRESHOLD ;
node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE] = PXEBOOT_MTCALIVE_LOSS_THRESHOLD ;
}
if ( is_controller(node_ptr) )
{
this->controllers++ ;
@ -7571,29 +7595,46 @@ int nodeLinkClass::oos_test_handler ( struct nodeLinkClass::node * node_ptr )
// Returns : PASS
//
///////////////////////////////////////////////////////////////////////////////
#define PXEBOOT_MTCALIVE_MONITOR_RATE_SECS (10)
#define PXEBOOT_MTCALIVE_LOSS_THRESHOLD (6)
#define PXEBOOT_MTCALIVE_NOT_SEEN_LOG_THROTTLE (6)
#define PXEBOOT_MTCALIVE_LOSS_LOG_THROTTLE (6)
int nodeLinkClass::pxeboot_mtcAlive_monitor ( struct nodeLinkClass::node * node_ptr )
{
// ERIK: TODO: Comment out once verified
flog ("%s pxeboot mtcAlive fsm stage: %s",
node_ptr->hostname.c_str(),
get_mtcAliveStages_str(node_ptr->mtcAliveStage).c_str());
if ( !this->pxeboot_network_provisioned ) return PASS ;
// Don't monitor pxeboot mtcAlive messaging while the node is
// locked or in the following administrative action states.
if (( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ) ||
( node_ptr->adminAction == MTC_ADMIN_ACTION__UNLOCK ) ||
( node_ptr->adminAction == MTC_ADMIN_ACTION__ENABLE ) ||
( node_ptr->adminAction == MTC_ADMIN_ACTION__RECOVER ) ||
( node_ptr->adminAction == MTC_ADMIN_ACTION__POWERCYCLE ))
{
// Clear the alarm if the node is locked
if (( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ) &&
( node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] != FM_ALARM_SEVERITY_CLEAR ))
alarm_mtcAlive_clear (node_ptr, PXEBOOT_INTERFACE);
// Switch to START if not already there
if ( node_ptr->mtcAliveStage != MTC_MTCALIVE__START )
mtcAliveStageChange (node_ptr, MTC_MTCALIVE__START);
return PASS ;
}
switch (node_ptr->mtcAliveStage)
{
// Starts from scratch. Clears timer and counts but not alarm.
case MTC_MTCALIVE__START:
{
alog2 ("%s mtcAlive start", node_ptr->hostname.c_str());
mtcTimer_reset ( node_ptr->mtcAlive_timer );
if ( ! mtcTimer_expired (node_ptr->mtcAlive_timer) )
mtcTimer_reset (node_ptr->mtcAlive_timer);
node_ptr->mtcAlive_sequence[PXEBOOT_INTERFACE] = 0 ;
node_ptr->mtcAlive_sequence_save[PXEBOOT_INTERFACE] = 0 ;
mtcAliveStageChange (node_ptr, MTC_MTCALIVE__SEND);
break ;
return PASS ;
}
// Reloads the controller's pxeboot info and sends it with a mtcAlive request
// telling the remote node to send mtcAlive to the active controller.
case MTC_MTCALIVE__SEND:
{
/* pxeboot info refresh audit */
@ -7601,25 +7642,31 @@ int nodeLinkClass::pxeboot_mtcAlive_monitor ( struct nodeLinkClass::node * node_
pxebootInfo_loader ();
alog2 ("%s mtcAlive send", node_ptr->hostname.c_str());
send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MTCALIVE, PXEBOOT_INTERFACE );
node_ptr->mtcAlive_sequence_save[PXEBOOT_INTERFACE] = 0 ;
node_ptr->mtcAlive_sequence_miss[PXEBOOT_INTERFACE] = 0 ;
mtcAliveStageChange (node_ptr, MTC_MTCALIVE__MONITOR);
break ;
return PASS ;
}
// Start the Wait timer 2x longer than the expected mtcAlive cadence
case MTC_MTCALIVE__MONITOR:
{
alog2 ("%s mtcAlive monitor", node_ptr->hostname.c_str());
mtcTimer_start ( node_ptr->mtcAlive_timer, mtcTimer_handler,
PXEBOOT_MTCALIVE_MONITOR_RATE_SECS );
mtcAliveStageChange (node_ptr, MTC_MTCALIVE__WAIT);
break ;
return PASS ;
}
// Wait for the timer to expire
case MTC_MTCALIVE__WAIT:
{
if ( mtcTimer_expired ( node_ptr->mtcAlive_timer ) )
mtcAliveStageChange (node_ptr, MTC_MTCALIVE__CHECK);
break ;
return PASS ;
}
// Check the mtcAlive sequence numbers and handle each possible case
// success - mtcAlive sequence number is greater than the last one - may clear alarm
// out-of-sequence - mtcAlive sequence number is less than the last one - may assert alarm
// miss - mtcAlive sequence number is equal to the last one - count misses
// loss - mtcAlive messaging miss count exceeded threshold - assert alarm
// not seen - waiting for first mtcAlive following reboot - request mtcAlive
case MTC_MTCALIVE__CHECK:
{
if ( node_ptr->mtcAlive_sequence[PXEBOOT_INTERFACE] > node_ptr->mtcAlive_sequence_save[PXEBOOT_INTERFACE] )
@ -7632,70 +7679,132 @@ int nodeLinkClass::pxeboot_mtcAlive_monitor ( struct nodeLinkClass::node * node_
node_ptr->mtcAlive_sequence_save[PXEBOOT_INTERFACE]);
// Now that we received a message we can dec the missed count
if ( node_ptr->mtcAlive_sequence_miss[PXEBOOT_INTERFACE] )
node_ptr->mtcAlive_sequence_miss[PXEBOOT_INTERFACE]-- ;
node_ptr->pxeboot_mtcAlive_not_seen_log_throttle = 0 ;
node_ptr->pxeboot_mtcAlive_loss_log_throttle = 0 ;
// and clear the alarm if it exists
if ( node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE] )
{
// Set miss count to max if we have reached at least one loss but no alarm yet
if (( node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] == FM_ALARM_SEVERITY_CLEAR ) &&
( node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE] ))
{
node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE] = PXEBOOT_MTCALIVE_LOSS_THRESHOLD ;
node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE] = 0 ;
}
ilog ("%s pxeboot mtcAlive miss count %d ; decrement %s; recovery",
node_ptr->hostname.c_str(),
node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE],
node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] ? "; alarm clear when 0 " : "");
node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE]-- ;
}
else
{
// Clear alarm and start with a clean loss slate. The miss count is already zero.
node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE] = 0 ;
alarm_mtcAlive_clear ( node_ptr, PXEBOOT_INTERFACE );
}
// Clear the log throttles now that we have received a message
if ( node_ptr->pxeboot_mtcAlive_not_seen_log_throttle || node_ptr->pxeboot_mtcAlive_loss_log_throttle )
{
node_ptr->pxeboot_mtcAlive_not_seen_log_throttle = 0 ;
node_ptr->pxeboot_mtcAlive_loss_log_throttle = 0 ;
}
mtcAliveStageChange (node_ptr, MTC_MTCALIVE__MONITOR);
}
else if ( node_ptr->mtcAlive_sequence[PXEBOOT_INTERFACE] < node_ptr->mtcAlive_sequence_save[PXEBOOT_INTERFACE] )
{
// unexpected case
wlog ("%s mtcAlive out-of-sequence ; this:%d last:%d",
// mtcClient restart case
if ( ++node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE] < PXEBOOT_MTCALIVE_LOSS_THRESHOLD )
{
// The mtcClient on this host may have been restarted
mtcAliveStageChange (node_ptr, MTC_MTCALIVE__SEND);
}
else
mtcAliveStageChange (node_ptr, MTC_MTCALIVE__FAIL);
wlog ("%s pxeboot mtcAlive miss count %d ; loss count %d ; out-of-sequence ; this:%d last:%d",
node_ptr->hostname.c_str(),
node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE],
node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE],
node_ptr->mtcAlive_sequence[PXEBOOT_INTERFACE],
node_ptr->mtcAlive_sequence_save[PXEBOOT_INTERFACE]);
node_ptr->mtcAlive_sequence_miss[PXEBOOT_INTERFACE]++ ;
mtcAliveStageChange (node_ptr, MTC_MTCALIVE__START);
}
else if ( ++node_ptr->mtcAlive_sequence_miss[PXEBOOT_INTERFACE] < PXEBOOT_MTCALIVE_LOSS_THRESHOLD )
else if ( ++node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE] < PXEBOOT_MTCALIVE_LOSS_THRESHOLD )
{
// Missing pxeboot mtcAlive
alog ("%s pxeboot mtcAlive miss count %d ; sending request",
wlog ("%s pxeboot mtcAlive miss count %d ; loss count %d ; sending request",
node_ptr->hostname.c_str(),
node_ptr->mtcAlive_sequence_miss[PXEBOOT_INTERFACE]);
send_mtc_cmd ( node_ptr->hostname, MTC_REQ_MTCALIVE, PXEBOOT_INTERFACE );
mtcAliveStageChange (node_ptr, MTC_MTCALIVE__MONITOR);
}
else if ( node_ptr->mtcAlive_pxeboot == true )
{
wlog_throttled (node_ptr->pxeboot_mtcAlive_loss_log_throttle,
PXEBOOT_MTCALIVE_LOSS_LOG_THROTTLE,
"%s pxeboot mtcAlive loss ; missed: %d ; last: count:%d seq: %d ; sending request",
node_ptr->hostname.c_str(),
node_ptr->mtcAlive_sequence_miss[PXEBOOT_INTERFACE],
node_ptr->mtcAlive_pxeboot_count,
node_ptr->mtcAlive_sequence_save[PXEBOOT_INTERFACE]);
node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE],
node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE]);
// The mtcClient on this host may have been restarted
mtcAliveStageChange (node_ptr, MTC_MTCALIVE__SEND);
}
else
{
ilog_throttled (node_ptr->pxeboot_mtcAlive_not_seen_log_throttle,
PXEBOOT_MTCALIVE_NOT_SEEN_LOG_THROTTLE,
"%s pxeboot mtcAlive not seen yet ; sending request",
node_ptr->hostname.c_str());
mtcAliveStageChange (node_ptr, MTC_MTCALIVE__SEND);
if ( node_ptr->mtcAlive_pxeboot == true )
{
// If we get here it's a loss
wlog_throttled (node_ptr->pxeboot_mtcAlive_loss_log_throttle,
PXEBOOT_MTCALIVE_LOSS_LOG_THROTTLE,
"%s pxeboot mtcAlive lost ; missed: %d ; last: count:%d seq: %d ; sending request",
node_ptr->hostname.c_str(),
node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE],
node_ptr->mtcAlive_pxeboot_count,
node_ptr->mtcAlive_sequence_save[PXEBOOT_INTERFACE]);
}
else
{
// Otherwise still searching beyond threshold for the first mtcAlive after reboot or graceful recovery
ilog_throttled (node_ptr->pxeboot_mtcAlive_not_seen_log_throttle,
PXEBOOT_MTCALIVE_NOT_SEEN_LOG_THROTTLE,
"%s pxeboot mtcAlive not seen yet ; sending request",
node_ptr->hostname.c_str());
}
mtcAliveStageChange (node_ptr, MTC_MTCALIVE__FAIL);
}
// Prevent the miss count from being larger than the loss, and therefore the alarm clear recovery, threshold.
if (node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE] > PXEBOOT_MTCALIVE_LOSS_THRESHOLD)
node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE] = PXEBOOT_MTCALIVE_LOSS_THRESHOLD;
node_ptr->mtcAlive_sequence_save[PXEBOOT_INTERFACE] = node_ptr->mtcAlive_sequence[PXEBOOT_INTERFACE] ;
// TODO (emacdona): Need to handle loss case that manages raising the alarm
// Transition to MTC_MTCALIVE__FAIL
break ;
}
case MTC_MTCALIVE__FAIL:
{
wlog ("%s mtcAlive fail", node_ptr->hostname.c_str());
mtcAliveStageChange (node_ptr, MTC_MTCALIVE__START);
break ;
}
default:
{
slog ("%s mtcAlive fsm default", node_ptr->hostname.c_str());
alog2 ("%s mtcAlive fail", node_ptr->hostname.c_str());
if ( node_ptr->alarms[MTC_ALARM_ID__MTCALIVE] == FM_ALARM_SEVERITY_CLEAR )
{
if ( ++node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE] < PXEBOOT_MTCALIVE_LOSS_ALARM_THRESHOLD )
{
wlog ("%s pxeboot mtcAlive lost ; %d more loss before alarm assertion",
node_ptr->hostname.c_str(),
PXEBOOT_MTCALIVE_LOSS_ALARM_THRESHOLD - node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE] );
// Start the misses counter over again after each loss debounce
node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE] = 0 ;
}
else
{
ilog ("%s pxeboot mtcAlive alarm assert (%d)",
node_ptr->hostname.c_str(),
node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE]);
alarm_mtcAlive_failure ( node_ptr, PXEBOOT_INTERFACE );
}
}
mtcAliveStageChange (node_ptr, MTC_MTCALIVE__START);
break ;
}
}
if ( node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE] || node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE] )
{
alog2 ("%s pxeboot mtcAlive: Miss: %d of %d , Loss: %d of %d",
node_ptr->hostname.c_str(),
node_ptr->mtcAlive_miss_count[PXEBOOT_INTERFACE], PXEBOOT_MTCALIVE_LOSS_THRESHOLD,
node_ptr->mtcAlive_loss_count[PXEBOOT_INTERFACE], PXEBOOT_MTCALIVE_LOSS_ALARM_THRESHOLD);
}
return (PASS);
}