Add in-service test to clear stale config failure alarm

A configuration failure alarm can get stuck asserted if the
alarmed node experiences an uncontrolled reboot and then
recovers without a configuration failure.

This update adds an in-service test that audits host health
while a configuration failure alarm is raised and clears that
alarm if the failure condition goes away. The failure can go
away when an in-service manifest runs and corrects the
configuration, or when the node reboots and comes back up in
a healthy (properly configured) state.
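
A rough sketch of that decision (illustration only ; the
wrapper function name is hypothetical, while the flag, mask
and helper names are taken from the insv_test_handler changes
below):

    /* Sketch only ; insv_config_alarm_check is a hypothetical name */
    void insv_config_alarm_check ( struct nodeLinkClass::node * node_ptr )
    {
        if ( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY )
        {
            /* new or persisting config failure ; thresholded handling
             * degrades the active controller and fails other hosts */
            alarm_config_failure ( node_ptr );
        }
        else if (( node_ptr->degrade_mask & DEGRADE_MASK_CONFIG ) &&
                 ( node_ptr->mtce_flags & MTC_FLAG__I_AM_HEALTHY ))
        {
            /* alarmed config failure condition is gone ; clear it */
            alarm_config_clear ( node_ptr );
        }
    }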

Also fixed a bug where the config alarm severity state was
being cleared when a heartbeat clear event was received.
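
The fix gates what gets cleared on the existing per-service
flags in manage_heartbeat_clear ; a sketch of the management
interface case (see the diff below):

    if ( heartbeat )   /* heartbeat service: clear heartbeat alarm state */
        node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
    if ( maintenance ) /* maintenance service: clear heartbeat degrade   */
        node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;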

This update goes a step further and introduces an alarm
state audit that detects and corrects maintenance alarm
state mismatches.
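
The audit's per-alarm correction rule, roughly (sketch only ;
all names appear in the mtcAlarm_audit code below ; internal
maintenance state takes precedence over what FM reports):

    EFmAlarmSeverityT fm_sev = mtcAlarm_state ( node_ptr->hostname, id );
    if ( fm_sev != node_ptr->alarms[id] )
    {
        if ( node_ptr->alarms[id] == FM_ALARM_SEVERITY_CLEAR )
            mtcAlarm_clear ( node_ptr->hostname, id ); /* stale in FM */
        else
            mtcAlarm_raise ( node_ptr->hostname, id,
                             node_ptr->alarms[id] );   /* missing in FM */
    }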

Test Plan:

PASS: Verify the add handler loads config alarm state
PASS: Verify in-service test clears stale config alarm
PASS: Verify in-service test acts on new config failure
      ... degrade - active controller
      ... fail    - other hosts
PASS: Verify audit fixes mtce alarm state mismatches
PASS: Verify audit handles fm not running case
PASS: Verify audit handling behavior with valid alarm cases
PASS: Verify locked alarm management over process restart
PASS: Verify audit only logs active alarms list changes
PASS: Verify audit runs for both locked/unlocked nodes
PASS: Verify update as a patch

Regression:

PASS: Verify enable sequence config failure handling
PASS: ... active controller     - recoverable degrade
PASS: ... other nodes           - threshold fail
PASS: ... auto recovery disable - config failure
PASS: Verify mtcAgent process logging
PASS: Verify heartbeat handling and alarming
PASS: Verify Standard system install
PASS: Verify AIO system install

Change-Id: If9957229810435e9faeb08374f2b5fbcb5b0f826
Closes-Bug: 1918195
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2021-03-26 13:05:51 -04:00
parent 5c83453fdf
commit 031818e55b
6 changed files with 265 additions and 87 deletions

View File

@ -660,7 +660,7 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
{
ptr->alarms[id] = FM_ALARM_SEVERITY_CLEAR ;
}
ptr->alarms_loaded = false ;
ptr->active_alarms = "" ; /* no active alarms */
ptr->cfgEvent.base = NULL ;
ptr->sysinvEvent.base= NULL ;
@ -778,6 +778,7 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
return ptr ;
}
struct nodeLinkClass::node* nodeLinkClass::getNode ( string hostname )
{
/* check for empty list condition */
@ -5088,6 +5089,15 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
}
}
/****************************************************************************
*
* Name : manage_heartbeat_clear
*
* Description: Manage clearing heartbeat failure status
*
* Assumptions : Called by both hbsAgent and mtcAgent
*
***************************************************************************/
void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface )
{
nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
@ -5103,13 +5113,17 @@ void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface )
node_ptr->heartbeat_failed[i] = false ;
if ( i == MGMNT_IFACE )
{
node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
if ( heartbeat )
node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
if ( maintenance )
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
}
if ( i == CLSTR_IFACE )
{
node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ;
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ;
if ( heartbeat )
node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ;
if ( maintenance )
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ;
}
}
}
@ -5118,13 +5132,17 @@ void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface )
node_ptr->heartbeat_failed[iface] = false ;
if ( iface == MGMNT_IFACE )
{
node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
if ( heartbeat )
node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
if ( maintenance )
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
}
else if ( iface == CLSTR_IFACE )
{
node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ;
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ;
if ( heartbeat )
node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ;
if ( maintenance )
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ;
}
}
}
@ -9068,21 +9086,21 @@ void nodeLinkClass::mem_log_mtcalive ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tmtcAlive: online:%c offline:%c Cnt:%d Gate:%s Misses:%d\n",
node_ptr->hostname.c_str(),
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tmtcAlive: online:%c offline:%c Cnt:%d Gate:%s Misses:%d\n",
node_ptr->hostname.c_str(),
node_ptr->mtcAlive_online ? 'Y' : 'N',
node_ptr->mtcAlive_offline ? 'Y' : 'N',
node_ptr->mtcAlive_count,
node_ptr->mtcAlive_gate ? "closed" : "open",
node_ptr->mtcAlive_misses);
node_ptr->mtcAlive_misses);
mem_log (str);
}
void nodeLinkClass::mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAlarm List:%s%s%s%s%s%s\n",
node_ptr->hostname.c_str(),
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAlarm List:%s%s%s%s%s%s\n",
node_ptr->hostname.c_str(),
node_ptr->alarms[MTC_ALARM_ID__LOCK ] ? " Locked" : " .",
node_ptr->alarms[MTC_ALARM_ID__CONFIG ] ? " Config" : " .",
node_ptr->alarms[MTC_ALARM_ID__ENABLE ] ? " Enable" : " .",
@ -9092,6 +9110,18 @@ void nodeLinkClass::mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr )
mem_log (str);
}
void nodeLinkClass::mem_log_alarm2 ( struct nodeLinkClass::node * node_ptr )
{
if ( ! node_ptr->active_alarms.empty() )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tActive Alarms:%s\n",
node_ptr->hostname.c_str(),
node_ptr->active_alarms.c_str());
mem_log (str);
}
}
void nodeLinkClass::mem_log_stage ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
@ -9142,8 +9172,8 @@ void nodeLinkClass::mem_log_network ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s %s cluster_host_ip: %s Uptime: %u\n",
node_ptr->hostname.c_str(),
node_ptr->mac.c_str(),
node_ptr->hostname.c_str(),
node_ptr->mac.c_str(),
node_ptr->ip.c_str(),
node_ptr->clstr_ip.c_str(),
node_ptr->uptime );
@ -9155,11 +9185,11 @@ void nodeLinkClass::mem_log_heartbeat ( struct nodeLinkClass::node * node_ptr )
char str[MAX_MEM_LOG_DATA] ;
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
{
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s Minor:%s Degrade:%s Failed:%s Monitor:%s\n",
node_ptr->hostname.c_str(),
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s Minor:%s Degrade:%s Failed:%s Monitor:%s\n",
node_ptr->hostname.c_str(),
get_iface_name_str (iface),
node_ptr->hbs_minor[iface] ? "true " : "false",
node_ptr->hbs_degrade[iface] ? "true " : "false",
node_ptr->hbs_minor[iface] ? "true " : "false",
node_ptr->hbs_degrade[iface] ? "true " : "false",
node_ptr->hbs_failure[iface] ? "true " : "false",
node_ptr->monitor[iface] ? "YES" : "no" );
mem_log (str);
@ -9188,8 +9218,8 @@ void nodeLinkClass::mem_log_hbs_cnts ( struct nodeLinkClass::node * node_ptr )
void nodeLinkClass::mem_log_test_info ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tOOS Stage:%s Runs:%d - INSV Stage:%s Runs:%d\n",
node_ptr->hostname.c_str(),
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tOOS Stage:%s Runs:%d - INSV Stage:%s Runs:%d\n",
node_ptr->hostname.c_str(),
get_oosTestStages_str(node_ptr->oosTestStage).c_str(),
node_ptr->oos_test_count,
get_insvTestStages_str(node_ptr->insvTestStage).c_str(),
@ -9261,6 +9291,7 @@ void nodeLinkClass::memDumpNodeState ( string hostname )
// mem_log_reset_info ( node_ptr );
mem_log_power_info ( node_ptr );
mem_log_alarm1 ( node_ptr );
mem_log_alarm2 ( node_ptr );
mem_log_mtcalive ( node_ptr );
mem_log_stage ( node_ptr );
mem_log_bm ( node_ptr );

View File

@ -652,12 +652,12 @@ private:
/** @} private_monitoring_services_variables */
/* List of alarms and current severity */
#define MAX_ALARMS (10)
/* List of alarms current severity */
EFmAlarmSeverityT alarms[MAX_ALARMS];
/* tracks whether the alarms for this host have been loaded already or not */
bool alarms_loaded ;
/* string containing active alarms and their severity
* ... for logging purposes only */
string active_alarms ;
/** true if this host has recovered before the mnfa timeout period.
* This bool flags the graceful recovery handler that this node
@ -665,8 +665,6 @@ private:
* and uptime accordingly */
bool mnfa_graceful_recovery ;
int stress_iteration ;
/* BMC Protocol Learning Controls and State */
/* specifies what BMC protocol is selected for this host
@ -843,6 +841,9 @@ private:
/* server specific power state query handler */
bool (*is_poweron_handler) (string hostname, string query_response );
/* Audit that monitors and auto corrects alarm state mismatches */
void mtcAlarm_audit ( struct nodeLinkClass::node * node_ptr );
/* Calculate the overall reset progression timeout */
int calc_reset_prog_timeout ( struct nodeLinkClass::node * node_ptr, int retries );
@ -1304,6 +1305,7 @@ private:
void mem_log_state1 ( struct nodeLinkClass::node * node_ptr );
void mem_log_state2 ( struct nodeLinkClass::node * node_ptr );
void mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr );
void mem_log_alarm2 ( struct nodeLinkClass::node * node_ptr );
void mem_log_mtcalive ( struct nodeLinkClass::node * node_ptr );
void mem_log_stage ( struct nodeLinkClass::node * node_ptr );
void mem_log_test_info ( struct nodeLinkClass::node * node_ptr );

View File

@ -26,6 +26,7 @@ using namespace std;
#include "daemon_common.h" /* */
#include "nodeBase.h" /* */
#include "nodeClass.h" /* */
#include "nodeTimers.h" /* */
#include "nodeUtil.h" /* */
#include "mtcAlarm.h" /* for ... this module header */
@ -379,8 +380,169 @@ void mtcAlarm_clear_all ( string hostname )
}
}
/****************************************************************************
*
* Name : mtcAlarm_audit
*
* Purpose : Monitor and Auto-Correct maintenance alarms
*
* Description: Query locked state alarm (raw)
* if successful
* - Query alarms
* - compare to running state
* - correct mismatches ; internal state takes precedence
* - log all alarm state changes
*
****************************************************************************/
void nodeLinkClass::mtcAlarm_audit ( struct nodeLinkClass::node * node_ptr )
{
/*
* Read locked state alarm directly to detect fm access failures.
* If successful, further reads are done using a wrapper utility.
*/
SFmAlarmDataT alarm_query ;
AlarmFilter alarm_filter ;
EFmErrorT rc ;
memset(&alarm_query, 0, sizeof(alarm_query));
memset(&alarm_filter, 0, sizeof(alarm_filter));
snprintf ( &alarm_filter.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s",
LOCK_ALARM_ID);
snprintf ( &alarm_filter.entity_instance_id[0], FM_MAX_BUFFER_LENGTH, "%s%s",
ENTITY_PREFIX, node_ptr->hostname.data());
rc = fm_get_fault ( &alarm_filter, &alarm_query );
if (( rc != FM_ERR_OK ) && ( rc != FM_ERR_ENTITY_NOT_FOUND ))
{
wlog("%s alarm query failure ; code:%d",
node_ptr->hostname.c_str(),
rc );
return ;
}
/* With FM comms proven working, let's check the other mtc alarms */
string active_alarms = "";
for ( int i = 0 ; i < MAX_ALARMS ; i++ )
{
mtc_alarm_id_enum id = (mtc_alarm_id_enum)i ;
if ( id == MTC_ALARM_ID__LOCK )
{
/* Unexpected severity case */
if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
{
if ( alarm_query.severity != FM_ALARM_SEVERITY_WARNING )
{
node_ptr->alarms[id] = FM_ALARM_SEVERITY_WARNING ;
wlog("%s %s alarm mismatch ; %s -> %s",
node_ptr->hostname.c_str(),
_getIdentity(id).c_str(),
alarmUtil_getSev_str(alarm_query.severity).c_str(),
alarmUtil_getSev_str(node_ptr->alarms[id]).c_str());
mtcAlarm_warning ( node_ptr->hostname, MTC_ALARM_ID__LOCK );
}
if (!active_alarms.empty())
active_alarms.append(", ");
active_alarms.append(_getIdentity(id) + ":");
active_alarms.append(alarmUtil_getSev_str(node_ptr->alarms[id]));
}
/* Unexpected assertion case */
else if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( alarm_query.severity != FM_ALARM_SEVERITY_CLEAR ))
{
node_ptr->alarms[id] = FM_ALARM_SEVERITY_CLEAR ;
wlog("%s %s alarm mismatch ; %s -> %s",
node_ptr->hostname.c_str(),
_getIdentity(id).c_str(),
alarmUtil_getSev_str(alarm_query.severity).c_str(),
alarmUtil_getSev_str(node_ptr->alarms[id]).c_str());
mtcAlarm_clear ( node_ptr->hostname, id );
}
}
else if (( id == MTC_ALARM_ID__CONFIG ) ||
( id == MTC_ALARM_ID__ENABLE ) ||
( id == MTC_ALARM_ID__BM ) ||
( id == MTC_ALARM_ID__CH_CONT) ||
( id == MTC_ALARM_ID__CH_COMP))
{
EFmAlarmSeverityT severity = mtcAlarm_state ( node_ptr->hostname, id);
if ( severity != node_ptr->alarms[id] )
{
ilog ("%s %s alarm mismatch ; %s -> %s",
node_ptr->hostname.c_str(),
_getIdentity(id).c_str(),
alarmUtil_getSev_str(severity).c_str(),
alarmUtil_getSev_str(node_ptr->alarms[id]).c_str());
if ( node_ptr->alarms[id] == FM_ALARM_SEVERITY_CLEAR )
{
mtcAlarm_clear ( node_ptr->hostname, id );
}
else
{
mtcAlarm_raise ( node_ptr->hostname, id, node_ptr->alarms[id] );
}
}
if ( node_ptr->alarms[id] != FM_ALARM_SEVERITY_CLEAR )
{
if (!active_alarms.empty())
active_alarms.append(", ");
active_alarms.append(_getIdentity(id) + ":");
active_alarms.append(alarmUtil_getSev_str(node_ptr->alarms[id]));
}
}
/* else don't care about other alarm ids ; logs, events, etc */
}
/* manage logging of active alarms */
if ( !active_alarms.empty() )
{
if ( node_ptr->active_alarms != active_alarms )
{
ilog ("%s active alarms: %s",
node_ptr->hostname.c_str(),
active_alarms.c_str());
node_ptr->active_alarms = active_alarms ;
}
/* else
* do nothing because there are active alarms
* that have not changed since the last audit.
*/
}
else if ( ! node_ptr->active_alarms.empty() )
{
/* clear the active alarm list since there 'were' active alarms
* but none are active any longer */
node_ptr->active_alarms.clear();
ilog ("%s no active alarms", node_ptr->hostname.c_str());
}
/* else
* no active alarms ; don't log */
}
/************************* A L A R M I N G **************************/
/* Raise the specified maintenance alarm severity */
int mtcAlarm_raise ( string hostname, mtc_alarm_id_enum id, EFmAlarmSeverityT severity )
{
switch ( severity )
{
case FM_ALARM_SEVERITY_MINOR:
return (mtcAlarm_minor(hostname,id));
case FM_ALARM_SEVERITY_MAJOR:
return (mtcAlarm_major(hostname,id));
case FM_ALARM_SEVERITY_CRITICAL:
return (mtcAlarm_critical(hostname,id));
default:
return (FAIL_BAD_PARM);
}
}
/* Clear the specified hosts's maintenance alarm */
int mtcAlarm_clear ( string hostname, mtc_alarm_id_enum id )
{

View File

@ -95,6 +95,9 @@ string mtcAlarm_getId_str ( mtc_alarm_id_enum id );
/** Clear the specified maintenance alarm for specific host */
int mtcAlarm_clear ( string hostname, mtc_alarm_id_enum id );
/** Raise specified severity level alarm for the specified host */
int mtcAlarm_raise ( string hostname, mtc_alarm_id_enum id, EFmAlarmSeverityT severity );
/** Assert a specified mtce alarm against the specified host with a WARNING severity level */
int mtcAlarm_warning ( string hostname, mtc_alarm_id_enum id );

View File

@ -1187,15 +1187,6 @@ int _self_provision ( void )
if ( my_identity.name == record_info.name )
{
/* If the active controller was 'locked' and is being auto-corrected
* to 'unlocked' then ensure that there is no locked alarm set for it */
if ( record_info.admin != "locked" )
{
mtcAlarm_clear ( my_identity.name, MTC_ALARM_ID__LOCK );
/* this is not required because its already inited to clear */
// node_ptr->alarms[MTC_ALARM_ID__LOCK] = FM_ALARM_SEVERITY_CLEAR
}
if ( my_identity.mac != record_info.mac )
{
wlog ("%s mac address mismatch (%s - %s)\n",

View File

@ -6107,48 +6107,32 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
mtcInvApi_update_state ( node_ptr, "availability", "available" );
}
/* handle other cases */
EFmAlarmSeverityT sev = mtcAlarm_state ( node_ptr->hostname,
MTC_ALARM_ID__ENABLE);
/* Query FM for existing Enable and Config alarm status */
EFmAlarmSeverityT enable_alarm_severity =
mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__ENABLE);
EFmAlarmSeverityT config_alarm_severity =
mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__CONFIG);
if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
/* Clear generic enable alarm over process restart.
* Will get reasserted if the cause condition still exists */
if ( enable_alarm_severity != FM_ALARM_SEVERITY_CLEAR )
{
node_ptr->alarms[MTC_ALARM_ID__LOCK] = FM_ALARM_SEVERITY_WARNING ;
/* If the node is locked then the Enable alarm
* should not be present */
if ( sev != FM_ALARM_SEVERITY_CLEAR )
{
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
sev = FM_ALARM_SEVERITY_CLEAR ;
}
ilog ("%s found enable alarm ; clearing %s",
node_ptr->hostname.c_str(),
alarmUtil_getSev_str(enable_alarm_severity).c_str());
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
}
/* Manage enable alarm over process restart.
*
* - clear the alarm in the active controller case
* - maintain the alarm, set degrade state in MAJOR and CRIT cases
* - clear alarm for all other severities.
*/
if ( THIS_HOST )
/* The config alarm is maintained if it exists.
* The in-service test handler will clear the alarm
* if the config failure is gone */
if ( config_alarm_severity != FM_ALARM_SEVERITY_CLEAR )
{
if ( sev != FM_ALARM_SEVERITY_CLEAR )
{
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
}
}
else
{
if (( sev == FM_ALARM_SEVERITY_CRITICAL ) ||
( sev == FM_ALARM_SEVERITY_MAJOR ))
{
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = sev ;
node_ptr->degrade_mask |= DEGRADE_MASK_ENABLE ;
}
else if ( sev != FM_ALARM_SEVERITY_CLEAR )
{
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
}
node_ptr->degrade_mask |= DEGRADE_MASK_CONFIG ;
node_ptr->alarms[MTC_ALARM_ID__CONFIG] = config_alarm_severity ;
ilog ("%s found config alarm ; loaded %s",
node_ptr->hostname.c_str(),
alarmUtil_getSev_str(config_alarm_severity).c_str());
}
if ( is_controller(node_ptr) )
@ -6188,7 +6172,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
{
ilog ("%s %s\n",node_ptr->hostname.c_str(), MTC_TASK_SWACT_COMPLETE );
/* Work Around for issue: */
mtcInvApi_update_uptime ( node_ptr, node_ptr->uptime );
mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_COMPLETE );
@ -6222,7 +6205,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
mtcSmgrApi_request ( node_ptr, state , SWACT_FAIL_THRESHOLD );
}
}
if ( daemon_get_cfg_ptr()->debug_level & 1 )
nodeLinkClass::host_print (node_ptr);
@ -6357,6 +6339,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
}
case MTC_ADD__WORKQUEUE_WAIT:
{
rc = workQueue_done ( node_ptr );
if ( rc == RETRY )
{
@ -6444,6 +6427,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
}
node_ptr->addStage = MTC_ADD__START;
plog ("%s Host Add Completed (uptime:%d)\n", node_ptr->hostname.c_str(), node_ptr->uptime );
node_ptr->add_completed = true ;
break ;
@ -7202,6 +7186,9 @@ int nodeLinkClass::oos_test_handler ( struct nodeLinkClass::node * node_ptr )
}
}
/* audit alarms */
mtcAlarm_audit (node_ptr );
break ;
}
case MTC_OOS_TEST__WAIT:
@ -7600,7 +7587,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
}
}
/* Monitor the health of the host - no pass file */
/* Monitor the health of the host */
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
(( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
@ -7626,6 +7613,11 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
ilog ("%s sm degrade clear\n", node_ptr->hostname.c_str());
}
/*
* In-service Config Failure/Alarm handling
*/
/* Detect new config failure condition */
if ( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY)
{
/* not healthy .... */
@ -7637,16 +7629,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
{
wlog_throttled ( node_ptr->health_threshold_counter, (MTC_UNHEALTHY_THRESHOLD*10), "%s is UNHEALTHY\n", node_ptr->hostname.c_str());
if ( node_ptr->health_threshold_counter >= MTC_UNHEALTHY_THRESHOLD )
{
node_ptr->degrade_mask |= DEGRADE_MASK_CONFIG ;
/* threshold is reached so raise the config alarm if it is not already raised */
if ( node_ptr->alarms[MTC_ALARM_ID__CONFIG] != FM_ALARM_SEVERITY_CRITICAL )
{
mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__CONFIG );
node_ptr->alarms[MTC_ALARM_ID__CONFIG] = FM_ALARM_SEVERITY_CRITICAL ;
}
}
alarm_config_failure ( node_ptr );
}
}
else
@ -7666,6 +7649,12 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
}
}
}
/* or correct an alarmed config failure that has cleared */
else if ( node_ptr->degrade_mask & DEGRADE_MASK_CONFIG )
{
if ( node_ptr->mtce_flags & MTC_FLAG__I_AM_HEALTHY )
alarm_config_clear ( node_ptr );
}
else
{
node_ptr->health_threshold_counter = 0 ;