Add in-service test to clear stale config failure alarm
A configuration failure alarm can get stuck asserted if that node experiences an uncontrolled reboot that recovers without a configuration failure. This update adds an in-service test that audits host health while there is a configuration failure alarm raised and clears that alarm if the failure condition goes away. This could be a result of an in-service manifest that runs and corrects the configuration or if the node reboots and comes back up in a healthy (properly configured) state. Fixed a bug that was clearing the config alarm severity state when a heartbeat clear event is received. This update also goes a step further and introduces an alarm state audit that detects and corrects maintenance alarm state mismatches. Test Plan: PASS: Verify the add handler loads config alarm state PASS: Verify in-service test clears stale config alarm PASS: Verify in-service test acts on new config failure ... degrade - active controller ... fail - other hosts PASS: Verify audit fixes mtce alarm state mismatches PASS: Verify audit handles fm not running case PASS: Verify audit handling behavior with valid alarm cases PASS: Verify locked alarm management over process restart PASS: Verify audit only logs active alarms list changes PASS: Verify audit runs for both locked/unlocked nodes PASS: Verify update as a patch Regression: PASS: Verify enable sequence config failure handling PASS: ... active controller - recoverable degrade PASS: ... other nodes - threshold fail PASS: ... auto recovery disable - config failure PASS: Verify mtcAgent process logging PASS: Verify heartbeat handling and alarming PASS: Verify Standard system install PASS: Verify AIO system install Change-Id: If9957229810435e9faeb08374f2b5fbcb5b0f826 Closes-Bug: 1918195 Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
parent
5c83453fdf
commit
031818e55b
|
@ -660,7 +660,7 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
|
|||
{
|
||||
ptr->alarms[id] = FM_ALARM_SEVERITY_CLEAR ;
|
||||
}
|
||||
ptr->alarms_loaded = false ;
|
||||
ptr->active_alarms = "" ; /* no active alarms */
|
||||
|
||||
ptr->cfgEvent.base = NULL ;
|
||||
ptr->sysinvEvent.base= NULL ;
|
||||
|
@ -778,6 +778,7 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
|
|||
return ptr ;
|
||||
}
|
||||
|
||||
|
||||
struct nodeLinkClass::node* nodeLinkClass::getNode ( string hostname )
|
||||
{
|
||||
/* check for empty list condition */
|
||||
|
@ -5088,6 +5089,15 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
|
|||
}
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : manage_heartbeat_clear
|
||||
*
|
||||
* Description: Manage clearing heartbeat failure status
|
||||
*
|
||||
* Assumptions: Called by both hbsAgent and mtcAgent
|
||||
*
|
||||
***************************************************************************/
|
||||
void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface )
|
||||
{
|
||||
nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
|
||||
|
@ -5103,12 +5113,16 @@ void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface )
|
|||
node_ptr->heartbeat_failed[i] = false ;
|
||||
if ( i == MGMNT_IFACE )
|
||||
{
|
||||
if ( heartbeat )
|
||||
node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
|
||||
if ( maintenance )
|
||||
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
|
||||
}
|
||||
if ( i == CLSTR_IFACE )
|
||||
{
|
||||
if ( heartbeat )
|
||||
node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ;
|
||||
if ( maintenance )
|
||||
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ;
|
||||
}
|
||||
}
|
||||
|
@ -5118,12 +5132,16 @@ void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface )
|
|||
node_ptr->heartbeat_failed[iface] = false ;
|
||||
if ( iface == MGMNT_IFACE )
|
||||
{
|
||||
if ( heartbeat )
|
||||
node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
|
||||
if ( maintenance )
|
||||
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
|
||||
}
|
||||
else if ( iface == CLSTR_IFACE )
|
||||
{
|
||||
if ( heartbeat )
|
||||
node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ;
|
||||
if ( maintenance )
|
||||
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ;
|
||||
}
|
||||
}
|
||||
|
@ -9092,6 +9110,18 @@ void nodeLinkClass::mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr )
|
|||
mem_log (str);
|
||||
}
|
||||
|
||||
/* Memory-log the host's active alarm list.
 * A host whose active_alarms string is empty produces no log entry. */
void nodeLinkClass::mem_log_alarm2 ( struct nodeLinkClass::node * node_ptr )
{
    /* nothing to report when the active alarm list is empty */
    if ( node_ptr->active_alarms.empty() )
        return ;

    char buffer[MAX_MEM_LOG_DATA] ;
    snprintf ( buffer, sizeof(buffer), "%s\tActive Alarms:%s\n",
               node_ptr->hostname.c_str(),
               node_ptr->active_alarms.c_str());
    mem_log (buffer);
}
|
||||
|
||||
void nodeLinkClass::mem_log_stage ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
char str[MAX_MEM_LOG_DATA] ;
|
||||
|
@ -9261,6 +9291,7 @@ void nodeLinkClass::memDumpNodeState ( string hostname )
|
|||
// mem_log_reset_info ( node_ptr );
|
||||
mem_log_power_info ( node_ptr );
|
||||
mem_log_alarm1 ( node_ptr );
|
||||
mem_log_alarm2 ( node_ptr );
|
||||
mem_log_mtcalive ( node_ptr );
|
||||
mem_log_stage ( node_ptr );
|
||||
mem_log_bm ( node_ptr );
|
||||
|
|
|
@ -652,12 +652,12 @@ private:
|
|||
|
||||
/** @} private_monitoring_services_variables */
|
||||
|
||||
/* List of alarms and current severity */
|
||||
#define MAX_ALARMS (10)
|
||||
/* List of alarms current severity */
|
||||
EFmAlarmSeverityT alarms[MAX_ALARMS];
|
||||
|
||||
/* tracks whether the alarms for this host have been loaded already or not */
|
||||
bool alarms_loaded ;
|
||||
/* string containing active alarms and their severity
|
||||
* ... for logging purposes only */
|
||||
string active_alarms ;
|
||||
|
||||
/** true if this host has recovered before the mnfa timeout period.
|
||||
* This bool flags the graceful recovery handler that this node
|
||||
|
@ -665,8 +665,6 @@ private:
|
|||
* and uptime accordingly */
|
||||
bool mnfa_graceful_recovery ;
|
||||
|
||||
int stress_iteration ;
|
||||
|
||||
/* BMC Protocol Learning Controls and State */
|
||||
|
||||
/* specifies what BMC protocol is selected for this host
|
||||
|
@ -843,6 +841,9 @@ private:
|
|||
/* server specific power state query handler */
|
||||
bool (*is_poweron_handler) (string hostname, string query_response );
|
||||
|
||||
/* Audit that monitors and auto corrects alarm state mismatches */
|
||||
void mtcAlarm_audit ( struct nodeLinkClass::node * node_ptr );
|
||||
|
||||
/* Calculate the overall reset progression timeout */
|
||||
int calc_reset_prog_timeout ( struct nodeLinkClass::node * node_ptr, int retries );
|
||||
|
||||
|
@ -1304,6 +1305,7 @@ private:
|
|||
void mem_log_state1 ( struct nodeLinkClass::node * node_ptr );
|
||||
void mem_log_state2 ( struct nodeLinkClass::node * node_ptr );
|
||||
void mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr );
|
||||
void mem_log_alarm2 ( struct nodeLinkClass::node * node_ptr );
|
||||
void mem_log_mtcalive ( struct nodeLinkClass::node * node_ptr );
|
||||
void mem_log_stage ( struct nodeLinkClass::node * node_ptr );
|
||||
void mem_log_test_info ( struct nodeLinkClass::node * node_ptr );
|
||||
|
|
|
@ -26,6 +26,7 @@ using namespace std;
|
|||
#include "daemon_common.h" /* */
|
||||
|
||||
#include "nodeBase.h" /* */
|
||||
#include "nodeClass.h" /* */
|
||||
#include "nodeTimers.h" /* */
|
||||
#include "nodeUtil.h" /* */
|
||||
#include "mtcAlarm.h" /* for ... this module header */
|
||||
|
@ -379,8 +380,169 @@ void mtcAlarm_clear_all ( string hostname )
|
|||
}
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : mtcAlarm_audit
|
||||
*
|
||||
* Purpose : Monitor and Auto-Correct maintenance alarms
|
||||
*
|
||||
* Description: Query locked state alarm (raw)
|
||||
* if successful
|
||||
* - Query alarms
|
||||
* - compare to running state
|
||||
* - correct mismatches ; internal state takes precidence
|
||||
* - log all alarm state changes
|
||||
*
|
||||
****************************************************************************/
|
||||
|
||||
void nodeLinkClass::mtcAlarm_audit ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
/*
|
||||
* Read locked state alarm directly to detect fm access failures.
|
||||
* If successful further reads are done using a wrapper utility.
|
||||
*/
|
||||
SFmAlarmDataT alarm_query ;
|
||||
AlarmFilter alarm_filter ;
|
||||
EFmErrorT rc ;
|
||||
|
||||
memset(&alarm_query, 0, sizeof(alarm_query));
|
||||
memset(&alarm_filter, 0, sizeof(alarm_filter));
|
||||
snprintf ( &alarm_filter.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s",
|
||||
LOCK_ALARM_ID);
|
||||
snprintf ( &alarm_filter.entity_instance_id[0], FM_MAX_BUFFER_LENGTH, "%s%s",
|
||||
ENTITY_PREFIX, node_ptr->hostname.data());
|
||||
rc = fm_get_fault ( &alarm_filter, &alarm_query );
|
||||
if (( rc != FM_ERR_OK ) && ( rc != FM_ERR_ENTITY_NOT_FOUND ))
|
||||
{
|
||||
wlog("%s alarm query failure ; code:%d",
|
||||
node_ptr->hostname.c_str(),
|
||||
rc );
|
||||
return ;
|
||||
}
|
||||
|
||||
/* With FM comms proven working lets check the other mtc alarms */
|
||||
string active_alarms = "";
|
||||
for ( int i = 0 ; i < MAX_ALARMS ; i++ )
|
||||
{
|
||||
mtc_alarm_id_enum id = (mtc_alarm_id_enum)i ;
|
||||
if ( id == MTC_ALARM_ID__LOCK )
|
||||
{
|
||||
/* Unexpected severity case */
|
||||
if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
|
||||
{
|
||||
if ( alarm_query.severity != FM_ALARM_SEVERITY_WARNING )
|
||||
{
|
||||
node_ptr->alarms[id] = FM_ALARM_SEVERITY_WARNING ;
|
||||
|
||||
wlog("%s %s alarm mismatch ; %s -> %s",
|
||||
node_ptr->hostname.c_str(),
|
||||
_getIdentity(id).c_str(),
|
||||
alarmUtil_getSev_str(alarm_query.severity).c_str(),
|
||||
alarmUtil_getSev_str(node_ptr->alarms[id]).c_str());
|
||||
|
||||
mtcAlarm_warning ( node_ptr->hostname, MTC_ALARM_ID__LOCK );
|
||||
|
||||
}
|
||||
if (!active_alarms.empty())
|
||||
active_alarms.append(", ");
|
||||
active_alarms.append(_getIdentity(id) + ":");
|
||||
active_alarms.append(alarmUtil_getSev_str(node_ptr->alarms[id]));
|
||||
}
|
||||
/* Unexpected assertion case */
|
||||
else if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
||||
( alarm_query.severity != FM_ALARM_SEVERITY_CLEAR ))
|
||||
{
|
||||
node_ptr->alarms[id] = FM_ALARM_SEVERITY_CLEAR ;
|
||||
|
||||
wlog("%s %s alarm mismatch ; %s -> %s",
|
||||
node_ptr->hostname.c_str(),
|
||||
_getIdentity(id).c_str(),
|
||||
alarmUtil_getSev_str(alarm_query.severity).c_str(),
|
||||
alarmUtil_getSev_str(node_ptr->alarms[id]).c_str());
|
||||
|
||||
mtcAlarm_clear ( node_ptr->hostname, id );
|
||||
}
|
||||
}
|
||||
else if (( id == MTC_ALARM_ID__CONFIG ) ||
|
||||
( id == MTC_ALARM_ID__ENABLE ) ||
|
||||
( id == MTC_ALARM_ID__BM ) ||
|
||||
( id == MTC_ALARM_ID__CH_CONT) ||
|
||||
( id == MTC_ALARM_ID__CH_COMP))
|
||||
{
|
||||
EFmAlarmSeverityT severity = mtcAlarm_state ( node_ptr->hostname, id);
|
||||
if ( severity != node_ptr->alarms[id] )
|
||||
{
|
||||
ilog ("%s %s alarm mismatch ; %s -> %s",
|
||||
node_ptr->hostname.c_str(),
|
||||
_getIdentity(id).c_str(),
|
||||
alarmUtil_getSev_str(severity).c_str(),
|
||||
alarmUtil_getSev_str(node_ptr->alarms[id]).c_str());
|
||||
|
||||
if ( node_ptr->alarms[id] == FM_ALARM_SEVERITY_CLEAR )
|
||||
{
|
||||
mtcAlarm_clear ( node_ptr->hostname, id );
|
||||
}
|
||||
else
|
||||
{
|
||||
mtcAlarm_raise ( node_ptr->hostname, id, node_ptr->alarms[id] );
|
||||
}
|
||||
}
|
||||
if ( node_ptr->alarms[id] != FM_ALARM_SEVERITY_CLEAR )
|
||||
{
|
||||
if (!active_alarms.empty())
|
||||
active_alarms.append(", ");
|
||||
active_alarms.append(_getIdentity(id) + ":");
|
||||
active_alarms.append(alarmUtil_getSev_str(node_ptr->alarms[id]));
|
||||
}
|
||||
}
|
||||
/* else don't care about other alarm ids ; logs events etc */
|
||||
}
|
||||
|
||||
/* manage logging of active alarms */
|
||||
if ( !active_alarms.empty() )
|
||||
{
|
||||
if ( node_ptr->active_alarms != active_alarms )
|
||||
{
|
||||
ilog ("%s active alarms: %s",
|
||||
node_ptr->hostname.c_str(),
|
||||
active_alarms.c_str());
|
||||
|
||||
node_ptr->active_alarms = active_alarms ;
|
||||
}
|
||||
/* else
|
||||
* do nothing because there are active alarms
|
||||
* that have not changed since the last audit.
|
||||
*/
|
||||
}
|
||||
else if ( ! node_ptr->active_alarms.empty() )
|
||||
{
|
||||
/* clear active alarm list since there 'were' active alarms
|
||||
* but there are no longer active alarms */
|
||||
node_ptr->active_alarms.clear();
|
||||
ilog ("%s no active alarms", node_ptr->hostname.c_str());
|
||||
}
|
||||
/* else
|
||||
* no active alarms ; don't log */
|
||||
}
|
||||
|
||||
/************************* A L A R M I N G **************************/
|
||||
|
||||
/* Raise the specified maintenance alarm severity */
|
||||
int mtcAlarm_raise ( string hostname, mtc_alarm_id_enum id, EFmAlarmSeverityT severity )
|
||||
{
|
||||
switch ( severity )
|
||||
{
|
||||
case FM_ALARM_SEVERITY_MINOR:
|
||||
return (mtcAlarm_minor(hostname,id));
|
||||
case FM_ALARM_SEVERITY_MAJOR:
|
||||
return (mtcAlarm_major(hostname,id));
|
||||
case FM_ALARM_SEVERITY_CRITICAL:
|
||||
return (mtcAlarm_critical(hostname,id));
|
||||
default:
|
||||
return (FAIL_BAD_PARM);
|
||||
}
|
||||
}
|
||||
|
||||
/* Clear the specified host's maintenance alarm */
|
||||
int mtcAlarm_clear ( string hostname, mtc_alarm_id_enum id )
|
||||
{
|
||||
|
|
|
@ -95,6 +95,9 @@ string mtcAlarm_getId_str ( mtc_alarm_id_enum id );
|
|||
/** Clear the specified maintenance alarm for specific host */
|
||||
int mtcAlarm_clear ( string hostname, mtc_alarm_id_enum id );
|
||||
|
||||
/** Raise specified severity level alarm for the specified host */
|
||||
int mtcAlarm_raise ( string hostname, mtc_alarm_id_enum id, EFmAlarmSeverityT severity );
|
||||
|
||||
/** Assert a specified mtce alarm against the specified host with a WARNING severity level */
|
||||
int mtcAlarm_warning ( string hostname, mtc_alarm_id_enum id );
|
||||
|
||||
|
|
|
@ -1187,15 +1187,6 @@ int _self_provision ( void )
|
|||
|
||||
if ( my_identity.name == record_info.name )
|
||||
{
|
||||
/* If the active controller was 'locked' and is being auto-corrected
|
||||
* to 'unlocked' then ensure that there is no locked alarm set for it */
|
||||
if ( record_info.admin != "locked" )
|
||||
{
|
||||
mtcAlarm_clear ( my_identity.name, MTC_ALARM_ID__LOCK );
|
||||
/* this is not required because its already inited to clear */
|
||||
// node_ptr->alarms[MTC_ALARM_ID__LOCK] = FM_ALARM_SEVERITY_CLEAR
|
||||
}
|
||||
|
||||
if ( my_identity.mac != record_info.mac )
|
||||
{
|
||||
wlog ("%s mac address mismatch (%s - %s)\n",
|
||||
|
|
|
@ -6107,48 +6107,32 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
mtcInvApi_update_state ( node_ptr, "availability", "available" );
|
||||
}
|
||||
|
||||
/* handle other cases */
|
||||
EFmAlarmSeverityT sev = mtcAlarm_state ( node_ptr->hostname,
|
||||
MTC_ALARM_ID__ENABLE);
|
||||
/* Query FM for existing Enable and Config alarm status */
|
||||
EFmAlarmSeverityT enable_alarm_severity =
|
||||
mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__ENABLE);
|
||||
EFmAlarmSeverityT config_alarm_severity =
|
||||
mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__CONFIG);
|
||||
|
||||
if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
|
||||
{
|
||||
node_ptr->alarms[MTC_ALARM_ID__LOCK] = FM_ALARM_SEVERITY_WARNING ;
|
||||
|
||||
/* If the node is locked then the Enable alarm
|
||||
* should not be present */
|
||||
if ( sev != FM_ALARM_SEVERITY_CLEAR )
|
||||
{
|
||||
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
||||
sev = FM_ALARM_SEVERITY_CLEAR ;
|
||||
}
|
||||
}
|
||||
|
||||
/* Manage enable alarm over process restart.
|
||||
*
|
||||
* - clear the alarm in the active controller case
|
||||
* - maintain the alarm, set degrade state in MAJOR and CRIT cases
|
||||
* - clear alarm for all other severities.
|
||||
*/
|
||||
if ( THIS_HOST )
|
||||
{
|
||||
if ( sev != FM_ALARM_SEVERITY_CLEAR )
|
||||
/* Clear generic enable alarm over process restart.
|
||||
* Will get reasserted if the cause condition still exists */
|
||||
if ( enable_alarm_severity != FM_ALARM_SEVERITY_CLEAR )
|
||||
{
|
||||
ilog ("%s found enable alarm ; clearing %s",
|
||||
node_ptr->hostname.c_str(),
|
||||
alarmUtil_getSev_str(enable_alarm_severity).c_str());
|
||||
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
||||
}
|
||||
}
|
||||
else
|
||||
|
||||
/* The config alarm is maintained if it exists.
|
||||
* The in-service test handler will clear the alarm
|
||||
* if the config failure is gone */
|
||||
if ( config_alarm_severity != FM_ALARM_SEVERITY_CLEAR )
|
||||
{
|
||||
if (( sev == FM_ALARM_SEVERITY_CRITICAL ) ||
|
||||
( sev == FM_ALARM_SEVERITY_MAJOR ))
|
||||
{
|
||||
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = sev ;
|
||||
node_ptr->degrade_mask |= DEGRADE_MASK_ENABLE ;
|
||||
}
|
||||
else if ( sev != FM_ALARM_SEVERITY_CLEAR )
|
||||
{
|
||||
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
||||
}
|
||||
node_ptr->degrade_mask |= DEGRADE_MASK_CONFIG ;
|
||||
node_ptr->alarms[MTC_ALARM_ID__CONFIG] = config_alarm_severity ;
|
||||
ilog ("%s found config alarm ; loaded %s",
|
||||
node_ptr->hostname.c_str(),
|
||||
alarmUtil_getSev_str(config_alarm_severity).c_str());
|
||||
}
|
||||
|
||||
if ( is_controller(node_ptr) )
|
||||
|
@ -6188,7 +6172,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
{
|
||||
ilog ("%s %s\n",node_ptr->hostname.c_str(), MTC_TASK_SWACT_COMPLETE );
|
||||
|
||||
/* Work Around for issue: */
|
||||
mtcInvApi_update_uptime ( node_ptr, node_ptr->uptime );
|
||||
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_COMPLETE );
|
||||
|
@ -6222,7 +6205,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
mtcSmgrApi_request ( node_ptr, state , SWACT_FAIL_THRESHOLD );
|
||||
}
|
||||
}
|
||||
|
||||
if ( daemon_get_cfg_ptr()->debug_level & 1 )
|
||||
nodeLinkClass::host_print (node_ptr);
|
||||
|
||||
|
@ -6357,6 +6339,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
}
|
||||
case MTC_ADD__WORKQUEUE_WAIT:
|
||||
{
|
||||
|
||||
rc = workQueue_done ( node_ptr );
|
||||
if ( rc == RETRY )
|
||||
{
|
||||
|
@ -6444,6 +6427,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
}
|
||||
|
||||
node_ptr->addStage = MTC_ADD__START;
|
||||
|
||||
plog ("%s Host Add Completed (uptime:%d)\n", node_ptr->hostname.c_str(), node_ptr->uptime );
|
||||
node_ptr->add_completed = true ;
|
||||
break ;
|
||||
|
@ -7202,6 +7186,9 @@ int nodeLinkClass::oos_test_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
}
|
||||
}
|
||||
|
||||
/* audit alarms */
|
||||
mtcAlarm_audit (node_ptr );
|
||||
|
||||
break ;
|
||||
}
|
||||
case MTC_OOS_TEST__WAIT:
|
||||
|
@ -7600,7 +7587,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
}
|
||||
}
|
||||
|
||||
/* Monitor the health of the host - no pass file */
|
||||
/* Monitor the health of the host */
|
||||
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
||||
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
|
||||
(( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
|
||||
|
@ -7626,6 +7613,11 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
ilog ("%s sm degrade clear\n", node_ptr->hostname.c_str());
|
||||
}
|
||||
|
||||
/*
|
||||
* In-service Config Failure/Alarm handling
|
||||
*/
|
||||
|
||||
/* Detect new config failure condition */
|
||||
if ( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY)
|
||||
{
|
||||
/* not healthy .... */
|
||||
|
@ -7637,16 +7629,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
{
|
||||
wlog_throttled ( node_ptr->health_threshold_counter, (MTC_UNHEALTHY_THRESHOLD*10), "%s is UNHEALTHY\n", node_ptr->hostname.c_str());
|
||||
if ( node_ptr->health_threshold_counter >= MTC_UNHEALTHY_THRESHOLD )
|
||||
{
|
||||
node_ptr->degrade_mask |= DEGRADE_MASK_CONFIG ;
|
||||
|
||||
/* threshold is reached so raise the config alarm if it is not already raised */
|
||||
if ( node_ptr->alarms[MTC_ALARM_ID__CONFIG] != FM_ALARM_SEVERITY_CRITICAL )
|
||||
{
|
||||
mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__CONFIG );
|
||||
node_ptr->alarms[MTC_ALARM_ID__CONFIG] = FM_ALARM_SEVERITY_CRITICAL ;
|
||||
}
|
||||
}
|
||||
alarm_config_failure ( node_ptr );
|
||||
}
|
||||
}
|
||||
else
|
||||
|
@ -7666,6 +7649,12 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
}
|
||||
}
|
||||
}
|
||||
/* or correct an alarmed config failure that has cleared */
|
||||
else if ( node_ptr->degrade_mask & DEGRADE_MASK_CONFIG )
|
||||
{
|
||||
if ( node_ptr->mtce_flags & MTC_FLAG__I_AM_HEALTHY )
|
||||
alarm_config_clear ( node_ptr );
|
||||
}
|
||||
else
|
||||
{
|
||||
node_ptr->health_threshold_counter = 0 ;
|
||||
|
|
Loading…
Reference in New Issue