Add in-service test to clear stale config failure alarm

A configuration failure alarm can get stuck asserted if the
alarmed node experiences an uncontrolled reboot and then
recovers without a configuration failure.

This update adds an in-service test that audits host health
while a configuration failure alarm is raised and clears that
alarm if the failure condition goes away. The failure can go
away when an in-service manifest runs and corrects the
configuration, or when the node reboots and comes back up in
a healthy (properly configured) state.
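
A rough sketch of that decision (illustration only ; the
wrapper function name is hypothetical, while the flag, mask
and helper names are taken from the insv_test_handler changes
below):

    /* Sketch only ; insv_config_alarm_check is a hypothetical name */
    void insv_config_alarm_check ( struct nodeLinkClass::node * node_ptr )
    {
        if ( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY )
        {
            /* new or persisting config failure ; thresholded handling
             * degrades the active controller and fails other hosts */
            alarm_config_failure ( node_ptr );
        }
        else if (( node_ptr->degrade_mask & DEGRADE_MASK_CONFIG ) &&
                 ( node_ptr->mtce_flags & MTC_FLAG__I_AM_HEALTHY ))
        {
            /* alarmed config failure condition is gone ; clear it */
            alarm_config_clear ( node_ptr );
        }
    }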

Also fixed a bug where the config alarm severity state was
being cleared when a heartbeat clear event was received.
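
The fix gates what gets cleared on the existing per-service
flags in manage_heartbeat_clear ; a sketch of the management
interface case (see the diff below):

    if ( heartbeat )   /* heartbeat service: clear heartbeat alarm state */
        node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
    if ( maintenance ) /* maintenance service: clear heartbeat degrade   */
        node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;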

This update goes a step further and introduces an alarm
state audit that detects and corrects maintenance alarm
state mismatches.
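
The audit's per-alarm correction rule, roughly (sketch only ;
all names appear in the mtcAlarm_audit code below ; internal
maintenance state takes precedence over what FM reports):

    EFmAlarmSeverityT fm_sev = mtcAlarm_state ( node_ptr->hostname, id );
    if ( fm_sev != node_ptr->alarms[id] )
    {
        if ( node_ptr->alarms[id] == FM_ALARM_SEVERITY_CLEAR )
            mtcAlarm_clear ( node_ptr->hostname, id ); /* stale in FM */
        else
            mtcAlarm_raise ( node_ptr->hostname, id,
                             node_ptr->alarms[id] );   /* missing in FM */
    }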

Test Plan:

PASS: Verify the add handler loads config alarm state
PASS: Verify in-service test clears stale config alarm
PASS: Verify in-service test acts on new config failure
      ... degrade - active controller
      ... fail    - other hosts
PASS: Verify audit fixes mtce alarm state mismatches
PASS: Verify audit handles fm not running case
PASS: Verify audit handling behavior with valid alarm cases
PASS: Verify locked alarm management over process restart
PASS: Verify audit only logs active alarms list changes
PASS: Verify audit runs for both locked/unlocked nodes
PASS: Verify update as a patch

Regression:

PASS: Verify enable sequence config failure handling
PASS: ... active controller     - recoverable degrade
PASS: ... other nodes           - threshold fail
PASS: ... auto recovery disable - config failure
PASS: Verify mtcAgent process logging
PASS: Verify heartbeat handling and alarming
PASS: Verify Standard system install
PASS: Verify AIO system install

Change-Id: If9957229810435e9faeb08374f2b5fbcb5b0f826
Closes-Bug: 1918195
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2021-03-26 13:05:51 -04:00
parent 5c83453fdf
commit 031818e55b
6 changed files with 265 additions and 87 deletions

View File

@ -660,7 +660,7 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
{
ptr->alarms[id] = FM_ALARM_SEVERITY_CLEAR ;
}
ptr->alarms_loaded = false ;
ptr->active_alarms = "" ; /* no active alarms */
ptr->cfgEvent.base = NULL ;
ptr->sysinvEvent.base= NULL ;
@ -778,6 +778,7 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
return ptr ;
}
struct nodeLinkClass::node* nodeLinkClass::getNode ( string hostname )
{
/* check for empty list condition */
@ -5088,6 +5089,15 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
}
}
/****************************************************************************
*
* Name : manage_heartbeat_clear
*
* Description: Manage clearing heartbeat failure status
*
* Assumptions : Called by both hbsAgent and mtcAgent
*
***************************************************************************/
void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface )
{
nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
@ -5103,13 +5113,17 @@ void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface )
node_ptr->heartbeat_failed[i] = false ;
if ( i == MGMNT_IFACE )
{
node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
if ( heartbeat )
node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
if ( maintenance )
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
}
if ( i == CLSTR_IFACE )
{
node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ;
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ;
if ( heartbeat )
node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ;
if ( maintenance )
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ;
}
}
}
@ -5118,13 +5132,17 @@ void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface )
node_ptr->heartbeat_failed[iface] = false ;
if ( iface == MGMNT_IFACE )
{
node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
if ( heartbeat )
node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
if ( maintenance )
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
}
else if ( iface == CLSTR_IFACE )
{
node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ;
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ;
if ( heartbeat )
node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ;
if ( maintenance )
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ;
}
}
}
@ -9068,21 +9086,21 @@ void nodeLinkClass::mem_log_mtcalive ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tmtcAlive: online:%c offline:%c Cnt:%d Gate:%s Misses:%d\n",
node_ptr->hostname.c_str(),
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tmtcAlive: online:%c offline:%c Cnt:%d Gate:%s Misses:%d\n",
node_ptr->hostname.c_str(),
node_ptr->mtcAlive_online ? 'Y' : 'N',
node_ptr->mtcAlive_offline ? 'Y' : 'N',
node_ptr->mtcAlive_count,
node_ptr->mtcAlive_gate ? "closed" : "open",
node_ptr->mtcAlive_misses);
node_ptr->mtcAlive_misses);
mem_log (str);
}
void nodeLinkClass::mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAlarm List:%s%s%s%s%s%s\n",
node_ptr->hostname.c_str(),
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAlarm List:%s%s%s%s%s%s\n",
node_ptr->hostname.c_str(),
node_ptr->alarms[MTC_ALARM_ID__LOCK ] ? " Locked" : " .",
node_ptr->alarms[MTC_ALARM_ID__CONFIG ] ? " Config" : " .",
node_ptr->alarms[MTC_ALARM_ID__ENABLE ] ? " Enable" : " .",
@ -9092,6 +9110,18 @@ void nodeLinkClass::mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr )
mem_log (str);
}
void nodeLinkClass::mem_log_alarm2 ( struct nodeLinkClass::node * node_ptr )
{
if ( ! node_ptr->active_alarms.empty() )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tActive Alarms:%s\n",
node_ptr->hostname.c_str(),
node_ptr->active_alarms.c_str());
mem_log (str);
}
}
void nodeLinkClass::mem_log_stage ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
@ -9142,8 +9172,8 @@ void nodeLinkClass::mem_log_network ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s %s cluster_host_ip: %s Uptime: %u\n",
node_ptr->hostname.c_str(),
node_ptr->mac.c_str(),
node_ptr->hostname.c_str(),
node_ptr->mac.c_str(),
node_ptr->ip.c_str(),
node_ptr->clstr_ip.c_str(),
node_ptr->uptime );
@ -9155,11 +9185,11 @@ void nodeLinkClass::mem_log_heartbeat ( struct nodeLinkClass::node * node_ptr )
char str[MAX_MEM_LOG_DATA] ;
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
{
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s Minor:%s Degrade:%s Failed:%s Monitor:%s\n",
node_ptr->hostname.c_str(),
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s Minor:%s Degrade:%s Failed:%s Monitor:%s\n",
node_ptr->hostname.c_str(),
get_iface_name_str (iface),
node_ptr->hbs_minor[iface] ? "true " : "false",
node_ptr->hbs_degrade[iface] ? "true " : "false",
node_ptr->hbs_minor[iface] ? "true " : "false",
node_ptr->hbs_degrade[iface] ? "true " : "false",
node_ptr->hbs_failure[iface] ? "true " : "false",
node_ptr->monitor[iface] ? "YES" : "no" );
mem_log (str);
@ -9188,8 +9218,8 @@ void nodeLinkClass::mem_log_hbs_cnts ( struct nodeLinkClass::node * node_ptr )
void nodeLinkClass::mem_log_test_info ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tOOS Stage:%s Runs:%d - INSV Stage:%s Runs:%d\n",
node_ptr->hostname.c_str(),
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tOOS Stage:%s Runs:%d - INSV Stage:%s Runs:%d\n",
node_ptr->hostname.c_str(),
get_oosTestStages_str(node_ptr->oosTestStage).c_str(),
node_ptr->oos_test_count,
get_insvTestStages_str(node_ptr->insvTestStage).c_str(),
@ -9261,6 +9291,7 @@ void nodeLinkClass::memDumpNodeState ( string hostname )
// mem_log_reset_info ( node_ptr );
mem_log_power_info ( node_ptr );
mem_log_alarm1 ( node_ptr );
mem_log_alarm2 ( node_ptr );
mem_log_mtcalive ( node_ptr );
mem_log_stage ( node_ptr );
mem_log_bm ( node_ptr );

View File

@ -652,12 +652,12 @@ private:
/** @} private_monitoring_services_variables */
/* List of alarms and current severity */
#define MAX_ALARMS (10)
/* List of alarms current severity */
EFmAlarmSeverityT alarms[MAX_ALARMS];
/* tracks whether the alarms for this host have been loaded already or not */
bool alarms_loaded ;
/* string containing active alarms and their severity
* ... for logging purposes only */
string active_alarms ;
/** true if this host has recovered before the mnfa timeout period.
* This bool flags the graceful recovery handler that this node
@ -665,8 +665,6 @@ private:
* and uptime accordingly */
bool mnfa_graceful_recovery ;
int stress_iteration ;
/* BMC Protocol Learning Controls and State */
/* specifies what BMC protocol is selected for this host
@ -843,6 +841,9 @@ private:
/* server specific power state query handler */
bool (*is_poweron_handler) (string hostname, string query_response );
/* Audit that monitors and auto corrects alarm state mismatches */
void mtcAlarm_audit ( struct nodeLinkClass::node * node_ptr );
/* Calculate the overall reset progression timeout */
int calc_reset_prog_timeout ( struct nodeLinkClass::node * node_ptr, int retries );
@ -1304,6 +1305,7 @@ private:
void mem_log_state1 ( struct nodeLinkClass::node * node_ptr );
void mem_log_state2 ( struct nodeLinkClass::node * node_ptr );
void mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr );
void mem_log_alarm2 ( struct nodeLinkClass::node * node_ptr );
void mem_log_mtcalive ( struct nodeLinkClass::node * node_ptr );
void mem_log_stage ( struct nodeLinkClass::node * node_ptr );
void mem_log_test_info ( struct nodeLinkClass::node * node_ptr );

View File

@ -26,6 +26,7 @@ using namespace std;
#include "daemon_common.h" /* */
#include "nodeBase.h" /* */
#include "nodeClass.h" /* */
#include "nodeTimers.h" /* */
#include "nodeUtil.h" /* */
#include "mtcAlarm.h" /* for ... this module header */
@ -379,8 +380,169 @@ void mtcAlarm_clear_all ( string hostname )
}
}
/****************************************************************************
*
* Name : mtcAlarm_audit
*
* Purpose : Monitor and Auto-Correct maintenance alarms
*
* Description: Query locked state alarm (raw)
* if successful
* - Query alarms
* - compare to running state
* - correct mismatches ; internal state takes precedence
* - log all alarm state changes
*
****************************************************************************/
void nodeLinkClass::mtcAlarm_audit ( struct nodeLinkClass::node * node_ptr )
{
/*
* Read locked state alarm directly to detect fm access failures.
* If successful, further reads are done using a wrapper utility.
*/
SFmAlarmDataT alarm_query ;
AlarmFilter alarm_filter ;
EFmErrorT rc ;
memset(&alarm_query, 0, sizeof(alarm_query));
memset(&alarm_filter, 0, sizeof(alarm_filter));
snprintf ( &alarm_filter.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s",
LOCK_ALARM_ID);
snprintf ( &alarm_filter.entity_instance_id[0], FM_MAX_BUFFER_LENGTH, "%s%s",
ENTITY_PREFIX, node_ptr->hostname.data());
rc = fm_get_fault ( &alarm_filter, &alarm_query );
if (( rc != FM_ERR_OK ) && ( rc != FM_ERR_ENTITY_NOT_FOUND ))
{
wlog("%s alarm query failure ; code:%d",
node_ptr->hostname.c_str(),
rc );
return ;
}
/* With FM comms proven working, let's check the other mtc alarms */
string active_alarms = "";
for ( int i = 0 ; i < MAX_ALARMS ; i++ )
{
mtc_alarm_id_enum id = (mtc_alarm_id_enum)i ;
if ( id == MTC_ALARM_ID__LOCK )
{
/* Unexpected severity case */
if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
{
if ( alarm_query.severity != FM_ALARM_SEVERITY_WARNING )
{
node_ptr->alarms[id] = FM_ALARM_SEVERITY_WARNING ;
wlog("%s %s alarm mismatch ; %s -> %s",
node_ptr->hostname.c_str(),
_getIdentity(id).c_str(),
alarmUtil_getSev_str(alarm_query.severity).c_str(),
alarmUtil_getSev_str(node_ptr->alarms[id]).c_str());
mtcAlarm_warning ( node_ptr->hostname, MTC_ALARM_ID__LOCK );
}
if (!active_alarms.empty())
active_alarms.append(", ");
active_alarms.append(_getIdentity(id) + ":");
active_alarms.append(alarmUtil_getSev_str(node_ptr->alarms[id]));
}
/* Unexpected assertion case */
else if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( alarm_query.severity != FM_ALARM_SEVERITY_CLEAR ))
{
node_ptr->alarms[id] = FM_ALARM_SEVERITY_CLEAR ;
wlog("%s %s alarm mismatch ; %s -> %s",
node_ptr->hostname.c_str(),
_getIdentity(id).c_str(),
alarmUtil_getSev_str(alarm_query.severity).c_str(),
alarmUtil_getSev_str(node_ptr->alarms[id]).c_str());
mtcAlarm_clear ( node_ptr->hostname, id );
}
}
else if (( id == MTC_ALARM_ID__CONFIG ) ||
( id == MTC_ALARM_ID__ENABLE ) ||
( id == MTC_ALARM_ID__BM ) ||
( id == MTC_ALARM_ID__CH_CONT) ||
( id == MTC_ALARM_ID__CH_COMP))
{
EFmAlarmSeverityT severity = mtcAlarm_state ( node_ptr->hostname, id);
if ( severity != node_ptr->alarms[id] )
{
ilog ("%s %s alarm mismatch ; %s -> %s",
node_ptr->hostname.c_str(),
_getIdentity(id).c_str(),
alarmUtil_getSev_str(severity).c_str(),
alarmUtil_getSev_str(node_ptr->alarms[id]).c_str());
if ( node_ptr->alarms[id] == FM_ALARM_SEVERITY_CLEAR )
{
mtcAlarm_clear ( node_ptr->hostname, id );
}
else
{
mtcAlarm_raise ( node_ptr->hostname, id, node_ptr->alarms[id] );
}
}
if ( node_ptr->alarms[id] != FM_ALARM_SEVERITY_CLEAR )
{
if (!active_alarms.empty())
active_alarms.append(", ");
active_alarms.append(_getIdentity(id) + ":");
active_alarms.append(alarmUtil_getSev_str(node_ptr->alarms[id]));
}
}
/* else don't care about other alarm ids ; logs, events, etc */
}
/* manage logging of active alarms */
if ( !active_alarms.empty() )
{
if ( node_ptr->active_alarms != active_alarms )
{
ilog ("%s active alarms: %s",
node_ptr->hostname.c_str(),
active_alarms.c_str());
node_ptr->active_alarms = active_alarms ;
}
/* else
* do nothing because there are active alarms
* that have not changed since the last audit.
*/
}
else if ( ! node_ptr->active_alarms.empty() )
{
/* clear the active alarm list since there 'were' active alarms
* but none are active any longer */
node_ptr->active_alarms.clear();
ilog ("%s no active alarms", node_ptr->hostname.c_str());
}
/* else
* no active alarms ; don't log */
}
/************************* A L A R M I N G **************************/
/* Raise the specified maintenance alarm severity */
int mtcAlarm_raise ( string hostname, mtc_alarm_id_enum id, EFmAlarmSeverityT severity )
{
switch ( severity )
{
case FM_ALARM_SEVERITY_MINOR:
return (mtcAlarm_minor(hostname,id));
case FM_ALARM_SEVERITY_MAJOR:
return (mtcAlarm_major(hostname,id));
case FM_ALARM_SEVERITY_CRITICAL:
return (mtcAlarm_critical(hostname,id));
default:
return (FAIL_BAD_PARM);
}
}
/* Clear the specified hosts's maintenance alarm */
int mtcAlarm_clear ( string hostname, mtc_alarm_id_enum id )
{

View File

@ -95,6 +95,9 @@ string mtcAlarm_getId_str ( mtc_alarm_id_enum id );
/** Clear the specified maintenance alarm for specific host */
int mtcAlarm_clear ( string hostname, mtc_alarm_id_enum id );
/** Raise specified severity level alarm for the specified host */
int mtcAlarm_raise ( string hostname, mtc_alarm_id_enum id, EFmAlarmSeverityT severity );
/** Assert a specified mtce alarm against the specified host with a WARNING severity level */
int mtcAlarm_warning ( string hostname, mtc_alarm_id_enum id );

View File

@ -1187,15 +1187,6 @@ int _self_provision ( void )
if ( my_identity.name == record_info.name )
{
/* If the active controller was 'locked' and is being auto-corrected
* to 'unlocked' then ensure that there is no locked alarm set for it */
if ( record_info.admin != "locked" )
{
mtcAlarm_clear ( my_identity.name, MTC_ALARM_ID__LOCK );
/* this is not required because its already inited to clear */
// node_ptr->alarms[MTC_ALARM_ID__LOCK] = FM_ALARM_SEVERITY_CLEAR
}
if ( my_identity.mac != record_info.mac )
{
wlog ("%s mac address mismatch (%s - %s)\n",

View File

@ -6107,48 +6107,32 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
mtcInvApi_update_state ( node_ptr, "availability", "available" );
}
/* handle other cases */
EFmAlarmSeverityT sev = mtcAlarm_state ( node_ptr->hostname,
MTC_ALARM_ID__ENABLE);
/* Query FM for existing Enable and Config alarm status */
EFmAlarmSeverityT enable_alarm_severity =
mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__ENABLE);
EFmAlarmSeverityT config_alarm_severity =
mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__CONFIG);
if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
/* Clear generic enable alarm over process restart.
* Will get reasserted if the cause condition still exists */
if ( enable_alarm_severity != FM_ALARM_SEVERITY_CLEAR )
{
node_ptr->alarms[MTC_ALARM_ID__LOCK] = FM_ALARM_SEVERITY_WARNING ;
/* If the node is locked then the Enable alarm
* should not be present */
if ( sev != FM_ALARM_SEVERITY_CLEAR )
{
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
sev = FM_ALARM_SEVERITY_CLEAR ;
}
ilog ("%s found enable alarm ; clearing %s",
node_ptr->hostname.c_str(),
alarmUtil_getSev_str(enable_alarm_severity).c_str());
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
}
/* Manage enable alarm over process restart.
*
* - clear the alarm in the active controller case
* - maintain the alarm, set degrade state in MAJOR and CRIT cases
* - clear alarm for all other severities.
*/
if ( THIS_HOST )
/* The config alarm is maintained if it exists.
* The in-service test handler will clear the alarm
* if the config failure is gone */
if ( config_alarm_severity != FM_ALARM_SEVERITY_CLEAR )
{
if ( sev != FM_ALARM_SEVERITY_CLEAR )
{
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
}
}
else
{
if (( sev == FM_ALARM_SEVERITY_CRITICAL ) ||
( sev == FM_ALARM_SEVERITY_MAJOR ))
{
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = sev ;
node_ptr->degrade_mask |= DEGRADE_MASK_ENABLE ;
}
else if ( sev != FM_ALARM_SEVERITY_CLEAR )
{
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
}
node_ptr->degrade_mask |= DEGRADE_MASK_CONFIG ;
node_ptr->alarms[MTC_ALARM_ID__CONFIG] = config_alarm_severity ;
ilog ("%s found config alarm ; loaded %s",
node_ptr->hostname.c_str(),
alarmUtil_getSev_str(config_alarm_severity).c_str());
}
if ( is_controller(node_ptr) )
@ -6188,7 +6172,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
{
ilog ("%s %s\n",node_ptr->hostname.c_str(), MTC_TASK_SWACT_COMPLETE );
/* Work Around for issue: */
mtcInvApi_update_uptime ( node_ptr, node_ptr->uptime );
mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_COMPLETE );
@ -6222,7 +6205,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
mtcSmgrApi_request ( node_ptr, state , SWACT_FAIL_THRESHOLD );
}
}
if ( daemon_get_cfg_ptr()->debug_level & 1 )
nodeLinkClass::host_print (node_ptr);
@ -6357,6 +6339,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
}
case MTC_ADD__WORKQUEUE_WAIT:
{
rc = workQueue_done ( node_ptr );
if ( rc == RETRY )
{
@ -6444,6 +6427,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
}
node_ptr->addStage = MTC_ADD__START;
plog ("%s Host Add Completed (uptime:%d)\n", node_ptr->hostname.c_str(), node_ptr->uptime );
node_ptr->add_completed = true ;
break ;
@ -7202,6 +7186,9 @@ int nodeLinkClass::oos_test_handler ( struct nodeLinkClass::node * node_ptr )
}
}
/* audit alarms */
mtcAlarm_audit (node_ptr );
break ;
}
case MTC_OOS_TEST__WAIT:
@ -7600,7 +7587,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
}
}
/* Monitor the health of the host - no pass file */
/* Monitor the health of the host */
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
(( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
@ -7626,6 +7613,11 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
ilog ("%s sm degrade clear\n", node_ptr->hostname.c_str());
}
/*
* In-service Config Failure/Alarm handling
*/
/* Detect new config failure condition */
if ( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY)
{
/* not healthy .... */
@ -7637,16 +7629,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
{
wlog_throttled ( node_ptr->health_threshold_counter, (MTC_UNHEALTHY_THRESHOLD*10), "%s is UNHEALTHY\n", node_ptr->hostname.c_str());
if ( node_ptr->health_threshold_counter >= MTC_UNHEALTHY_THRESHOLD )
{
node_ptr->degrade_mask |= DEGRADE_MASK_CONFIG ;
/* threshold is reached so raise the config alarm if it is not already raised */
if ( node_ptr->alarms[MTC_ALARM_ID__CONFIG] != FM_ALARM_SEVERITY_CRITICAL )
{
mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__CONFIG );
node_ptr->alarms[MTC_ALARM_ID__CONFIG] = FM_ALARM_SEVERITY_CRITICAL ;
}
}
alarm_config_failure ( node_ptr );
}
}
else
@ -7666,6 +7649,12 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
}
}
}
/* or correct an alarmed config failure that has cleared */
else if ( node_ptr->degrade_mask & DEGRADE_MASK_CONFIG )
{
if ( node_ptr->mtce_flags & MTC_FLAG__I_AM_HEALTHY )
alarm_config_clear ( node_ptr );
}
else
{
node_ptr->health_threshold_counter = 0 ;