From f2fedc0446bc39ca59e9512dc7418ed7c02b67a1 Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Fri, 27 Sep 2019 11:04:33 -0400 Subject: [PATCH] Add alarm retry support to maintenance alarm handling daemon The maintenance alarm handling daemon (mtcalarmd) should not drop alarm requests simply because the FM process is not running. Instead, it should retry in that case and in other FM error cases that are likely to succeed in time if they are retried. Some error cases, however, do need to be dropped, namely those that are unlikely to succeed with retries. Reviewed the FM return codes with the FM designer, which led to a list of errors that should be dropped and others that should be retried. This update implements that handling by posting to and servicing a first-in / first-out alarm queue. The typical retry case is the NOCONNECT error code, which occurs when FM is not running. Alarm ordering and the first-try timestamp are maintained. Retries and logs are throttled to avoid flooding. Test Plan: PASS: Verify success path alarm handling End-to-End. PASS: Verify retry handling while FM is not running. PASS: Verify handling of all FM error codes (fit tool). PASS: Verify alarm handling under stress (inject-alarm script) soak. PASS: Verify no memory leak over stress soak. PASS: Verify logging (success, retry, failure). PASS: Verify alarm posted date is maintained over retry success. 
Change-Id: Icd1e75583ef660b767e0788dd4af7f184bdb9e86 Closes-Bug: 1841653 Signed-off-by: Eric MacDonald --- mtce-common/src/common/fitCodes.h | 4 +- mtce-common/src/common/nodeBase.h | 2 + mtce/src/alarm/alarm.h | 38 ++-- mtce/src/alarm/alarmHdlr.cpp | 46 +++-- mtce/src/alarm/alarmInit.cpp | 15 +- mtce/src/alarm/alarmMgr.cpp | 295 +++++++++++++++++++++++++----- mtce/src/alarm/alarmUtil.cpp | 93 ++++++---- mtce/src/common/nodeClass.h | 1 + 8 files changed, 376 insertions(+), 118 deletions(-) diff --git a/mtce-common/src/common/fitCodes.h b/mtce-common/src/common/fitCodes.h index 10abb498..e53ed254 100644 --- a/mtce-common/src/common/fitCodes.h +++ b/mtce-common/src/common/fitCodes.h @@ -67,6 +67,7 @@ #define MTC_CMD_FIT__JSON_LEAK_SOAK ("/var/run/fit/json_leak_soak") /* mtcAgent */ #define MTC_CMD_FIT__BMC_ACC_FAIL ("/var/run/fit/bmc_access_fail")/* mtcAgent */ #define MTC_CMD_FIT__MEM_LEAK_DEBUG ("/var/run/fit/mem_leak_debug")/* mtcAgent */ +#define MTC_CMD_FIT__FM_ERROR_CODE ("/var/run/fit/fm_error_code") /* mtcAgent */ /***************************************************** * Fault Insertion Codes @@ -120,7 +121,8 @@ #define FIT_CODE__FM_SET_ALARM (40) #define FIT_CODE__FM_GET_ALARM (41) -#define FIT_CODE__FM_QRY_ALARMS (42) +#define FIT_CODE__FM_CLR_ALARM (42) +#define FIT_CODE__FM_QRY_ALARMS (43) #define FIT_CODE__BMC_COMMAND_SEND (60) #define FIT_CODE__BMC_COMMAND_RECV (61) diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h index f56161ca..da71cd08 100755 --- a/mtce-common/src/common/nodeBase.h +++ b/mtce-common/src/common/nodeBase.h @@ -371,6 +371,8 @@ typedef enum /* 50 milliseconds */ #define SOCKET_WAIT 50000 +#define SOCKET_WAIT_100MS (100000) + /* 5 milliseconds */ #define MTCAGENT_SELECT_TIMEOUT (5000) diff --git a/mtce/src/alarm/alarm.h b/mtce/src/alarm/alarm.h index 1a15830e..2ce56391 100644 --- a/mtce/src/alarm/alarm.h +++ b/mtce/src/alarm/alarm.h @@ -23,7 +23,6 @@ #define ENTITY_PREFIX ((const char *)"host=") -#define 
MAX_ALARMS (10) #define MAX_ALARM_REQ_PER_MSG (4) #define MAX_ALARM_REQ_MSG_SIZE (500) #define MAX_ALARM_REQ_SIZE (MAX_ALARM_REQ_PER_MSG*MAX_ALARM_REQ_MSG_SIZE) @@ -97,6 +96,17 @@ typedef struct string clear_reason ; } alarmUtil_type ; +typedef struct +{ + string alarmid ; + string hostname ; + string operation ; + string severity ; + string entity ; + string prefix ; + FMTimeT timestamp ; + +} queue_entry_type; #define MAX_FAILED_B2B_RECEIVES_B4_RESTART (5) @@ -130,13 +140,9 @@ alarmUtil_type * alarmData_getAlarm_ptr ( string alarm_id_str ); /* in alarmHdlr.cpp */ int alarmHdlr_request_handler ( char * msg_ptr ); -/* in alarmMgr.cpp */ -int alarmMgr_manage_alarm ( string alarmid , - string hostname, - string operation, - string severity, - string entity, - string prefix); +void alarmMgr_queue_clear ( void ); +void alarmMgr_queue_alarm (queue_entry_type entry); +void alarmMgr_service_queue(void); /* Clear all alarms against this host */ void alarmUtil_clear_all ( string hostname ); @@ -154,14 +160,14 @@ int alarmUtil_query_identity ( string identity, unsigned int alarms_max ); int alarmUtil_clear ( string hostname, string alarm_id, string entity ); -int alarmUtil_critical ( string hostname, string alarm_id, string entity ); -int alarmUtil_major ( string hostname, string alarm_id, string entity ); -int alarmUtil_minor ( string hostname, string alarm_id, string entity ); -int alarmUtil_warning ( string hostname, string alarm_id, string entity ); -int alarmUtil_critical_log ( string hostname, string alarm_id, string entity ); -int alarmUtil_major_log ( string hostname, string alarm_id, string entity ); -int alarmUtil_minor_log ( string hostname, string alarm_id, string entity ); -int alarmUtil_warning_log ( string hostname, string alarm_id, string entity, string prefix ); +int alarmUtil_critical ( string hostname, string alarm_id, string entity, FMTimeT & timestamp ); +int alarmUtil_major ( string hostname, string alarm_id, string entity, FMTimeT & timestamp ); +int 
alarmUtil_minor ( string hostname, string alarm_id, string entity, FMTimeT & timestamp ); +int alarmUtil_warning ( string hostname, string alarm_id, string entity, FMTimeT & timestamp ); +int alarmUtil_critical_log ( string hostname, string alarm_id, string entity, FMTimeT & timestamp ); +int alarmUtil_major_log ( string hostname, string alarm_id, string entity, FMTimeT & timestamp ); +int alarmUtil_minor_log ( string hostname, string alarm_id, string entity, FMTimeT & timestamp ); +int alarmUtil_warning_log ( string hostname, string alarm_id, string entity, string prefix, FMTimeT & timestamp ); #endif // _MODULE_PRIVATE_ #endif // __INCLUDE_ALARM_H__ diff --git a/mtce/src/alarm/alarmHdlr.cpp b/mtce/src/alarm/alarmHdlr.cpp index 8c2f7438..e0bb128c 100644 --- a/mtce/src/alarm/alarmHdlr.cpp +++ b/mtce/src/alarm/alarmHdlr.cpp @@ -31,6 +31,24 @@ using namespace std; void daemon_sigchld_hdlr ( void ) { ; } +/***************************************************************************** + * + * Name : _fm_timestamp + * + * Purpose : Get a microsecond timestamp of the current time. 
+ * + * Description: Used to record the time the alarm/log was requested + * + * Uses : FMTimeT from fmAPI.h + * + ****************************************************************************/ + +FMTimeT _fm_timestamp ( void ) +{ + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + return ( ts.tv_sec*1000000 + ts.tv_nsec/1000 ); +} /** Daemon timer handler */ void _timer_handler ( int sig, siginfo_t *si, void *uc) @@ -62,23 +80,19 @@ int alarmHdlr_request_handler ( char * msg_ptr ) if ( elements ) { #define PARSE_FAILURE ((const char *)"failed to parse value for key") - string alarmid = "" ; - string hostname = "" ; + queue_entry_type entry ; + string alarm_req = "" ; string operation = "" ; string severity = "" ; - string entity = "" ; - string prefix = "" ; - string alarm_req = "" ; - for ( int i = 0 ; i < elements ; i++ ) { if ( ( rc = jsonUtil_get_array_idx ( msg_ptr, MTCALARM_REQ_LABEL, i, alarm_req ) ) == PASS ) { - if (( rc = jsonUtil_get_key_val ( (char*)alarm_req.data(), MTCALARM_REQ_KEY__ALARMID, alarmid )) != PASS ) + if (( rc = jsonUtil_get_key_val ( (char*)alarm_req.data(), MTCALARM_REQ_KEY__ALARMID, entry.alarmid )) != PASS ) { elog ("%s '%s'\n", PARSE_FAILURE, MTCALARM_REQ_KEY__ALARMID); } - else if (( rc = jsonUtil_get_key_val ( (char*)alarm_req.data(), MTCALARM_REQ_KEY__HOSTNAME, hostname )) != PASS ) + else if (( rc = jsonUtil_get_key_val ( (char*)alarm_req.data(), MTCALARM_REQ_KEY__HOSTNAME, entry.hostname )) != PASS ) { elog ("%s '%s'\n", PARSE_FAILURE, MTCALARM_REQ_KEY__HOSTNAME); } @@ -90,23 +104,19 @@ int alarmHdlr_request_handler ( char * msg_ptr ) { elog ("%s '%s'\n", PARSE_FAILURE, MTCALARM_REQ_KEY__SEVERITY); } - else if (( rc = jsonUtil_get_key_val ( (char*)alarm_req.data(), MTCALARM_REQ_KEY__ENTITY, entity )) != PASS ) + else if (( rc = jsonUtil_get_key_val ( (char*)alarm_req.data(), MTCALARM_REQ_KEY__ENTITY, entry.entity )) != PASS ) { elog ("%s '%s'\n", PARSE_FAILURE, MTCALARM_REQ_KEY__ENTITY); } - else if (( rc = 
jsonUtil_get_key_val ( (char*)alarm_req.data(), MTCALARM_REQ_KEY__PREFIX, prefix)) != PASS ) + else if (( rc = jsonUtil_get_key_val ( (char*)alarm_req.data(), MTCALARM_REQ_KEY__PREFIX, entry.prefix)) != PASS ) { elog ("%s '%s'\n", PARSE_FAILURE, MTCALARM_REQ_KEY__PREFIX); } else - { - jlog ("Alarm Message has %d requests\n", elements ); - rc = alarmMgr_manage_alarm ( alarmid, - hostname, - tolowercase(operation), - tolowercase(severity), - entity, - prefix); + { entry.timestamp = _fm_timestamp (); + entry.operation = tolowercase(operation); + entry.severity = tolowercase(severity); + alarmMgr_queue_alarm (entry); } if ( rc ) break ; } diff --git a/mtce/src/alarm/alarmInit.cpp b/mtce/src/alarm/alarmInit.cpp index dcf29332..7da739d6 100644 --- a/mtce/src/alarm/alarmInit.cpp +++ b/mtce/src/alarm/alarmInit.cpp @@ -192,6 +192,13 @@ int daemon_init ( string iface, string nodeType_str ) void daemon_service_run ( void ) { int rc = PASS ; + +#ifdef WANT_FIT_TESTING + daemon_init_fit (); +#endif + + alarmMgr_queue_clear(); + if (( mtcalarm_req_sock_ptr ) && ( mtcalarm_req_sock_ptr->getFD() )) { std::list socks ; @@ -213,7 +220,7 @@ void daemon_service_run ( void ) { daemon_signal_hdlr (); waitd.tv_sec = 0; - waitd.tv_usec = SOCKET_WAIT; + waitd.tv_usec = SOCKET_WAIT_100MS; /* Initialize the master fd_set */ FD_ZERO(&readfds); @@ -269,6 +276,12 @@ void daemon_service_run ( void ) break ; } } + +#ifdef WANT_FIT_TESTING + daemon_load_fit(); +#endif + + alarmMgr_service_queue(); } } else diff --git a/mtce/src/alarm/alarmMgr.cpp b/mtce/src/alarm/alarmMgr.cpp index 817e20b1..2d196222 100644 --- a/mtce/src/alarm/alarmMgr.cpp +++ b/mtce/src/alarm/alarmMgr.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017 Wind River Systems, Inc. + * Copyright (c) 2016-2017,2019 Wind River Systems, Inc. 
* * SPDX-License-Identifier: Apache-2.0 * @@ -7,7 +7,7 @@ /** * @file - * Wind River Titanium Cloud Maintenance Alarm Manager Daemon Manager + * Starling-X Maintenance Alarm Manager Daemon Manager */ #include @@ -19,88 +19,297 @@ using namespace std; #define __MODULE_PRIVATE__ -#include "alarm.h" /* module header */ +#include "daemon_common.h" /* for ... gettime_monotonic_nsec */ +#include "alarm.h" /* module header */ -int alarmMgr_manage_alarm ( string alarmid, - string hostname, - string operation, - string severity, - string entity, - string prefix) +/* Accommodate MNFA heartbeat alarms. + * Up to 2 (Mgmnt and Cluster) for each node of up to 1000 nodes = 2000 */ +#define MAX_QUEUED_ALARMS (2000) + +/* the alarm queue */ +static list alarm_queue ; + +/* FM retry throttle */ +static unsigned long long _holdoff_timestamp = 0 ; + +/************************************************************************* + * + * Name : _pop_front, _pop_back + * + * Scope : local + * + * Purpose : Remove the entry at the head/tail of the queue. + * + * Also reset the FM retry holdoff timestamp. + * + ************************************************************************/ + +void _pop_front( void ) { + if ( alarm_queue.size() ) + { + alarm_queue.pop_front(); + } + _holdoff_timestamp = 0 ; +} + +void _pop_back( void ) +{ + if ( alarm_queue.size() ) + { + alarm_queue.pop_back(); + } + _holdoff_timestamp = 0 ; +} + +/************************************************************************* + * + * Name : alarmMgr_queue_clear + * + * Purpose : Clear the alarm queue ; called from init. + * + ************************************************************************/ +void alarmMgr_queue_clear ( void ) +{ + alarm_queue.clear(); +} + +/************************************************************************* + * + * Name : alarmMgr_queue_alarm + * + * Purpose : Add an incoming alarm request to the tail of the queue. 
+ * + ************************************************************************/ +void alarmMgr_queue_alarm ( queue_entry_type entry ) +{ + alog ("%s adding %s to alarm queue [size=%ld]\n", + entry.hostname.c_str(), + entry.alarmid.c_str(), + alarm_queue.size() ); + + alarm_queue.push_back(entry); +} + +/************************************************************************* + * + * Name : alarmMgr_service_queue + * + * Purpose : Service the alarm queue from the head. + * + * Description: Load the first/oldest element of the queue and submit it + * to FM. + * + * If it fails for a reason that is likely to resolve itself + * with a retry, then it is not popped off the head. Instead + * it is left there to be retried after the hold off period. + * + * If it fails for a reason that is NOT likely to succeed + * by retries then an error log is produced and this faulty + * entry is dropped. It is done this way to prevent a bad + * entry from stalling/blocking the queue. + * + ************************************************************************/ + +/* 5 second holdoff time before FM retry */ +#define RETRY_HOLDOFF_TIME_NSECS ((unsigned long long)(5000000000)) + +void alarmMgr_service_queue ( void ) +{ + alog1 ("Elements: %ld\n", alarm_queue.size()); + if ( alarm_queue.empty() ) + return ; + + /* throttle access to FM if in retry mode */ + if ( _holdoff_timestamp ) + { + unsigned long long _now_time = gettime_monotonic_nsec (); + + /* only retry every RETRY_HOLDOFF_TIME_NSECS while in holdoff */ + if (( _now_time-_holdoff_timestamp ) < RETRY_HOLDOFF_TIME_NSECS) + return ; + else + _holdoff_timestamp = 0 ; + } + + queue_entry_type entry = alarm_queue.front() ; + int rc = PASS ; - string action = operation ; + string action = entry.operation ; action.append (" alarm"); - EFmAlarmSeverityT sev ; - ilog ("Alarm: alarmid:%s hostname:%s operation:%s severity:%s entity:%s prefix:%s\n", - alarmid.c_str(), - hostname.c_str(), - operation.c_str(), - severity.c_str(), - 
entity.c_str(), - prefix.c_str()); + alog ("%s %s operation:%s severity:%s entity:%s prefix:%s\n", + entry.hostname.c_str(), + entry.alarmid.c_str(), + entry.operation.c_str(), + entry.severity.c_str(), + entry.entity.c_str(), + entry.prefix.c_str()); - sev = alarmUtil_getSev_enum ( severity ); - if (!operation.compare("msg")) + EFmAlarmSeverityT sev = alarmUtil_getSev_enum ( entry.severity ); + + /* customer logs */ + if ( entry.operation == "msg" ) { if ( sev == FM_ALARM_SEVERITY_WARNING ) { - //if ( prefix.compare("none")) - alarmUtil_warning_log ( hostname, alarmid, entity, prefix ); - //else - // mtcAlarm_warning_log ( hostname, id, entity ); + rc = alarmUtil_warning_log ( entry.hostname, entry.alarmid, entry.entity, entry.prefix, entry.timestamp ); } else if ( sev == FM_ALARM_SEVERITY_MINOR ) { - rc = alarmUtil_minor_log ( hostname, alarmid, entity ); + rc = alarmUtil_minor_log ( entry.hostname, entry.alarmid, entry.entity, entry.timestamp ); } else if ( sev == FM_ALARM_SEVERITY_MAJOR) { - rc = alarmUtil_major_log ( hostname, alarmid, entity ); + rc = alarmUtil_major_log ( entry.hostname, entry.alarmid, entry.entity, entry.timestamp ); } else if ( sev == FM_ALARM_SEVERITY_CRITICAL ) { - rc = alarmUtil_critical_log ( hostname, alarmid, entity ); + rc = alarmUtil_critical_log ( entry.hostname, entry.alarmid, entry.entity, entry.timestamp ); } else { - rc = FAIL_INVALID_OPERATION ; - wlog ("Unsupported log severity '%d:%s'\n", sev, severity.c_str()); + rc = FM_ERR_INVALID_REQ ; + wlog ("Unsupported log severity '%d:%s'\n", sev, entry.severity.c_str()); } action="create log" ; } - /* Get the state */ - else if ( !operation.compare("clear")) + /* alarm clear request */ + else if ( entry.operation == "clear" ) { - rc = alarmUtil_clear ( hostname, alarmid, entity ); + rc = alarmUtil_clear ( entry.hostname, entry.alarmid, entry.entity ); } - else if ( !operation.compare("set") ) + /* alarm set request */ + else if ( entry.operation == "set" ) { if ( sev == 
FM_ALARM_SEVERITY_WARNING ) - rc = alarmUtil_warning ( hostname, alarmid, entity ); + rc = alarmUtil_warning ( entry.hostname, entry.alarmid, entry.entity, entry.timestamp ); else if ( sev == FM_ALARM_SEVERITY_MINOR ) - rc = alarmUtil_minor ( hostname, alarmid, entity ); + rc = alarmUtil_minor ( entry.hostname, entry.alarmid, entry.entity, entry.timestamp ); else if ( sev == FM_ALARM_SEVERITY_MAJOR ) - rc = alarmUtil_major ( hostname, alarmid, entity ); + rc = alarmUtil_major ( entry.hostname, entry.alarmid, entry.entity, entry.timestamp ); else if ( sev == FM_ALARM_SEVERITY_CRITICAL ) - rc = alarmUtil_critical ( hostname, alarmid, entity ); + rc = alarmUtil_critical ( entry.hostname, entry.alarmid, entry.entity, entry.timestamp ); else { - rc = FAIL_INVALID_OPERATION ; + rc = FM_ERR_INVALID_REQ ; } } else { - rc = FAIL_BAD_CASE ; + rc = FM_ERR_INVALID_PARAMETER ; } - if ( rc ) + + /* Handle behavior based on return code */ + if ( rc == FM_ERR_OK ) { - elog ("%s failed to %s '%s:%s'\n", hostname.c_str(), action.c_str(), alarmid.c_str(), entity.c_str() ) + /* alarm call succeeded, pop off the list. */ + _pop_front(); } - return (rc); -} + else if ( rc == FM_ERR_ENTITY_NOT_FOUND ) + { + ilog ("%s %s '%s:%s' ; not found", + entry.hostname.c_str(), + action.c_str(), + entry.alarmid.c_str(), + entry.entity.c_str()); + _pop_front(); + } + /******************************************************************* + * Now these are non-success cases. 
+ *******************************************************************/ + + /* Most typical failure case first - FM not running */ + else if (( rc == FM_ERR_NOCONNECT ) || + ( rc == FM_ERR_REQUEST_PENDING ) || + ( rc == FM_ERR_COMMUNICATIONS )) + { + if ( _holdoff_timestamp == 0 ) + _holdoff_timestamp = gettime_monotonic_nsec(); + + string type = "" ; + if ( rc == FM_ERR_NOCONNECT ) type = "not connected" ; + else if ( rc == FM_ERR_COMMUNICATIONS ) type = "communication error" ; + else if ( rc == FM_ERR_REQUEST_PENDING ) type = "pending request" ; + + wlog ("%s %s '%s:%s' failure ; %s ; retrying [q=%ld]", + entry.hostname.c_str(), + action.c_str(), + entry.alarmid.c_str(), + entry.entity.c_str(), + type.c_str(), + alarm_queue.size()); + } + + /* Look for cases where we don't want to retry. + * + * These would be cases that are unlikely to resolve with retry. + */ + + /* pop off if alarm already asserted */ + else if ( rc == FM_ERR_ALARM_EXISTS ) + { + wlog ("%s %s '%s:%s' ; already exists", + entry.hostname.c_str(), + action.c_str(), + entry.alarmid.c_str(), + entry.entity.c_str()); + _pop_front(); + } + + /* never retry on any of these error cases */ + else if (( rc == FM_ERR_INVALID_REQ ) || + ( rc == FM_ERR_INVALID_ATTRIBUTE ) || + ( rc == FM_ERR_INVALID_PARAMETER ) || + ( rc == FM_ERR_DB_OPERATION_FAILURE ) || + ( rc == FM_ERR_RESOURCE_UNAVAILABLE )) + { + wlog ("%s failed to %s '%s:%s' ; dropped ; bad request [rc=%d]", + entry.hostname.c_str(), + action.c_str(), + entry.alarmid.c_str(), + entry.entity.c_str(), rc); + _pop_front(); + } + + /* never retry due to resource error on assert cases */ + else if (( rc == FM_ERR_NOMEM ) || + ( rc == FM_ERR_SERVER_NO_MEM ) || + ( rc == FM_ERR_NOT_ENOUGH_SPACE )) + { + wlog ("%s failed to %s '%s:%s' ; dropped ; resource error [rc=%d]", + entry.hostname.c_str(), + action.c_str(), + entry.alarmid.c_str(), + entry.entity.c_str(),rc ); + _pop_front(); + } + else + { + wlog ("%s failed to %s '%s:%s' ; dropped ; unexpected 
[rc=%d]", + entry.hostname.c_str(), + action.c_str(), + entry.alarmid.c_str(), + entry.entity.c_str(),rc ); + _pop_front(); + } + + /* pop from back if the queue is loaded to the max */ + if ( alarm_queue.size() > MAX_QUEUED_ALARMS ) + { + wlog ("%s %s '%s:%s' dropped ; most recent ; queue full", + entry.hostname.c_str(), + action.c_str(), + entry.alarmid.c_str(), + entry.entity.c_str() ); + _pop_back(); + } + else + { + ilog ("%ld queue entries to service", alarm_queue.size()); + } +} diff --git a/mtce/src/alarm/alarmUtil.cpp b/mtce/src/alarm/alarmUtil.cpp index 478cf8e7..5c0c2c92 100644 --- a/mtce/src/alarm/alarmUtil.cpp +++ b/mtce/src/alarm/alarmUtil.cpp @@ -217,6 +217,7 @@ int alarmUtil_query_identity ( string identity, SFmAlarmDataT * alarm_list_ptr, * * ********************************************************************************/ + int alarmUtil ( string & hostname, string & identity, string & instance, @@ -280,28 +281,26 @@ int alarmUtil ( string & hostname, alarm.service_affecting ? 'Y' : 'N', alarm.suppression ? 'Y' : 'N' ); - ilog ( "fm_set_fault: %s %s state:%d sev:%d type:%d cause:%d sa:%c supp:%c", - hostname.c_str(), - alarm.alarm_id, - alarm.alarm_state, - alarm.severity, - alarm.alarm_type, - alarm.probable_cause, - alarm.service_affecting ? 'Y' : 'N', - alarm.suppression ? 
'Y' : 'N' ); - - rc = fm_set_fault ( &alarm , NULL ); - if ( rc != FM_ERR_OK ) +#ifdef WANT_FIT_TESTING + if (( daemon_is_file_present ( MTC_CMD_FIT__FM_ERROR_CODE )) && + ( daemon_want_fit ( FIT_CODE__FM_SET_ALARM, hostname ))) { - wlog ("%s fm_set_fault call failed for alarm %s (rc:%d) ; retrying\n", hostname.c_str(), alarm.alarm_id, rc); - usleep (100000); /* sleep 100 msec */ - rc = fm_set_fault ( &alarm , NULL ); - if ( rc != FM_ERR_OK ) - { - elog ("%s failed to set alarm %s (rc:%d) ; giving up\n", hostname.c_str(), alarm.alarm_id, rc); - rc = FAIL ; - } + rc = daemon_get_file_int(MTC_CMD_FIT__FM_ERROR_CODE) ; } + else +#endif + { + rc = fm_set_fault ( &alarm , NULL ); + } + if ( rc == FM_ERR_OK ) + { + ilog ( "%s %s %s alarm raised (%s)", + hostname.c_str(), + alarm.alarm_id, + alarm.entity_instance_id, + alarmUtil_getSev_str(alarm.severity).c_str()); + } + /* error cases are handled/logged in the caller's ; dequeue API */ } else { @@ -313,19 +312,26 @@ int alarmUtil ( string & hostname, alog ( "fm_clear_fault: %s %s:%s", hostname.c_str(), alarm.entity_instance_id, alarm.alarm_id ); - ilog ("%s clearing %s %s alarm\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id); - if ( ( rc = fm_clear_fault ( &filter )) != FM_ERR_OK ) +#ifdef WANT_FIT_TESTING + if (( daemon_is_file_present ( MTC_CMD_FIT__FM_ERROR_CODE )) && + ( daemon_want_fit ( FIT_CODE__FM_CLR_ALARM, hostname ))) { - if ( rc != FM_ERR_ENTITY_NOT_FOUND ) - { - elog ("%s failed to fm_clear_fault (rc:%d)\n", hostname.c_str(), rc ); - rc = FAIL ; - } - else - { - rc = PASS ; - } + rc = daemon_get_file_int(MTC_CMD_FIT__FM_ERROR_CODE) ; } + else +#endif + { + rc = fm_clear_fault ( &filter ); + } + + if ( rc == FM_ERR_OK ) + { + ilog ("%s %s %s alarm cleared\n", + hostname.c_str(), + alarm.alarm_id, + alarm.entity_instance_id); + } + /* error cases are handled/logged in the caller's ; dequeue API */ } return (rc); @@ -348,7 +354,7 @@ int alarmUtil_clear ( string hostname, string alarm_id , string 
entity ) } /** Assert a specified hosts's alarm with a CRITICAL severity level */ -int alarmUtil_critical ( string hostname, string alarm_id , string entity ) +int alarmUtil_critical ( string hostname, string alarm_id , string entity, FMTimeT & timestamp ) { alarmUtil_type * alarm_ptr = alarmData_getAlarm_ptr(alarm_id); if ( alarm_ptr ) @@ -358,6 +364,7 @@ int alarmUtil_critical ( string hostname, string alarm_id , string entity ) alarm_ptr->alarm.severity = FM_ALARM_SEVERITY_CRITICAL ; alarm_ptr->alarm.alarm_state = FM_ALARM_STATE_SET ; + if ( timestamp ) alarm_ptr->alarm.timestamp = timestamp ; snprintf ( alarm_ptr->alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_ptr->critl_reason.data()); @@ -368,7 +375,7 @@ int alarmUtil_critical ( string hostname, string alarm_id , string entity ) /** Assert a specified host's alarm with a MAJOR severity level */ -int alarmUtil_major ( string hostname, string alarm_id , string entity ) +int alarmUtil_major ( string hostname, string alarm_id , string entity, FMTimeT & timestamp ) { alarmUtil_type * alarm_ptr = alarmData_getAlarm_ptr(alarm_id); if ( alarm_ptr ) @@ -378,6 +385,7 @@ int alarmUtil_major ( string hostname, string alarm_id , string entity ) alarm_ptr->alarm.severity = FM_ALARM_SEVERITY_MAJOR ; alarm_ptr->alarm.alarm_state = FM_ALARM_STATE_SET ; + if ( timestamp ) alarm_ptr->alarm.timestamp = timestamp ; snprintf ( alarm_ptr->alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_ptr->major_reason.data()); @@ -385,8 +393,9 @@ int alarmUtil_major ( string hostname, string alarm_id , string entity ) } return (FAIL_NULL_POINTER); } + /** Assert a specified host's alarm with a MINOR severity level */ -int alarmUtil_minor ( string hostname, string alarm_id , string entity ) +int alarmUtil_minor ( string hostname, string alarm_id , string entity, FMTimeT & timestamp ) { alarmUtil_type * alarm_ptr = alarmData_getAlarm_ptr(alarm_id); if ( alarm_ptr ) @@ -396,6 +405,7 @@ int 
alarmUtil_minor ( string hostname, string alarm_id , string entity ) alarm_ptr->alarm.severity = FM_ALARM_SEVERITY_MINOR ; alarm_ptr->alarm.alarm_state = FM_ALARM_STATE_SET ; + if ( timestamp ) alarm_ptr->alarm.timestamp = timestamp ; snprintf ( alarm_ptr->alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_ptr->minor_reason.data()); @@ -405,7 +415,7 @@ int alarmUtil_minor ( string hostname, string alarm_id , string entity ) } /** Assert a specified host's alarm with a WARNING severity level */ -int alarmUtil_warning ( string hostname, string alarm_id , string entity ) +int alarmUtil_warning ( string hostname, string alarm_id , string entity, FMTimeT & timestamp ) { alarmUtil_type * alarm_ptr = alarmData_getAlarm_ptr(alarm_id); if ( alarm_ptr ) @@ -415,6 +425,7 @@ int alarmUtil_warning ( string hostname, string alarm_id , string entity alarm_ptr->alarm.severity = FM_ALARM_SEVERITY_WARNING ; alarm_ptr->alarm.alarm_state = FM_ALARM_STATE_SET ; + if ( timestamp ) alarm_ptr->alarm.timestamp = timestamp ; snprintf ( alarm_ptr->alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_ptr->minor_reason.data()); @@ -424,7 +435,7 @@ int alarmUtil_warning ( string hostname, string alarm_id , string entity } /** Create CRITICAL log */ -int alarmUtil_critical_log ( string hostname, string alarm_id , string entity ) +int alarmUtil_critical_log ( string hostname, string alarm_id , string entity, FMTimeT & timestamp ) { alarmUtil_type * alarm_ptr = alarmData_getAlarm_ptr(alarm_id); if ( alarm_ptr ) @@ -434,6 +445,7 @@ int alarmUtil_critical_log ( string hostname, string alarm_id , string entity ) alarm_ptr->alarm.severity = FM_ALARM_SEVERITY_CRITICAL ; alarm_ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; + if ( timestamp ) alarm_ptr->alarm.timestamp = timestamp ; snprintf ( alarm_ptr->alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_ptr->critl_reason.data()); @@ -444,7 +456,7 @@ int alarmUtil_critical_log ( string 
hostname, string alarm_id , string entity ) /** Create MAJOR log */ -int alarmUtil_major_log ( string hostname, string alarm_id , string entity ) +int alarmUtil_major_log ( string hostname, string alarm_id , string entity, FMTimeT & timestamp ) { alarmUtil_type * alarm_ptr = alarmData_getAlarm_ptr(alarm_id); if ( alarm_ptr ) @@ -454,6 +466,7 @@ int alarmUtil_major_log ( string hostname, string alarm_id , string entity ) alarm_ptr->alarm.severity = FM_ALARM_SEVERITY_MAJOR ; alarm_ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; + if ( timestamp ) alarm_ptr->alarm.timestamp = timestamp ; snprintf ( alarm_ptr->alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_ptr->major_reason.data()); @@ -462,7 +475,7 @@ int alarmUtil_major_log ( string hostname, string alarm_id , string entity ) return (FAIL_NULL_POINTER); } /** Create MINOR log */ -int alarmUtil_minor_log ( string hostname, string alarm_id , string entity ) +int alarmUtil_minor_log ( string hostname, string alarm_id , string entity, FMTimeT & timestamp ) { alarmUtil_type * alarm_ptr = alarmData_getAlarm_ptr(alarm_id); if ( alarm_ptr ) @@ -472,6 +485,7 @@ int alarmUtil_minor_log ( string hostname, string alarm_id , string entit alarm_ptr->alarm.severity = FM_ALARM_SEVERITY_MINOR ; alarm_ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; + if ( timestamp ) alarm_ptr->alarm.timestamp = timestamp ; snprintf ( alarm_ptr->alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_ptr->minor_reason.data()); @@ -481,7 +495,7 @@ int alarmUtil_minor_log ( string hostname, string alarm_id , string entit } /** Create WARNING log */ -int alarmUtil_warning_log ( string hostname, string alarm_id, string entity, string prefix ) +int alarmUtil_warning_log ( string hostname, string alarm_id, string entity, string prefix, FMTimeT & timestamp ) { alarmUtil_type * alarm_ptr = alarmData_getAlarm_ptr(alarm_id); if ( alarm_ptr ) @@ -491,6 +505,7 @@ int alarmUtil_warning_log ( string hostname, string alarm_id, 
string entity, str alarm_ptr->alarm.severity = FM_ALARM_SEVERITY_WARNING ; alarm_ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; + if ( timestamp ) alarm_ptr->alarm.timestamp = timestamp ; snprintf ( alarm_ptr->alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), entity.data()); diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h index 42198bad..b20fbe6f 100755 --- a/mtce/src/common/nodeClass.h +++ b/mtce/src/common/nodeClass.h @@ -650,6 +650,7 @@ private: /** @} private_monitoring_services_variables */ /* List of alarms and current severity */ + #define MAX_ALARMS (10) EFmAlarmSeverityT alarms[MAX_ALARMS]; /* tracks whether the alarms for this host have been loaded already or not */