Mtce: Make Heartbeat Failure Action Configurable

The current maintenance heartbeat failure action handling is to Fail
and Gracefully Recover the host. This means that maintenance will
ensure that a heartbeat-failed host is rebooted/reset before it is
recovered, but will avoid rebooting it a second time if its recovered
uptime indicates that it has already rebooted.

This update expands that single-action handling behavior to support
three new actions. In doing so it adds a new configuration service
parameter called heartbeat_failure_action. The customer can configure
this new parameter with any one of the following four actions, listed
in order of decreasing impact.

   fail - Host is failed and gracefully recovered.
        - Current network-specific alarms continue to be raised/cleared.
          Note: Prior to this update this was the standard system behavior.
degrade - Host is only degraded while it is failing heartbeat.
        - Current network-specific alarms continue to be raised/cleared.
        - The heartbeat degrade reason and the alarms are cleared when
          heartbeat responses resume.
  alarm - The only indication of a heartbeat failure is the alarm.
        - Same set of alarms as in the above action cases.
        - In this case only: no degrade, no failure, no reboot/reset.
   none - Heartbeat is disabled; no multicast heartbeat messages are sent.
        - All existing heartbeat alarms are cleared.
        - The heartbeat soak in the enable sequence is bypassed.

The selected action is a system-wide setting.
The selected setting also applies to Multi-Node Failure Avoidance (MNFA).
The default action is the legacy 'fail' action.
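
For illustration only, a minimal sketch of how the new option might be
expressed in the mtce ini/config file parsed by the [agent] section
handlers in this change; the exact file and surrounding layout are
assumptions, not part of this change:

    [agent]                         ; section name taken from MATCH("agent", ...) below
    heartbeat_failure_action = fail ; one of: fail | degrade | alarm | none (default: fail)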

This update also:

 1. Removes the redundant in-service failure alarm for the MNFA case in
    support of the degrade-only action. Keeping it would make that alarm
    handling case unnecessarily complicated.
 2. Removes the no-longer-used 'hbs calibration' code (cleanup).
 3. Cleans up a small amount of heartbeat logging.

Test Plan:
PASS:    fail: Verify MNFA and recovery
PASS:    fail: Verify Single Host heartbeat failure and recovery
PASS:    fail: Verify Single Host heartbeat failure and recovery (from none)
PASS: degrade: Verify MNFA and recovery
PASS: degrade: Verify Single Host heartbeat failure and recovery
PASS: degrade: Verify Single Host heartbeat failure and recovery (from alarm)
PASS:   alarm: Verify MNFA and recovery
PASS:   alarm: Verify Single Host heartbeat failure and recovery
PASS:   alarm: Verify Single Host heartbeat failure and recovery (from degrade)
PASS:    none: Verify heartbeat disable, fail ignore and no recovery
PASS:    none: Verify Single Host heartbeat ignore and no recovery
PASS:    none: Verify Single Host heartbeat ignore and no recovery (from fail)
PASS: Verify action change behavior from none to alarm with active MNFA
PASS: Verify action change behavior from alarm to degrade with active MNFA
PASS: Verify action change behavior from degrade to none with active MNFA
PASS: Verify action change behavior from none to fail with active MNFA
PASS: Verify action change behavior from fail to none with active MNFA
PASS: Verify action change behavior from degrade to fail then MNFA timeout
PASS: Verify all heartbeat action change customer logs
PASS: Verify heartbeat stats clear over action change
PASS: Verify LO DOR (several large labs - compute and storage systems)
PASS: Verify recovery from failure of active controller
PASS: Verify 3 host failure behavior with MNFA threshold at 3 (action:fail)
PASS: Verify 2 host failure behavior with MNFA threshold at 3 (action:fail)

Depends-On: https://review.openstack.org/601264
Change-Id: Iede5cdbb1c923898fd71b3a95d5289182f4287b4
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
Eric MacDonald 2018-09-04 20:54:41 -04:00
parent c9a4b9c1b5
commit 74c5f89ab4
20 changed files with 693 additions and 194 deletions

View File

@ -1,3 +1,3 @@
SRC_DIR="cgts-mtce-common-1.0"
TIS_PATCH_VER=136
TIS_PATCH_VER=137
BUILD_IS_SLOW=5

View File

@ -47,6 +47,7 @@
#define COMMAND_LOG_ID ((const char *)"200.021")
#define STATECHANGE_LOG_ID ((const char *)"200.022")
#define SERVICESTATUS_LOG_ID ((const char *)"200.023") /* log used to report service failure events against */
#define CONFIG_LOG_ID ((const char *)"200.024")
/**
* TODO: This class is more of a place holder for

View File

@ -44,12 +44,7 @@ typedef struct
int hbs_minor_threshold ; /**< heartbeat miss minor threshold */
int hbs_degrade_threshold ; /**< heartbeat miss degrade threshold */
int hbs_failure_threshold ; /**< heartbeat miss failure threshold */
int hbs_calibrate_threshold ; /**< number of hosts where threshold calibration begins to take effect */
int hbs_calibrate_period_factor ; /**< hbs_pulse_period = hbs_pulse_period * hosts */
int hbs_calibrate_minor_factor ; /**< hbs_minor_threshold = threshold factor * hosts */
int hbs_calibrate_degrade_factor; /**< hbs_degrade_threshold = threshold factor * hosts */
int hbs_calibrate_fail_factor ; /**< hbs_failure_threshold = threshold factor * hosts */
char* hbs_failure_action ; /**< action to take on host heartbeat failure */
char* mgmnt_iface ; /**< management interface name pointer */
char* infra_iface ; /**< infrastructure interface name pointer */

View File

@ -258,7 +258,7 @@ nodeLinkClass::nodeLinkClass()
hbs_minor_threshold = HBS_MINOR_THRESHOLD ;
hbs_degrade_threshold = HBS_DEGRADE_THRESHOLD ;
hbs_failure_threshold = HBS_FAILURE_THRESHOLD ;
hbs_failure_action = HBS_FAILURE_ACTION__FAIL ;
hbs_silent_fault_detector = 0 ;
hbs_silent_fault_logged = false ;
@ -653,14 +653,14 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
ptr->vimEvent.buf = NULL ;
ptr->httpReq.buf = NULL ;
/* log throttles */
ptr->stall_recovery_log_throttle = 0 ;
ptr->stall_monitor_log_throttle = 0 ;
ptr->unexpected_pulse_log_throttle = 0 ;
ptr->lookup_mismatch_log_throttle = 0 ;
ptr->log_throttle = 0 ;
ptr->no_work_log_throttle = 0 ;
ptr->no_rri_log_throttle = 0 ;
ptr->degrade_mask = ptr->degrade_mask_save = DEGRADE_MASK_NONE ;
@ -1615,13 +1615,15 @@ int nodeLinkClass::alarm_config_clear ( struct nodeLinkClass::node * node_ptr )
}
/* Generate a log and a critical alarm if the node enable failed */
int nodeLinkClass::alarm_enabled_failure ( struct nodeLinkClass::node * node_ptr )
int nodeLinkClass::alarm_enabled_failure ( struct nodeLinkClass::node * node_ptr, bool want_degrade )
{
if ( (node_ptr->degrade_mask & DEGRADE_MASK_ENABLE) == 0 )
if ( want_degrade )
{
node_ptr->degrade_mask |= DEGRADE_MASK_ENABLE ;
if ( (node_ptr->degrade_mask & DEGRADE_MASK_ENABLE) == 0 )
{
node_ptr->degrade_mask |= DEGRADE_MASK_ENABLE ;
}
}
if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL )
{
elog ("%s critical enable failure\n", node_ptr->hostname.c_str());
@ -4466,7 +4468,10 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
}
else
{
alarm_enabled_failure (node_ptr);
//bool want_degrade = true ;
//if ( this->hbs_failure_action == HBS_FAILURE_ACTION__ALARM )
// want_degrade = false ;
// alarm_enabled_failure (node_ptr, want_degrade);
mnfa_add_host ( node_ptr , iface );
@ -4487,8 +4492,6 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
nodeLinkClass::set_availStatus ( hostname, MTC_AVAIL_STATUS__FAILED );
alarm_enabled_failure (node_ptr);
if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE ) &&
( node_ptr->adminAction != MTC_ADMIN_ACTION__UNLOCK ))
{
@ -4526,11 +4529,31 @@ void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface )
for ( int i = 0 ; i < MAX_IFACES ; i++ )
{
node_ptr->heartbeat_failed[i] = false ;
if ( i == MGMNT_IFACE )
{
node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
}
if ( i == INFRA_IFACE )
{
node_ptr->alarms[HBS_ALARM_ID__HB_INFRA] = FM_ALARM_SEVERITY_CLEAR ;
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_INFRA ;
}
}
}
else
{
node_ptr->heartbeat_failed[iface] = false ;
if ( iface == MGMNT_IFACE )
{
node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
}
else if ( iface == INFRA_IFACE )
{
node_ptr->alarms[HBS_ALARM_ID__HB_INFRA] = FM_ALARM_SEVERITY_CLEAR ;
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_INFRA ;
}
}
}
@ -4576,7 +4599,7 @@ void nodeLinkClass::manage_heartbeat_degrade ( string hostname, iface_enum iface
}
mnfa_add_host ( node_ptr, iface );
if ( nodeLinkClass::get_operState ( hostname ) == MTC_OPER_STATE__ENABLED )
{
if ( iface == MGMNT_IFACE )
@ -7074,7 +7097,7 @@ void nodeLinkClass::manage_autorecovery ( struct nodeLinkClass::node * node_ptr
}
else
{
alarm_enabled_failure ( node_ptr ) ;
alarm_enabled_failure ( node_ptr , true ) ;
}
allStateChange ( node_ptr, node_ptr->adminState,
@ -7155,7 +7178,7 @@ void nodeLinkClass::force_full_enable ( struct nodeLinkClass::node * node_ptr )
plog ("%s Forcing Full Enable Sequence\n", node_ptr->hostname.c_str());
/* Raise Critical Enable Alarm */
alarm_enabled_failure ( node_ptr );
alarm_enabled_failure ( node_ptr, true );
allStateChange ( node_ptr, node_ptr->adminState, MTC_OPER_STATE__DISABLED, MTC_AVAIL_STATUS__FAILED );
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
@ -7359,7 +7382,18 @@ bool nodeLinkClass::get_hbs_monitor_state ( string & hostname, int iface )
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
int rri_max = this->hosts ;
state = node_ptr->monitor[iface] ;
if ( state == true )
{
wlog_throttled (node_ptr->no_rri_log_throttle, rri_max,
"%s Not Offering RRI (%d)\n",
hostname.c_str(), this->hosts );
}
else
{
node_ptr->no_rri_log_throttle = 0 ;
}
}
}
return (state);
@ -7539,6 +7573,31 @@ int nodeLinkClass::create_pulse_list ( iface_enum iface )
return (pulses[iface]);
}
/** Clear heartbeat stats in support of failed heartbeat restart */
void nodeLinkClass::hbs_clear_all_stats ( void )
{
ilog ("clearing all hearbeat stats\n");
for ( struct node * ptr = head ; ptr != NULL ; ptr = ptr->next )
{
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
{
ptr->max_count[iface] = 0 ;
ptr->hbs_count[iface] = 0 ;
ptr->hbs_misses_count[iface] = 0 ;
ptr->b2b_pulses_count[iface] = 0 ;
ptr->b2b_misses_count[iface] = 0 ;
ptr->hbs_minor_count[iface] = 0 ;
ptr->hbs_degrade_count[iface] = 0 ;
ptr->hbs_failure_count[iface] = 0 ;
ptr->hbs_minor[iface] = false ;
ptr->hbs_degrade[iface] = false ;
ptr->hbs_failure[iface] = false ;
ptr->heartbeat_failed[iface] = false ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
}
/** Build the Resource Reference Array */
void nodeLinkClass::build_rra ( void )
@ -7717,7 +7776,7 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle
* if this interfaces failed and has not yet received the
* required number of back to back pulses needed for recovery */
clear_b2b_misses_count = false ;
ilog ("%s %s heartbeat failure recovery (%d of %d)\n",
dlog ("%s %s heartbeat failure recovery (%d of %d)\n",
node_ptr->hostname.c_str(),
get_iface_name_str(iface),
ptr->b2b_pulses_count[iface],
@ -7870,8 +7929,8 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle
}
/** This utility will try and remove a pulse from the pulse
* linked list first by index and then by hostname.
*
* linked list first by index and then by hostname.
*
* By index does not require a lookup whereas hostname does */
int nodeLinkClass::remove_pulse ( string & hostname, iface_enum iface, int index, unsigned int flags )
{
@ -7889,10 +7948,7 @@ int nodeLinkClass::remove_pulse ( string & hostname, iface_enum iface, int index
{
if ( hostname.compare("localhost") )
{
if ( get_hbs_monitor_state ( hostname , iface ) == true )
{
wlog ("%s Not Offering RRI\n", hostname.c_str());
}
get_hbs_monitor_state ( hostname , iface ) ;
}
else
{
@ -7914,7 +7970,7 @@ void nodeLinkClass::clear_pulse_list ( iface_enum iface )
}
pulse_list[iface].head_ptr = NULL ;
pulse_list[iface].tail_ptr = NULL ;
if ( ptr != NULL )
{
ptr->linknum[iface] = 0 ;
@ -7929,6 +7985,15 @@ void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_p
if ( this->heartbeat != true )
return ;
if ( this->hbs_failure_action == HBS_FAILURE_ACTION__NONE )
{
dlog ("%s dropping heartbeat alarm request (%s:%s) ; action none\n",
node_ptr->hostname.c_str(),
alarmUtil_getSev_str(sev).c_str(),
get_iface_name_str(iface) );
return ;
}
bool make_alarm_call = false ;
alarm_id_enum id ;
EFmAlarmStateT state = FM_ALARM_STATE_SET ;
@ -8025,7 +8090,7 @@ int nodeLinkClass::lost_pulses ( iface_enum iface )
{
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
{
ilog ("%-13s %s Pulse Miss (%d) (log throttled to every %d)\n",
ilog ("%s %s Pulse Miss (%d) (log throttled to every %d)\n",
pulse_ptr->hostname.c_str(),
get_iface_name_str(iface),
pulse_ptr->b2b_misses_count[iface],
@ -8034,7 +8099,7 @@ int nodeLinkClass::lost_pulses ( iface_enum iface )
/* Once the misses exceed 25 then throttle the logging to avoid flooding */
if ( (pulse_ptr->b2b_misses_count[iface] & 0xfff) == 0 )
{
ilog ("%-13s %s Pulse Miss (%d)\n", pulse_ptr->hostname.c_str(),
ilog ("%s %s Pulse Miss (%d)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface),
pulse_ptr->b2b_misses_count[iface] );
}
@ -8043,27 +8108,27 @@ int nodeLinkClass::lost_pulses ( iface_enum iface )
{
if ( pulse_ptr->b2b_misses_count[iface] > hbs_failure_threshold )
{
ilog ("%-13s %s Pulse Miss (%3d) (in failure)\n", pulse_ptr->hostname.c_str(),
ilog ("%s %s Pulse Miss (%3d) (in failure)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface),
pulse_ptr->b2b_misses_count[iface] );
}
else if ( pulse_ptr->b2b_misses_count[iface] > hbs_degrade_threshold )
{
ilog ("%-13s %s Pulse Miss (%3d) (max:%3d) (in degrade)\n", pulse_ptr->hostname.c_str(),
ilog ("%s %s Pulse Miss (%3d) (max:%3d) (in degrade)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface),
pulse_ptr->b2b_misses_count[iface],
pulse_ptr->max_count[iface]);
}
else if ( pulse_ptr->b2b_misses_count[iface] > hbs_minor_threshold )
{
ilog ("%-13s %s Pulse Miss (%3d) (max:%3d) (in minor)\n", pulse_ptr->hostname.c_str(),
ilog ("%s %s Pulse Miss (%3d) (max:%3d) (in minor)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface),
pulse_ptr->b2b_misses_count[iface] ,
pulse_ptr->max_count[iface]);
}
else
{
ilog ("%-13s %s Pulse Miss (%3d) (max:%3d)\n", pulse_ptr->hostname.c_str(),
ilog ("%s %s Pulse Miss (%3d) (max:%3d)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface),
pulse_ptr->b2b_misses_count[iface],
pulse_ptr->max_count[iface]);
@ -8072,7 +8137,7 @@ int nodeLinkClass::lost_pulses ( iface_enum iface )
}
else
{
dlog ("%-13s %s Pulse Miss (%d)\n", pulse_ptr->hostname.c_str(),
dlog ("%s %s Pulse Miss (%d)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface),
pulse_ptr->b2b_misses_count[iface] );
}

View File

@ -542,6 +542,9 @@ private:
/** Resource reference identifier, aka resource reference array index */
int rri ;
/** variable used to throttle the rri log */
int no_rri_log_throttle ;
/** @} private_Heartbeat_variables */
/**
@ -1023,7 +1026,7 @@ private:
int lazy_graceful_fs_reboot ( struct nodeLinkClass::node * node_ptr );
int alarm_enabled_clear ( struct nodeLinkClass::node * node_ptr, bool force );
int alarm_enabled_failure ( struct nodeLinkClass::node * node_ptr );
int alarm_enabled_failure ( struct nodeLinkClass::node * node_ptr, bool want_degrade );
int alarm_insv_clear ( struct nodeLinkClass::node * node_ptr, bool force );
int alarm_insv_failure ( struct nodeLinkClass::node * node_ptr );
@ -1296,6 +1299,9 @@ public:
/** The number of heartbeat misses that result in a failed state */
int hbs_failure_threshold ;
/** enumerated failure action code ; fail, degrade, alarm, none */
hbs_failure_action_enum hbs_failure_action ;
/** Running Resource Reference Identifier */
int rrri ;
@ -1427,6 +1433,7 @@ public:
* node failure avoidance threshold and until there are no more
* in service trouble hosts */
bool mnfa_active ;
void mnfa_cancel( void );
std::list<string> mnfa_awol_list ;
void mnfa_timeout_handler ( void );
@ -1526,6 +1533,10 @@ public:
//#ifdef WANT_HBS
/** Add a host to the Node list */
int add_heartbeat_host ( const node_inv_type &inv );
/** Clear heartbeat stats for all hosts */
void hbs_clear_all_stats ( void ) ;
// #endif
void host_print ( struct nodeLinkClass::node * node_ptr );

View File

@ -174,6 +174,23 @@ int client_timeout_handler ( void * user,
const char * name,
const char * value);
/* User selectable heartbeat failure actions */
typedef enum
{
HBS_FAILURE_ACTION__NONE = 0, /* no heartbeat tally */
HBS_FAILURE_ACTION__ALARM = 1, /* alarm only */
HBS_FAILURE_ACTION__DEGRADE = 2, /* degrade and alarm */
HBS_FAILURE_ACTION__FAIL = 3, /* fail and alarm */
} hbs_failure_action_enum ;
#define HBS_FAILURE_ACTION__NONE_STR ((const char *)("none"))
#define HBS_FAILURE_ACTION__ALARM_STR ((const char *)("alarm"))
#define HBS_FAILURE_ACTION__DEGRADE_STR ((const char *)("degrade"))
#define HBS_FAILURE_ACTION__FAIL_STR ((const char *)("fail"))
hbs_failure_action_enum
get_hbs_failure_action ( daemon_config_type & config );
/** Test Head Entry */
int daemon_run_testhead ( void );
/**

View File

@ -195,6 +195,48 @@ int timeout_config_handler ( void * user,
return (PASS);
}
/* ***********************************************************************
*
* Name : get_hbs_failure_action
*
* Description : Convert already loaded heartbeat failure action config
* string into its equivalent enumerated type.
* See code comments below for more detail.
*
* Assumptions : Both mtcAgent and hbsAgent need this conversion.
*
* Returns : Converted enum value ; error/default is 'fail' action
*
* ***********************************************************************/
hbs_failure_action_enum get_hbs_failure_action (
daemon_config_type & config )
{
/* push the Heartbeat Failure Action character array into string
* for easy/safe compare */
string hbs_failure_action = config.hbs_failure_action ;
/* default action is 'fail' */
hbs_failure_action_enum action_enum = HBS_FAILURE_ACTION__FAIL ;
/* look for 'none' action - hbsAgent only cares about this one
* so that it knows to clear or not to raise any alarms for heartbeat
* failures ; or degrades for that matter */
if ( hbs_failure_action == HBS_FAILURE_ACTION__NONE_STR )
action_enum = HBS_FAILURE_ACTION__NONE ;
/* look for degrade action - alarms are still managed in this mode */
else if ( hbs_failure_action == HBS_FAILURE_ACTION__DEGRADE_STR )
action_enum = HBS_FAILURE_ACTION__DEGRADE ;
/* look for 'alarm' action - no host degrade in this case */
else if ( hbs_failure_action == HBS_FAILURE_ACTION__ALARM_STR )
action_enum = HBS_FAILURE_ACTION__ALARM ;
ilog("HBS Action : %s\n", config.hbs_failure_action );
return (action_enum);
}
/* System Inventory Config Reader */
int sysinv_config_handler ( void * user,
const char * section,

View File

@ -72,6 +72,8 @@ using namespace std;
static string unexpected_pulse_list[MAX_IFACES] = { "" , "" } ;
static string arrival_histogram[MAX_IFACES] = { "" , "" } ;
static std::list<string> hostname_inventory ;
/** This heartbeat service inventory is tracked by
* the same nodeLinkClass that maintenance uses.
*
@ -88,8 +90,6 @@ int module_init ( void )
return (PASS);
}
static unsigned int my_nodetype= CGTS_NODE_NULL ;
void daemon_sigchld_hdlr ( void )
{
; /* dlog("Received SIGCHLD ... no action\n"); */
@ -107,12 +107,19 @@ daemon_config_type * daemon_get_cfg_ptr () { return &hbs_config ; }
* @see hbsBase.h for hbs_socket_type struct format.
*/
static hbs_socket_type hbs_sock ;
msgSock_type * get_mtclogd_sockPtr ( void )
{
return (&hbs_sock.mtclogd);
}
/**
* Module Control Struct - The allocated struct
* @see hbsBase.h for hbs_ctrl_type struct format.
*/
static hbs_ctrl_type hbs_ctrl ;
hbs_ctrl_type * get_hbs_ctrl_ptr () { return &hbs_ctrl ; }
#define SCHED_MONITOR__MAIN_LOOP ((const char *) "---> scheduling latency : main loop :")
#define SCHED_MONITOR__RECEIVER ((const char *) "---> scheduling latency : rx pulses :")
void monitor_scheduling ( unsigned long long & this_time, unsigned long long & prev_time , int data, const char * label_ptr )
@ -241,25 +248,31 @@ static int hbs_config_handler ( void * user,
hbsInv.hbs_failure_threshold = atoi(value);
config_ptr->mask |= CONFIG_AGENT_HBS_FAILURE ;
}
if (MATCH("agent", "hbs_calibrate_threshold"))
if (MATCH("agent", "heartbeat_failure_action"))
{
config_ptr->hbs_calibrate_threshold = atoi(value);
}
if (MATCH("agent", "hbs_calibrate_period_factor"))
{
config_ptr->hbs_calibrate_period_factor = atoi(value);
}
if (MATCH("agent", "hbs_calibrate_minor_factor"))
{
config_ptr->hbs_calibrate_minor_factor = atoi(value);
}
if (MATCH("agent", "hbs_calibrate_degrade_factor"))
{
config_ptr->hbs_calibrate_degrade_factor = atoi(value);
}
if (MATCH("agent", "hbs_calibrate_fail_factor"))
{
config_ptr->hbs_calibrate_fail_factor = atoi(value);
hbs_failure_action_enum current_action = hbsInv.hbs_failure_action ;
/*
* 1. free previous memory from strdup on reconfig
* 2. get the new value string
* 3. convert it to an enum
* 4. if failure action is 'none' then set the clear_alarms audit bool
* telling the main loop to clear all heartbeat related alarms.
* 5. clear all stats if the action is changed from none to other.
*
* Note: The none action prevents any new alarms from being raised.
*/
if ( config_ptr->hbs_failure_action )
free(config_ptr->hbs_failure_action);
config_ptr->hbs_failure_action = strdup(value);
/* get the configured action */
hbsInv.hbs_failure_action = get_hbs_failure_action(hbs_config);
if ( current_action != hbsInv.hbs_failure_action )
{
hbs_ctrl.clear_alarms = true ;
hbsInv.hbs_clear_all_stats();
}
}
if (MATCH("agent", "multicast"))
{
@ -334,6 +347,7 @@ int daemon_configure ( void )
/* Read the ini */
hbs_config.mask = 0 ;
get_debug_options ( MTCE_CONF_FILE, &hbs_config );
if (ini_parse(MTCE_CONF_FILE, hbs_config_handler, &hbs_config) < 0)
{
elog("Can't load '%s'\n", MTCE_CONF_FILE );
@ -346,8 +360,6 @@ int daemon_configure ( void )
return (FAIL_LOAD_INI);
}
get_debug_options ( MTCE_CONF_FILE, &hbs_config );
/* Verify loaded config against an expected mask
* as an ini file fault detection method */
if ( hbs_config.mask != CONFIG_AGENT_MASK )
@ -362,15 +374,13 @@ int daemon_configure ( void )
hbsInv.hbs_minor_threshold = hbsInv.hbs_degrade_threshold ;
}
// hbsInv.recalibrate_thresholds ();
/* Log the startup settings */
ilog("Realtime Pri: RR/%i \n", hbs_config.scheduling_priority );
ilog("Pulse Period: %i msec\n", hbsInv.hbs_pulse_period );
ilog("Minor Thld: %i misses\n", hbsInv.hbs_minor_threshold );
ilog("Degrade Thld: %i misses\n", hbsInv.hbs_degrade_threshold );
ilog("Failure Thld: %i misses\n", hbsInv.hbs_failure_threshold );
ilog("Multicast: %s\n", hbs_config.multicast );
ilog("Multicast : %s\n", hbs_config.multicast );
hbs_config.mgmnt_iface = daemon_get_iface_master ( hbs_config.mgmnt_iface );
ilog("Mgmnt iface : %s\n", hbs_config.mgmnt_iface );
@ -1014,12 +1024,19 @@ int daemon_init ( string iface, string nodetype )
/* Not used by this service */
UNUSED(nodetype);
/* Initialize socket construct and pointer to it */
memset ( &hbs_sock, 0, sizeof(hbs_sock));
MEMSET_ZERO ( hbs_sock );
/* Initialize the hbs control struct */
MEMSET_ZERO ( hbs_ctrl );
/* initialize the timer */
mtcTimer_init ( hbsTimer, "controller", "heartbeat" );
/* start with no inventory */
hostname_inventory.clear();
/* Assign interface to config */
hbs_config.mgmnt_iface = (char*)iface.data() ;
@ -1032,8 +1049,8 @@ int daemon_init ( string iface, string nodetype )
hbsInv.system_type = daemon_system_type ();
/* convert node type to integer */
my_nodetype = get_host_function_mask ( nodetype ) ;
ilog ("Node Type : %s (%d)\n", nodetype.c_str(), my_nodetype );
hbs_ctrl.nodetype = get_host_function_mask ( nodetype ) ;
ilog ("Node Type : %s (%d)\n", nodetype.c_str(), hbs_ctrl.nodetype );
/* Bind signal handlers */
if ( daemon_signal_init () != PASS )
@ -1134,7 +1151,7 @@ void daemon_service_run ( void )
/* CGTS 4114: Small Footprint: Alarm 200.005 remains active after connectivity restored
*
* Clear self alarms */
hbsAlarm_clear_all ( hbsInv.my_hostname );
hbsAlarm_clear_all ( hbsInv.my_hostname, hbsInv.infra_network_provisioned );
/* add this host as inventory to hbsAgent
* Although this host is not monitored for heartbeat,
@ -1254,6 +1271,29 @@ void daemon_service_run ( void )
}
}
/* audit for forced alarms clear due to ...
*
* 1. heartbeat failure action being set to none
* 2. ... future
*
*/
if ( hbs_ctrl.clear_alarms == true )
{
if ( goenabled == true )
{
std::list<string>::iterator hostname_ptr ;
ilog ("clearing all heartbeat alarms for all hosts due to 'none' action");
for ( hostname_ptr = hostname_inventory.begin();
hostname_ptr != hostname_inventory.end() ;
hostname_ptr++ )
{
hbsAlarm_clear_all ( hostname_ptr->data(), hbsInv.infra_network_provisioned );
hbsInv.manage_heartbeat_clear ( hostname_ptr->data(), MAX_IFACES );
}
hbs_ctrl.clear_alarms = false ;
}
}
/***************** Service Sockets ********************/
/* Initialize the master fd_set and clear socket list */
@ -1356,10 +1396,15 @@ void daemon_service_run ( void )
inv.name = hostname ;
inv.nodetype = msg.parm[0];
hbsInv.add_heartbeat_host ( inv ) ;
hostname_inventory.push_back ( hostname );
ilog ("%s added to heartbeat service (%d)\n", hostname.c_str(), inv.nodetype );
/* clear any outstanding alarms on the ADD */
hbsAlarm_clear_all ( hostname );
if ( hbsInv.hbs_failure_action != HBS_FAILURE_ACTION__NONE )
{
hbsAlarm_clear_all ( hostname,
hbsInv.infra_network_provisioned );
}
}
else if ( msg.cmd == MTC_CMD_DEL_HOST )
{
@ -1367,12 +1412,16 @@ void daemon_service_run ( void )
{
hbsInv.mon_host ( hostname, (iface_enum)iface, false, false );
}
hostname_inventory.remove ( hostname );
hbsInv.del_host ( hostname );
ilog ("%s deleted from heartbeat service\n", hostname.c_str());
/* clear any outstanding alarms on the DEL */
hbsAlarm_clear_all ( hostname );
if ( hbsInv.hbs_failure_action != HBS_FAILURE_ACTION__NONE )
{
hbsAlarm_clear_all ( hostname,
hbsInv.infra_network_provisioned );
}
}
else if ( msg.cmd == MTC_CMD_STOP_HOST )
{
@ -1484,6 +1533,13 @@ void daemon_service_run ( void )
counter = 1 ;
}
else if ( hbsInv.hbs_failure_action == HBS_FAILURE_ACTION__NONE )
{
wlog_throttled (counter, 100000, "Heartbeat disabled by 'none' action\n");
usleep (50000) ;
continue ;
}
/* Send a log indicating the main loop has recognized
* a state change to enable */
else if (( hbsInv.hbs_state_change == true ) &&

View File

@ -31,10 +31,11 @@ using namespace std;
#include "hbsAlarm.h" /* for ... this module header */
#include "alarm.h" /* for ... alarm send message to mtcalarmd */
void hbsAlarm_clear_all ( string hostname )
void hbsAlarm_clear_all ( string hostname, bool infra )
{
alarm_clear ( hostname, MGMNT_HB_ALARM_ID, MGMNT_NAME );
alarm_clear ( hostname, INFRA_HB_ALARM_ID, INFRA_NAME );
if ( infra )
alarm_clear ( hostname, INFRA_HB_ALARM_ID, INFRA_NAME );
alarm_clear ( hostname , PMOND_ALARM_ID, PMON_NAME );
}

View File

@ -27,6 +27,6 @@ using namespace std;
#define INFRA_NAME ((const char *)"Infrastructure")
#define PMON_NAME ((char *)"pmond")
void hbsAlarm_clear_all ( string hostname );
void hbsAlarm_clear_all ( string hostname, bool infra );
#endif /* __HBSALARM_H__ */

View File

@ -56,6 +56,13 @@ const char rsp_msg_header [HBS_HEADER_SIZE+1] = {"cgts pulse rsp:"};
#define HBS_MAX_MSG (HBS_HEADER_SIZE+MAX_CHARS_HOSTNAME)
/* Heartbeat control structure */
typedef struct
{
unsigned int nodetype ;
bool clear_alarms ;
} hbs_ctrl_type ;
/* A heartbeat service message
* if this structure is changed then
* hbs_pulse_request needs to be looked at

View File

@ -359,6 +359,7 @@ int mtcAlarm_critical_log ( string hostname, mtc_alarm_id_enum id ) { UNUSED(ho
int mtcAlarm_major_log ( string hostname, mtc_alarm_id_enum id ) { UNUSED(hostname); id = id ; return (PASS); }
int mtcAlarm_minor_log ( string hostname, mtc_alarm_id_enum id ) { UNUSED(hostname); id = id ; return (PASS); }
int mtcAlarm_warning_log ( string hostname, mtc_alarm_id_enum id ) { UNUSED(hostname); id = id ; return (PASS); }
int mtcAlarm_log ( string hostname, mtc_alarm_id_enum id ) { UNUSED(hostname); id = id ; return (PASS); }
int mtcAlarm_log ( string hostname, mtc_alarm_id_enum id, string str )
{ UNUSED(hostname); id = id ; UNUSED(str) ; return (PASS); }
string mtcAlarm_getId_str ( mtc_alarm_id_enum id ) { id = id ; return ("stub"); }

View File

@ -128,7 +128,7 @@ void mtcAlarm_init ( void )
"If manual or auto-recovery is consistently unable to recover host to the unlocked-enabled "
"state contact next level of support or lock and replace failing Host.");
/** Board Management Controller Access Alarm ************************************/
/** Init Board Management Controller Access Alarm Entry ******************/
ptr = &alarm_list[MTC_ALARM_ID__BM];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
@ -154,7 +154,7 @@ void mtcAlarm_init ( void )
snprintf( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH,
"Check Host's board management config and connectivity.");
/** Controller Failure Alarm ****************************************************/
/** Init Controller Failure Alarm Entry **********************************/
ptr = &alarm_list[MTC_ALARM_ID__CH_CONT];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
@ -184,7 +184,7 @@ void mtcAlarm_init ( void )
"running on this host. If lock action fails then contact next level "
"of support to investigate and recover.");
/** Compute Failure Alarm ****************************************************/
/** Init Compute Failure Alarm Entry *************************************/
ptr = &alarm_list[MTC_ALARM_ID__CH_COMP];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
@ -213,7 +213,7 @@ void mtcAlarm_init ( void )
"and Switch Activity (Swact) to it as soon as possible. If the alarm "
"persists then Lock/Unlock host to recover its local compute service.");
/** Add Event Log ****************************************************/
/** Init Event Log Entry *************************************************/
ptr = &alarm_list[MTC_LOG_ID__EVENT];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
@ -236,6 +236,103 @@ void mtcAlarm_init ( void )
ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; /* Dynamic */
snprintf ( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, "%s", "");
/** Init Command Log Entry ***********************************************/
ptr = &alarm_list[MTC_LOG_ID__COMMAND];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", COMMAND_LOG_ID);
ptr->name = "Maintenance Command" ;
ptr->minor_reason =
ptr->major_reason =
ptr->critl_reason =
ptr->clear_reason = "";
ptr->alarm.alarm_type = FM_ALARM_TYPE_UNKNOWN ;
ptr->alarm.probable_cause = FM_ALARM_CAUSE_UNKNOWN ;
ptr->alarm.inhibit_alarms = FM_FALSE ;
ptr->alarm.service_affecting = FM_FALSE ;
ptr->alarm.suppression = FM_FALSE ;
ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; /* Dynamic */
snprintf ( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, "%s", "");
/** Init Config Log Entry ***********************************************/
ptr = &alarm_list[MTC_LOG_ID__CONFIG];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", CONFIG_LOG_ID);
ptr->name = "Maintenance Config" ;
ptr->minor_reason =
ptr->major_reason =
ptr->critl_reason =
ptr->clear_reason = "";
ptr->alarm.alarm_type = FM_ALARM_TYPE_UNKNOWN ;
ptr->alarm.probable_cause = FM_ALARM_CAUSE_UNKNOWN ;
ptr->alarm.inhibit_alarms = FM_FALSE ;
ptr->alarm.service_affecting = FM_FALSE ;
ptr->alarm.suppression = FM_FALSE ;
ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; /* Dynamic */
snprintf ( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, "%s", "");
/** Init State Change Log Entry ******************************************/
ptr = &alarm_list[MTC_LOG_ID__STATECHANGE];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", STATECHANGE_LOG_ID);
ptr->name = "Maintenance State Change" ;
ptr->minor_reason =
ptr->major_reason =
ptr->critl_reason =
ptr->clear_reason = "";
ptr->alarm.alarm_type = FM_ALARM_TYPE_UNKNOWN ;
ptr->alarm.probable_cause = FM_ALARM_CAUSE_UNKNOWN ;
ptr->alarm.inhibit_alarms = FM_FALSE ;
ptr->alarm.service_affecting = FM_FALSE ;
ptr->alarm.suppression = FM_FALSE ;
ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; /* Dynamic */
snprintf ( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, "%s", "");
/** Init Service Status Log Entry ****************************************/
ptr = &alarm_list[MTC_LOG_ID__SERVICESTATUS];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", SERVICESTATUS_LOG_ID);
ptr->name = "Maintenance Service Status Change" ;
ptr->minor_reason =
ptr->major_reason =
ptr->critl_reason =
ptr->clear_reason = "";
ptr->alarm.alarm_type = FM_ALARM_TYPE_UNKNOWN ;
ptr->alarm.probable_cause = FM_ALARM_CAUSE_UNKNOWN ;
ptr->alarm.inhibit_alarms = FM_FALSE ;
ptr->alarm.service_affecting = FM_FALSE ;
ptr->alarm.suppression = FM_FALSE ;
ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; /* Dynamic */
snprintf ( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, "%s", "");
}
string _getIdentity ( mtc_alarm_id_enum id )
@ -251,6 +348,7 @@ string _getIdentity ( mtc_alarm_id_enum id )
case MTC_LOG_ID__EVENT: return (EVENT_LOG_ID);
case MTC_LOG_ID__COMMAND: return (COMMAND_LOG_ID);
case MTC_LOG_ID__STATECHANGE: return (STATECHANGE_LOG_ID);
case MTC_LOG_ID__CONFIG: return (CONFIG_LOG_ID);
default: return ("200.000");
}
}
@ -493,7 +591,7 @@ int mtcAlarm_warning_log ( string hostname, mtc_alarm_id_enum id )
}
/** Create a neutral customer log */
int mtcAlarm_log ( string hostname, mtc_alarm_id_enum id )
int mtcAlarm_log ( string hostname, mtc_alarm_id_enum id, string str )
{
if ( id < MTC_ALARM_ID__END )
{
@ -750,6 +848,39 @@ int mtcAlarm_log ( string hostname, mtc_alarm_id_enum id )
"board management controller has been 're-provisioned'" );
found = true ;
}
else if (( id == MTC_LOG_ID__CONFIG_HB_ACTION_FAIL ) ||
( id == MTC_LOG_ID__CONFIG_HB_ACTION_DEGRADE ) ||
( id == MTC_LOG_ID__CONFIG_HB_ACTION_ALARM ) ||
( id == MTC_LOG_ID__CONFIG_HB_ACTION_NONE ))
{
alarm_list[index].instc_prefix = "config=heartbeat_failure_action" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s %s",
hostname.data(),
"platform maintenance service parameter 'heartbeat failure action' changed from",
str.data());
found = true ;
}
else if ( id == MTC_LOG_ID__CONFIG_MNFA_TIMEOUT )
{
alarm_list[index].instc_prefix = "config=mnfa_timeout" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s %s",
hostname.data(),
"platform maintenance service parameter 'mnfa_timeout' changed from",
str.data());
found = true ;
}
else if ( id == MTC_LOG_ID__CONFIG_MNFA_THRESHOLD )
{
alarm_list[index].instc_prefix = "config=mnfa_threshold" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s %s",
hostname.data(),
"platform maintenance service parameter 'mnfa_threshold' changed from",
str.data());
found = true ;
}
if ( found == true )
{
@ -758,11 +889,6 @@ int mtcAlarm_log ( string hostname, mtc_alarm_id_enum id )
string identity = _getIdentity(index);
string instance = _getInstance(index);
instance.append(alarm_list[index].instc_prefix);
//wlog ("%s '%s' log (%s.%s)\n",
// hostname.c_str(),
// alarm_list[index].alarm.reason_text,
// identity.c_str(),
// instance.c_str());
/* Want to make this log a critical */
if ( id == MTC_LOG_ID__STATUSCHANGE_REINSTALL_FAILED )

View File

@ -25,52 +25,63 @@ using namespace std;
/** Maintenance Alarm Abstract Reference IDs */
typedef enum
{
MTC_ALARM_ID__LOCK = 0,
MTC_ALARM_ID__CONFIG = 1,
MTC_ALARM_ID__ENABLE = 2,
MTC_ALARM_ID__BM = 3,
MTC_ALARM_ID__CH_CONT = 4, /* Combo Host Controller Failure - with Active Compute */
MTC_ALARM_ID__CH_COMP = 5, /* Combo Host Compute Failure - on last Controller */
MTC_ALARM_ID__LOCK,
MTC_ALARM_ID__CONFIG,
MTC_ALARM_ID__ENABLE,
MTC_ALARM_ID__BM,
MTC_ALARM_ID__CH_CONT, /* Combo Host Controller Failure - with Active Compute */
MTC_ALARM_ID__CH_COMP, /* Combo Host Compute Failure - on last Controller */
MTC_LOG_ID__EVENT = 6,
MTC_LOG_ID__COMMAND = 7,
MTC_LOG_ID__STATECHANGE = 8,
MTC_ALARM_ID__LAST = 9,
MTC_LOG_ID__EVENT,
MTC_LOG_ID__COMMAND,
MTC_LOG_ID__CONFIG,
MTC_LOG_ID__STATECHANGE,
MTC_LOG_ID__SERVICESTATUS,
MTC_ALARM_ID__LAST,
MTC_LOG_ID__EVENT_ADD = 10,
MTC_LOG_ID__EVENT_RESTART = 11,
MTC_LOG_ID__EVENT_DISCOVERED = 12,
MTC_LOG_ID__EVENT_MNFA_ENTER = 13,
MTC_LOG_ID__EVENT_MNFA_EXIT = 14,
MTC_LOG_ID__EVENT_ADD,
MTC_LOG_ID__EVENT_RESTART,
MTC_LOG_ID__EVENT_DISCOVERED,
MTC_LOG_ID__EVENT_MNFA_ENTER,
MTC_LOG_ID__EVENT_MNFA_EXIT,
MTC_LOG_ID__COMMAND_DELETE = 19,
MTC_LOG_ID__COMMAND_UNLOCK = 20,
MTC_LOG_ID__COMMAND_FORCE_LOCK = 21,
MTC_LOG_ID__COMMAND_SWACT = 22,
MTC_LOG_ID__COMMAND_REINSTALL = 23,
MTC_LOG_ID__COMMAND_BM_PROVISIONED = 24,
MTC_LOG_ID__COMMAND_BM_DEPROVISIONED = 25,
MTC_LOG_ID__COMMAND_BM_REPROVISIONED = 26,
MTC_LOG_ID__COMMAND_DELETE,
MTC_LOG_ID__COMMAND_UNLOCK,
MTC_LOG_ID__COMMAND_FORCE_LOCK,
MTC_LOG_ID__COMMAND_SWACT,
MTC_LOG_ID__COMMAND_REINSTALL,
MTC_LOG_ID__COMMAND_BM_PROVISIONED,
MTC_LOG_ID__COMMAND_BM_DEPROVISIONED,
MTC_LOG_ID__COMMAND_BM_REPROVISIONED,
MTC_LOG_ID__COMMAND_AUTO_REBOOT = 30,
MTC_LOG_ID__COMMAND_MANUAL_REBOOT = 31,
MTC_LOG_ID__COMMAND_AUTO_RESET = 32,
MTC_LOG_ID__COMMAND_MANUAL_RESET = 33,
MTC_LOG_ID__COMMAND_AUTO_POWER_ON = 34,
MTC_LOG_ID__COMMAND_MANUAL_POWER_ON = 35,
MTC_LOG_ID__COMMAND_AUTO_POWER_OFF = 36,
MTC_LOG_ID__COMMAND_MANUAL_POWER_OFF = 37,
MTC_LOG_ID__CONFIG_HB_ACTION_FAIL,
MTC_LOG_ID__CONFIG_HB_ACTION_DEGRADE,
MTC_LOG_ID__CONFIG_HB_ACTION_ALARM,
MTC_LOG_ID__CONFIG_HB_ACTION_NONE,
MTC_LOG_ID__CONFIG_HB_PERIOD,
MTC_LOG_ID__CONFIG_HB_DEGRADE_THRESHOLD,
MTC_LOG_ID__CONFIG_HB_FAILURE_THRESHOLD,
MTC_LOG_ID__CONFIG_MNFA_TIMEOUT,
MTC_LOG_ID__CONFIG_MNFA_THRESHOLD,
MTC_LOG_ID__STATUSCHANGE_ENABLED = 40,
MTC_LOG_ID__STATUSCHANGE_DISABLED = 41,
MTC_LOG_ID__STATUSCHANGE_ONLINE = 42,
MTC_LOG_ID__STATUSCHANGE_OFFLINE = 43,
MTC_LOG_ID__STATUSCHANGE_FAILED = 44,
MTC_LOG_ID__STATUSCHANGE_REINSTALL_FAILED = 45,
MTC_LOG_ID__STATUSCHANGE_REINSTALL_COMPLETE = 46,
MTC_LOG_ID__COMMAND_AUTO_REBOOT,
MTC_LOG_ID__COMMAND_MANUAL_REBOOT,
MTC_LOG_ID__COMMAND_AUTO_RESET,
MTC_LOG_ID__COMMAND_MANUAL_RESET,
MTC_LOG_ID__COMMAND_AUTO_POWER_ON,
MTC_LOG_ID__COMMAND_MANUAL_POWER_ON,
MTC_LOG_ID__COMMAND_AUTO_POWER_OFF,
MTC_LOG_ID__COMMAND_MANUAL_POWER_OFF,
MTC_ALARM_ID__END = 50
MTC_LOG_ID__STATUSCHANGE_ENABLED,
MTC_LOG_ID__STATUSCHANGE_DISABLED,
MTC_LOG_ID__STATUSCHANGE_ONLINE,
MTC_LOG_ID__STATUSCHANGE_OFFLINE,
MTC_LOG_ID__STATUSCHANGE_FAILED,
MTC_LOG_ID__STATUSCHANGE_REINSTALL_FAILED,
MTC_LOG_ID__STATUSCHANGE_REINSTALL_COMPLETE,
MTC_ALARM_ID__END
} mtc_alarm_id_enum ;
@ -109,6 +120,6 @@ int mtcAlarm_minor_log ( string hostname, mtc_alarm_id_enum id );
int mtcAlarm_warning_log ( string hostname, mtc_alarm_id_enum id );
/** Create a maintenance log */
int mtcAlarm_log ( string hostname, mtc_alarm_id_enum id );
int mtcAlarm_log ( string hostname, mtc_alarm_id_enum id, string str = "");
#endif /* __MTCALARM_H__ */

View File

@ -947,8 +947,12 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
if ( msg.cmd == MTC_EVENT_HEARTBEAT_DEGRADE_SET )
{
/* Assert the degrade condition with the 'false' (i.e. not clear)*/
obj_ptr->manage_heartbeat_degrade ( hostname, iface, false );
if (( obj_ptr->hbs_failure_action == HBS_FAILURE_ACTION__FAIL ) ||
( obj_ptr->hbs_failure_action == HBS_FAILURE_ACTION__DEGRADE ))
{
/* Assert the degrade condition with the 'false' (i.e. not clear)*/
obj_ptr->manage_heartbeat_degrade ( hostname, iface, false );
}
}
else
{
@ -985,7 +989,23 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
}
string hostname = &msg.buf[0] ;
print_mtc_message ( hostname, MTC_CMD_RX, msg, get_iface_name_str(MGMNT_INTERFACE), false );
obj_ptr->manage_heartbeat_failure ( hostname, iface, false );
/* If heartbeat failure action is fail then call the fail handler */
if ( obj_ptr->hbs_failure_action == HBS_FAILURE_ACTION__FAIL )
obj_ptr->manage_heartbeat_failure ( hostname, iface, false );
/* If heartbeat failure action is degrade then call the degrade handler */
else if ( obj_ptr->hbs_failure_action == HBS_FAILURE_ACTION__DEGRADE )
obj_ptr->manage_heartbeat_degrade ( hostname, iface, false );
/* Otherwise the action must be alarm only or none ; both of which
* are already handled by the hbsAgent, so do nothing */
else
{
dlog ("%s heartbeat loss event dropped (%s)\n",
hostname.c_str(),
get_iface_name_str(iface));
}
}
}
else if ( msg.cmd == MTC_EVENT_PMOND_CLEAR )

View File

@ -330,15 +330,67 @@ static int mtc_ini_handler ( void * user,
{
UNUSED(user);
if (MATCH("agent", "mnfa_threshold"))
if (MATCH("agent", "heartbeat_failure_action"))
{
string cur_action = "" ;
string new_action = "" ;
/* prevent memory leak over a reconfig */
if ( mtc_config.hbs_failure_action )
{
cur_action = mtc_config.hbs_failure_action ;
free(mtc_config.hbs_failure_action);
}
new_action = mtc_config.hbs_failure_action = strdup(value);
mtcInv.hbs_failure_action = get_hbs_failure_action(mtc_config);
if (( !cur_action.empty() ) && ( cur_action != new_action))
{
mtc_alarm_id_enum alarm_id = MTC_LOG_ID__CONFIG_HB_ACTION_FAIL ;
if ( mtcInv.hbs_failure_action == HBS_FAILURE_ACTION__NONE )
alarm_id = MTC_LOG_ID__CONFIG_HB_ACTION_NONE ;
else if ( mtcInv.hbs_failure_action == HBS_FAILURE_ACTION__ALARM )
alarm_id = MTC_LOG_ID__CONFIG_HB_ACTION_ALARM ;
else if ( mtcInv.hbs_failure_action == HBS_FAILURE_ACTION__DEGRADE )
alarm_id = MTC_LOG_ID__CONFIG_HB_ACTION_DEGRADE ;
/* re-use cur_action to build the action change string from it */
cur_action.append(" to ");
cur_action.append(new_action);
mtcAlarm_log ( mtcInv.my_hostname, alarm_id, cur_action );
}
if (( mtcInv.mnfa_active == true ) &&
(( mtcInv.hbs_failure_action == HBS_FAILURE_ACTION__NONE ) ||
( mtcInv.hbs_failure_action == HBS_FAILURE_ACTION__ALARM )))
{
mtcInv.mnfa_cancel ();
}
}
else if (MATCH("agent", "mnfa_threshold"))
{
int old = mtcInv.mnfa_threshold ;
mtcInv.mnfa_threshold = atoi(value);
if (( old != 0 ) && ( old != mtcInv.mnfa_threshold ))
{
string cur_threshold = "" ;
cur_threshold.append(itos(old));
cur_threshold.append(" to ");
cur_threshold.append(itos(mtcInv.mnfa_threshold));
mtcAlarm_log ( mtcInv.my_hostname, MTC_LOG_ID__CONFIG_MNFA_THRESHOLD, cur_threshold );
}
ilog ("MNFA Threshd: %d\n", mtcInv.mnfa_threshold);
}
else if (MATCH("timeouts", "mnfa_timeout"))
{
int old = mtcInv.mnfa_timeout ;
mtcInv.mnfa_timeout = atoi(value);
if ( old != mtcInv.mnfa_timeout )
{
string cur_timeout = "" ;
cur_timeout.append(itos(old));
cur_timeout.append(" to ");
cur_timeout.append(itos(mtcInv.mnfa_timeout));
mtcAlarm_log ( mtcInv.my_hostname, MTC_LOG_ID__CONFIG_MNFA_TIMEOUT, cur_timeout );
}
if ( mtcInv.mnfa_timeout == 0 )
{
ilog ("MNFA Timeout: Never\n");

View File

@ -526,7 +526,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->cmdRsp_status = 0 ;
/* Raise Critical Enable Alarm */
alarm_enabled_failure ( node_ptr );
alarm_enabled_failure ( node_ptr, true );
/* Handle active controller failures */
if ( THIS_HOST )
@ -774,7 +774,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
}
else
{
alarm_enabled_failure ( node_ptr );
alarm_enabled_failure ( node_ptr , true );
if ( node_ptr->availStatus != MTC_AVAIL_STATUS__FAILED )
{
@ -1095,7 +1095,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
elog ("%s Timeout waiting for MTCALIVE\n", node_ptr->hostname.c_str());
/* raise an alarm for the enable failure */
alarm_enabled_failure ( node_ptr );
alarm_enabled_failure ( node_ptr , true );
/* go back and issue reboot again */
enableStageChange ( node_ptr, MTC_ENABLE__RESET_PROGRESSION );
@ -1190,7 +1190,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->mtcTimer.ring = false ;
/* raise an alarm for the enable failure */
alarm_enabled_failure ( node_ptr );
alarm_enabled_failure ( node_ptr , true );
/* go back and issue reboot again */
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
@ -1309,18 +1309,29 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
mtcTimer_reset ( node_ptr->mtcTimer );
}
plog ("%s Starting %d sec Heartbeat Soak (with%s)\n",
node_ptr->hostname.c_str(),
MTC_HEARTBEAT_SOAK_BEFORE_ENABLE,
node_ptr->hbsClient_ready ? " ready event" : "out ready event" );
/* Start Monitoring Services - heartbeat, process and hardware */
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
/* allow heartbeat to run for 10 seconds before we declare enable */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE );
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_SOAK );
if ( this->hbs_failure_action == HBS_FAILURE_ACTION__NONE )
{
/* Skip over the heartbeat soak if the failure handling is
* 'none' because in that case heartbeating is disabled and
* the soak would just be a waste of startup time. */
enableStageChange ( node_ptr, MTC_ENABLE__STATE_CHANGE );
}
else
{
plog ("%s Starting %d sec Heartbeat Soak (with%s)\n",
node_ptr->hostname.c_str(),
MTC_HEARTBEAT_SOAK_BEFORE_ENABLE,
node_ptr->hbsClient_ready ? " ready event" : "out ready event" );
/* allow heartbeat to run for MTC_HEARTBEAT_SOAK_BEFORE_ENABLE
* seconds before we declare enable */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE );
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_SOAK );
}
break ;
}
case MTC_ENABLE__HEARTBEAT_SOAK:
@ -1524,6 +1535,15 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
case MTC_RECOVERY__START:
{
if ( this->hbs_failure_action != HBS_FAILURE_ACTION__FAIL )
{
wlog ("%s heartbeat failure recovery action is not fail\n",
node_ptr->hostname.c_str());
mtcInvApi_update_task ( node_ptr, "" );
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
break ;
}
/* Purge this hosts work queues */
mtcCmd_workQ_purge ( node_ptr );
mtcCmd_doneQ_purge ( node_ptr );
@ -1690,7 +1710,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
/* Go to the goEnabled stage */
recoveryStageChange ( node_ptr, MTC_RECOVERY__GOENABLED_TIMER );
alarm_enabled_failure(node_ptr);
alarm_enabled_failure(node_ptr, true );
break ;
}
}
@ -1728,7 +1748,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
/* Go to the goEnabled stage */
recoveryStageChange ( node_ptr, MTC_RECOVERY__GOENABLED_TIMER );
alarm_enabled_failure (node_ptr);
alarm_enabled_failure (node_ptr, true );
}
}
/* A timer ring indicates that the host is not up */
@ -1772,7 +1792,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
/* Inform the VIM that this host has failed */
mtcVimApi_state_change ( node_ptr, VIM_HOST_FAILED, 3 );
alarm_enabled_failure(node_ptr);
alarm_enabled_failure(node_ptr, true );
/* Clear all degrade flags except for the HWMON one */
clear_host_degrade_causes ( node_ptr->degrade_mask );
@ -2351,21 +2371,31 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
mtcTimer_reset ( node_ptr->mtcTimer );
}
plog ("%s Starting %d sec Heartbeat Soak (with%s)\n",
node_ptr->hostname.c_str(),
MTC_HEARTBEAT_SOAK_BEFORE_ENABLE,
node_ptr->hbsClient_ready ? " ready event" : "out ready event" );
/* Enable the heartbeat service for Graceful Recovery */
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
/* allow heartbeat to run for 10 seconds before we declare enable */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE );
if ( this->hbs_failure_action == HBS_FAILURE_ACTION__NONE )
{
/* Skip over the heartbeat soak if the failure handling is
* 'none' because in that case heartbeating is disabled and
* the soak would just be a waste of recovery time. */
recoveryStageChange ( node_ptr, MTC_RECOVERY__STATE_CHANGE );
}
else
{
plog ("%s Starting %d sec Heartbeat Soak (with%s)\n",
node_ptr->hostname.c_str(),
MTC_HEARTBEAT_SOAK_BEFORE_ENABLE,
node_ptr->hbsClient_ready ? " ready event" : "out ready event" );
/* if heartbeat is not working then we will
* never get here and enable the host */
recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_SOAK );
/* allow heartbeat to run for 10 seconds before we declare enable */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE );
/* if heartbeat is not working then we will
* never get here and enable the host */
recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_SOAK );
}
break ;
}
case MTC_RECOVERY__HEARTBEAT_SOAK:
@ -4667,7 +4697,7 @@ int nodeLinkClass::powercycle_handler ( struct nodeLinkClass::node * node_ptr )
if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
{
ilog ("%s failing host for powercycle\n", node_ptr->hostname.c_str() );
alarm_enabled_failure ( node_ptr );
alarm_enabled_failure ( node_ptr , true );
/* Set node as unlocked-disabled-failed */
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,

View File

@ -31,14 +31,45 @@ void log_mnfa_pool ( std::list<string> & mnfa_awol_list )
{
std::list<string>::iterator mnfa_awol_ptr ;
string pool_list = "" ;
if ( mnfa_awol_list.size() )
{
for ( mnfa_awol_ptr = mnfa_awol_list.begin() ;
mnfa_awol_ptr != mnfa_awol_list.end() ;
mnfa_awol_ptr++ )
{
pool_list.append (" ");
pool_list.append (mnfa_awol_ptr->data());
}
ilog ("MNFA POOL:%s\n", pool_list.c_str());
}
}
/*****************************************************************************
*
* Name : add_host_to_awol_list
*
* Description: Add a hostname to the awol list if it is not already in the list
*
* Returns : true if added
* false if not added because it is already in the list.
*
*****************************************************************************/
static bool add_host_to_awol_list ( string hostname, std::list<string> & mnfa_awol_list )
{
std::list<string>::iterator mnfa_awol_ptr ;
for ( mnfa_awol_ptr = mnfa_awol_list.begin() ;
mnfa_awol_ptr != mnfa_awol_list.end() ;
mnfa_awol_ptr++ )
{
pool_list.append (" ");
pool_list.append (mnfa_awol_ptr->data());
if ( *(mnfa_awol_ptr) == hostname )
{
/* already in list */
return false ;
}
}
ilog ("MNFA POOL:%s\n", pool_list.c_str());
mnfa_awol_list.push_back(hostname);
return true ;
}
/*****************************************************************************
@ -51,6 +82,14 @@ void log_mnfa_pool ( std::list<string> & mnfa_awol_list )
*****************************************************************************/
void nodeLinkClass::mnfa_add_host ( struct nodeLinkClass::node * node_ptr , iface_enum iface )
{
if (( this->hbs_failure_action == HBS_FAILURE_ACTION__ALARM ) ||
( this->hbs_failure_action == HBS_FAILURE_ACTION__NONE ))
{
/* Do nothing for the 'alarm only' or 'none' action.
* Alarming is handled by the hbsAgent already */
return ;
}
if ( node_ptr->hbs_minor[iface] == false )
{
bool enter = false ;
@ -63,15 +102,12 @@ void nodeLinkClass::mnfa_add_host ( struct nodeLinkClass::node * node_ptr , ifac
/* if we are active then add the node to the awol list */
if ( mnfa_active == true )
{
alarm_enabled_failure (node_ptr);
/* once we are mnfa_active we need to give all the
* hbs_minor=true hosts a graceful recovery token
* mnfa_graceful_recovery = true and add to the awol list */
node_ptr->mnfa_graceful_recovery = true ;
added = true ;
mnfa_awol_list.push_back(node_ptr->hostname);
mnfa_awol_list.unique();
add_host_to_awol_list (node_ptr->hostname, mnfa_awol_list );
if ( node_ptr->task != MTC_TASK_RECOVERY_WAIT )
mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_WAIT );
}
@ -94,10 +130,7 @@ void nodeLinkClass::mnfa_add_host ( struct nodeLinkClass::node * node_ptr , ifac
get_iface_name_str(INFRA_IFACE),
node_ptr->hbs_minor_count[INFRA_IFACE]);
if ( mnfa_awol_list.size() )
{
log_mnfa_pool ( mnfa_awol_list );
}
log_mnfa_pool ( mnfa_awol_list );
if ( enter == true )
{
@ -191,28 +224,20 @@ void nodeLinkClass::mnfa_enter ( void )
* recovery token mnfa_graceful_recovery = true
* basically a get out of double reset free card */
ptr->mnfa_graceful_recovery = true ;
mnfa_awol_list.push_back(ptr->hostname);
add_host_to_awol_list (ptr->hostname, mnfa_awol_list );
if ( ptr->task != MTC_TASK_RECOVERY_WAIT )
mtcInvApi_update_task ( ptr, MTC_TASK_RECOVERY_WAIT );
alarm_enabled_failure (ptr);
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
mnfa_awol_list.unique();
if ( this->mnfa_timeout )
{
wlog ("MNFA Auto-Recovery in %d seconds\n", this->mnfa_timeout);
mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, this->mnfa_timeout);
}
if ( mnfa_awol_list.size() )
{
log_mnfa_pool ( mnfa_awol_list );
}
log_mnfa_pool ( mnfa_awol_list );
}
/****************************************************************************
@ -262,10 +287,7 @@ void nodeLinkClass::mnfa_exit ( bool force )
force ? "(Auto-Recover)" : "");
mtcAlarm_log ( active_controller_hostname , MTC_LOG_ID__EVENT_MNFA_EXIT );
if ( mnfa_awol_list.size() )
{
log_mnfa_pool ( mnfa_awol_list );
}
log_mnfa_pool ( mnfa_awol_list );
/* Loop through inventory and recover each host that
* remains in the hbs_minor state.
@ -329,3 +351,44 @@ void nodeLinkClass::mnfa_exit ( bool force )
mnfa_host_count[INFRA_IFACE] = 0 ;
mnfa_awol_list.clear();
}
/****************************************************************************
*
* Name : mnfa_cancel
*
* Description: Cancel MNFA if it is active.
*
****************************************************************************/
void nodeLinkClass::mnfa_cancel ( void )
{
if ( this->mnfa_active )
{
wlog ("MNFA CANCEL --> Cancelling Multi-Node Failure Avoidance\n");
mtcTimer_reset ( this->mtcTimer_mnfa );
/* Loop through MNFA Pool.
* Clear MNFA attributes from hosts in the pool. */
std::list<string>::iterator mnfa_awol_ptr ;
for ( mnfa_awol_ptr = mnfa_awol_list.begin() ;
mnfa_awol_ptr != mnfa_awol_list.end() ;
mnfa_awol_ptr++ )
{
struct node * node_ptr = nodeLinkClass::getNode ( *(mnfa_awol_ptr) );
if ( node_ptr != NULL )
{
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_INFRA ;
node_ptr->hbs_minor[INFRA_IFACE] = false ;
node_ptr->hbs_minor[MGMNT_IFACE] = false ;
node_ptr->mnfa_graceful_recovery = false ;
mtcInvApi_update_task ( node_ptr, "" );
}
}
send_hbs_command ( this->my_hostname, MTC_RECOVER_HBS );
this->mnfa_host_count[MGMNT_IFACE] = 0 ;
this->mnfa_host_count[INFRA_IFACE] = 0 ;
this->mnfa_active = false ;
}
mnfa_awol_list.clear();
}

View File

@ -409,17 +409,24 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
mtcTimer_reset ( node_ptr->mtcTimer );
}
plog ("%s Starting %d sec Heartbeat Soak (with%s)\n",
/* Start Monitoring heartbeat */
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
if ( this->hbs_failure_action == HBS_FAILURE_ACTION__NONE )
{
enableStageChange ( node_ptr, MTC_ENABLE__STATE_CHANGE );
}
else
{
plog ("%s Starting %d sec Heartbeat Soak (with%s)\n",
name.c_str(),
MTC_HEARTBEAT_SOAK_BEFORE_ENABLE,
node_ptr->hbsClient_ready ? " ready event" : "out ready event" );
/* Start Monitoring Services - heartbeat, process and hardware */
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
/* allow heartbeat to run for 10 seconds before we declare enable */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE );
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_SOAK );
/* allow heartbeat to run for 10 seconds before we declare enable */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE );
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_SOAK );
}
break ;
}
case MTC_ENABLE__HEARTBEAT_SOAK:

View File

@ -7,12 +7,6 @@ hbs_minor_threshold = 4 ; Heartbeat minor threshold count.
; heartbeat misses that result in a
; minor notification to maintenance.
hbs_calibrate_threshold = 7 ; number of hosts before calibration kicks in
hbs_calibrate_period_factor = 200 ; x for each host over hbs_calibrate_threshold
hbs_calibrate_minor_factor = 20 ; x for each host over hbs_calibrate_threshold
hbs_calibrate_degrade_factor = 21 ; x for each host over hbs_calibrate_threshold
hbs_calibrate_fail_factor = 30 ; x for each host over hbs_calibrate_threshold
offline_period = 100 ; number of msecs to wait for each offline audit
offline_threshold = 46 ; number of back to back mtcAlive requests missed
; 100:46 will yield a typical 5 sec holdoff from