Merge "Mtce: Make Heartbeat Failure Action Configurable"

This commit is contained in:
Zuul 2018-09-11 13:50:31 +00:00 committed by Gerrit Code Review
commit 31c4beff75
20 changed files with 693 additions and 194 deletions

View File

@ -1,3 +1,3 @@
SRC_DIR="cgts-mtce-common-1.0"
TIS_PATCH_VER=136
TIS_PATCH_VER=137
BUILD_IS_SLOW=5

View File

@ -47,6 +47,7 @@
#define COMMAND_LOG_ID ((const char *)"200.021")
#define STATECHANGE_LOG_ID ((const char *)"200.022")
#define SERVICESTATUS_LOG_ID ((const char *)"200.023") /* log used to report service failure events against */
#define CONFIG_LOG_ID ((const char *)"200.024")
/**
* TODO: This class is more of a place holder for

View File

@ -44,12 +44,7 @@ typedef struct
int hbs_minor_threshold ; /**< heartbeat miss minor threshold */
int hbs_degrade_threshold ; /**< heartbeat miss degrade threshold */
int hbs_failure_threshold ; /**< heartbeat miss failure threshold */
int hbs_calibrate_threshold ; /**< number of hosts where threshold calibration begins to take effect */
int hbs_calibrate_period_factor ; /**< hbs_pulse_period = hbs_pulse_period * hosts */
int hbs_calibrate_minor_factor ; /**< hbs_minor_threshold = threshold factor * hosts */
int hbs_calibrate_degrade_factor; /**< hbs_degrade_threshold = threshold factor * hosts */
int hbs_calibrate_fail_factor ; /**< hbs_failure_threshold = threshold factor * hosts */
char* hbs_failure_action ; /**< action to take on host heartbeat falure*/
char* mgmnt_iface ; /**< management interface name pointer */
char* infra_iface ; /**< infrastructure interface name pointer */

View File

@ -258,7 +258,7 @@ nodeLinkClass::nodeLinkClass()
hbs_minor_threshold = HBS_MINOR_THRESHOLD ;
hbs_degrade_threshold = HBS_DEGRADE_THRESHOLD ;
hbs_failure_threshold = HBS_FAILURE_THRESHOLD ;
hbs_failure_action = HBS_FAILURE_ACTION__FAIL ;
hbs_silent_fault_detector = 0 ;
hbs_silent_fault_logged = false ;
@ -653,14 +653,14 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
ptr->vimEvent.buf = NULL ;
ptr->httpReq.buf = NULL ;
/* log throttles */
ptr->stall_recovery_log_throttle = 0 ;
ptr->stall_monitor_log_throttle = 0 ;
ptr->unexpected_pulse_log_throttle = 0 ;
ptr->lookup_mismatch_log_throttle = 0 ;
ptr->log_throttle = 0 ;
ptr->no_work_log_throttle = 0 ;
ptr->no_rri_log_throttle = 0 ;
ptr->degrade_mask = ptr->degrade_mask_save = DEGRADE_MASK_NONE ;
@ -1615,13 +1615,15 @@ int nodeLinkClass::alarm_config_clear ( struct nodeLinkClass::node * node_ptr )
}
/* Generate a log and a critical alarm if the node enable failed */
int nodeLinkClass::alarm_enabled_failure ( struct nodeLinkClass::node * node_ptr )
int nodeLinkClass::alarm_enabled_failure ( struct nodeLinkClass::node * node_ptr, bool want_degrade )
{
if ( (node_ptr->degrade_mask & DEGRADE_MASK_ENABLE) == 0 )
if ( want_degrade )
{
node_ptr->degrade_mask |= DEGRADE_MASK_ENABLE ;
if ( (node_ptr->degrade_mask & DEGRADE_MASK_ENABLE) == 0 )
{
node_ptr->degrade_mask |= DEGRADE_MASK_ENABLE ;
}
}
if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL )
{
elog ("%s critical enable failure\n", node_ptr->hostname.c_str());
@ -4466,7 +4468,10 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
}
else
{
alarm_enabled_failure (node_ptr);
//bool want_degrade = true ;
//if ( this->hbs_failure_action == HBS_FAILURE_ACTION__ALARM )
// want_degrade = false ;
// alarm_enabled_failure (node_ptr, want_degrade);
mnfa_add_host ( node_ptr , iface );
@ -4487,8 +4492,6 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
nodeLinkClass::set_availStatus ( hostname, MTC_AVAIL_STATUS__FAILED );
alarm_enabled_failure (node_ptr);
if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE ) &&
( node_ptr->adminAction != MTC_ADMIN_ACTION__UNLOCK ))
{
@ -4526,11 +4529,31 @@ void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface )
for ( int i = 0 ; i < MAX_IFACES ; i++ )
{
node_ptr->heartbeat_failed[i] = false ;
if ( i == MGMNT_IFACE )
{
node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
}
if ( i == INFRA_IFACE )
{
node_ptr->alarms[HBS_ALARM_ID__HB_INFRA] = FM_ALARM_SEVERITY_CLEAR ;
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_INFRA ;
}
}
}
else
{
node_ptr->heartbeat_failed[iface] = false ;
if ( iface == MGMNT_IFACE )
{
node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
}
else if ( iface == INFRA_IFACE )
{
node_ptr->alarms[HBS_ALARM_ID__HB_INFRA] = FM_ALARM_SEVERITY_CLEAR ;
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_INFRA ;
}
}
}
@ -4576,7 +4599,7 @@ void nodeLinkClass::manage_heartbeat_degrade ( string hostname, iface_enum iface
}
mnfa_add_host ( node_ptr, iface );
if ( nodeLinkClass::get_operState ( hostname ) == MTC_OPER_STATE__ENABLED )
{
if ( iface == MGMNT_IFACE )
@ -7074,7 +7097,7 @@ void nodeLinkClass::manage_autorecovery ( struct nodeLinkClass::node * node_ptr
}
else
{
alarm_enabled_failure ( node_ptr ) ;
alarm_enabled_failure ( node_ptr , true ) ;
}
allStateChange ( node_ptr, node_ptr->adminState,
@ -7155,7 +7178,7 @@ void nodeLinkClass::force_full_enable ( struct nodeLinkClass::node * node_ptr )
plog ("%s Forcing Full Enable Sequence\n", node_ptr->hostname.c_str());
/* Raise Critical Enable Alarm */
alarm_enabled_failure ( node_ptr );
alarm_enabled_failure ( node_ptr, true );
allStateChange ( node_ptr, node_ptr->adminState, MTC_OPER_STATE__DISABLED, MTC_AVAIL_STATUS__FAILED );
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
@ -7359,7 +7382,18 @@ bool nodeLinkClass::get_hbs_monitor_state ( string & hostname, int iface )
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
int rri_max = this->hosts ;
state = node_ptr->monitor[iface] ;
if ( state == true )
{
wlog_throttled (node_ptr->no_rri_log_throttle, rri_max,
"%s Not Offering RRI (%d)\n",
hostname.c_str(), this->hosts );
}
else
{
node_ptr->no_rri_log_throttle = 0 ;
}
}
}
return (state);
@ -7539,6 +7573,31 @@ int nodeLinkClass::create_pulse_list ( iface_enum iface )
return (pulses[iface]);
}
/** Clear heartbeat stats in support of failed heartbeat restart.
 *
 * Walks the node list from 'head' and, for every interface of every
 * node, zeroes the heartbeat counters and drops the minor / degrade /
 * failure state flags. The walk ends at the first node that is either
 * the 'tail' or has no successor. */
void nodeLinkClass::hbs_clear_all_stats ( void )
{
    ilog ("clearing all hearbeat stats\n");

    struct node * node_ptr = head ;
    while ( node_ptr != NULL )
    {
        for ( int i = 0 ; i < MAX_IFACES ; ++i )
        {
            /* running counters */
            node_ptr->max_count[i]         = 0 ;
            node_ptr->hbs_count[i]         = 0 ;
            node_ptr->hbs_misses_count[i]  = 0 ;
            node_ptr->b2b_pulses_count[i]  = 0 ;
            node_ptr->b2b_misses_count[i]  = 0 ;

            /* threshold crossing counters */
            node_ptr->hbs_minor_count[i]   = 0 ;
            node_ptr->hbs_degrade_count[i] = 0 ;
            node_ptr->hbs_failure_count[i] = 0 ;

            /* threshold and failure state flags */
            node_ptr->hbs_minor[i]         = false ;
            node_ptr->hbs_degrade[i]       = false ;
            node_ptr->hbs_failure[i]       = false ;
            node_ptr->heartbeat_failed[i]  = false ;
        }

        /* last node handled ; do not step past the end of the list */
        if (( node_ptr == tail ) || ( node_ptr->next == NULL ))
            break ;

        node_ptr = node_ptr->next ;
    }
}
/** Build the Resource Reference Array */
void nodeLinkClass::build_rra ( void )
@ -7717,7 +7776,7 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle
* if this interfaces failed and has not yet received the
* required number of back to back pulses needed for recovery */
clear_b2b_misses_count = false ;
ilog ("%s %s heartbeat failure recovery (%d of %d)\n",
dlog ("%s %s heartbeat failure recovery (%d of %d)\n",
node_ptr->hostname.c_str(),
get_iface_name_str(iface),
ptr->b2b_pulses_count[iface],
@ -7870,8 +7929,8 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle
}
/** This utility will try and remove a pulse from the pulse
* linked list first by index and then by hostname.
*
* linked list first by index and then by hostname.
*
* By index does not require a lookup whereas hostname does */
int nodeLinkClass::remove_pulse ( string & hostname, iface_enum iface, int index, unsigned int flags )
{
@ -7889,10 +7948,7 @@ int nodeLinkClass::remove_pulse ( string & hostname, iface_enum iface, int index
{
if ( hostname.compare("localhost") )
{
if ( get_hbs_monitor_state ( hostname , iface ) == true )
{
wlog ("%s Not Offering RRI\n", hostname.c_str());
}
get_hbs_monitor_state ( hostname , iface ) ;
}
else
{
@ -7914,7 +7970,7 @@ void nodeLinkClass::clear_pulse_list ( iface_enum iface )
}
pulse_list[iface].head_ptr = NULL ;
pulse_list[iface].tail_ptr = NULL ;
if ( ptr != NULL )
{
ptr->linknum[iface] = 0 ;
@ -7929,6 +7985,15 @@ void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_p
if ( this->heartbeat != true )
return ;
if ( this->hbs_failure_action == HBS_FAILURE_ACTION__NONE )
{
dlog ("%s dropping heartbeat alarm request (%s:%s) ; action none\n",
node_ptr->hostname.c_str(),
alarmUtil_getSev_str(sev).c_str(),
get_iface_name_str(iface) );
return ;
}
bool make_alarm_call = false ;
alarm_id_enum id ;
EFmAlarmStateT state = FM_ALARM_STATE_SET ;
@ -8025,7 +8090,7 @@ int nodeLinkClass::lost_pulses ( iface_enum iface )
{
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
{
ilog ("%-13s %s Pulse Miss (%d) (log throttled to every %d)\n",
ilog ("%s %s Pulse Miss (%d) (log throttled to every %d)\n",
pulse_ptr->hostname.c_str(),
get_iface_name_str(iface),
pulse_ptr->b2b_misses_count[iface],
@ -8034,7 +8099,7 @@ int nodeLinkClass::lost_pulses ( iface_enum iface )
/* Once the misses exceed 25 then throttle the logging to avoid flooding */
if ( (pulse_ptr->b2b_misses_count[iface] & 0xfff) == 0 )
{
ilog ("%-13s %s Pulse Miss (%d)\n", pulse_ptr->hostname.c_str(),
ilog ("%s %s Pulse Miss (%d)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface),
pulse_ptr->b2b_misses_count[iface] );
}
@ -8043,27 +8108,27 @@ int nodeLinkClass::lost_pulses ( iface_enum iface )
{
if ( pulse_ptr->b2b_misses_count[iface] > hbs_failure_threshold )
{
ilog ("%-13s %s Pulse Miss (%3d) (in failure)\n", pulse_ptr->hostname.c_str(),
ilog ("%s %s Pulse Miss (%3d) (in failure)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface),
pulse_ptr->b2b_misses_count[iface] );
}
else if ( pulse_ptr->b2b_misses_count[iface] > hbs_degrade_threshold )
{
ilog ("%-13s %s Pulse Miss (%3d) (max:%3d) (in degrade)\n", pulse_ptr->hostname.c_str(),
ilog ("%s %s Pulse Miss (%3d) (max:%3d) (in degrade)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface),
pulse_ptr->b2b_misses_count[iface],
pulse_ptr->max_count[iface]);
}
else if ( pulse_ptr->b2b_misses_count[iface] > hbs_minor_threshold )
{
ilog ("%-13s %s Pulse Miss (%3d) (max:%3d) (in minor)\n", pulse_ptr->hostname.c_str(),
ilog ("%s %s Pulse Miss (%3d) (max:%3d) (in minor)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface),
pulse_ptr->b2b_misses_count[iface] ,
pulse_ptr->max_count[iface]);
}
else
{
ilog ("%-13s %s Pulse Miss (%3d) (max:%3d)\n", pulse_ptr->hostname.c_str(),
ilog ("%s %s Pulse Miss (%3d) (max:%3d)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface),
pulse_ptr->b2b_misses_count[iface],
pulse_ptr->max_count[iface]);
@ -8072,7 +8137,7 @@ int nodeLinkClass::lost_pulses ( iface_enum iface )
}
else
{
dlog ("%-13s %s Pulse Miss (%d)\n", pulse_ptr->hostname.c_str(),
dlog ("%s %s Pulse Miss (%d)\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface),
pulse_ptr->b2b_misses_count[iface] );
}

View File

@ -542,6 +542,9 @@ private:
/** Resource reference identifier, aka resource reference array index */
int rri ;
/** variable used to throttle the rri log */
int no_rri_log_throttle ;
/** @} private_Heartbeat_variables */
/**
@ -1023,7 +1026,7 @@ private:
int lazy_graceful_fs_reboot ( struct nodeLinkClass::node * node_ptr );
int alarm_enabled_clear ( struct nodeLinkClass::node * node_ptr, bool force );
int alarm_enabled_failure ( struct nodeLinkClass::node * node_ptr );
int alarm_enabled_failure ( struct nodeLinkClass::node * node_ptr, bool want_degrade );
int alarm_insv_clear ( struct nodeLinkClass::node * node_ptr, bool force );
int alarm_insv_failure ( struct nodeLinkClass::node * node_ptr );
@ -1296,6 +1299,9 @@ public:
/** The number of heartbeat misses that result in a failed state */
int hbs_failure_threshold ;
/** enumerated failure action code ; fail, degrade, alarm, none */
hbs_failure_action_enum hbs_failure_action ;
/** Running Resource Reference Identifier */
int rrri ;
@ -1427,6 +1433,7 @@ public:
* node failure avoidance threshold and until there are no more
* in service trouble hosts */
bool mnfa_active ;
void mnfa_cancel( void );
std::list<string> mnfa_awol_list ;
void mnfa_timeout_handler ( void );
@ -1526,6 +1533,10 @@ public:
//#ifdef WANT_HBS
/** Add a host to the Node list */
int add_heartbeat_host ( const node_inv_type &inv );
/** Clear heartbeat stats for all hosts */
void hbs_clear_all_stats ( void ) ;
// #endif
void host_print ( struct nodeLinkClass::node * node_ptr );

View File

@ -174,6 +174,23 @@ int client_timeout_handler ( void * user,
const char * name,
const char * value);
/* User selectable heartbeat failure actions.
 * Enumerators number consecutively from zero. */
typedef enum
{
    /* no heartbeat tally */
    HBS_FAILURE_ACTION__NONE = 0,

    /* alarm only */
    HBS_FAILURE_ACTION__ALARM,

    /* degrade and alarm */
    HBS_FAILURE_ACTION__DEGRADE,

    /* fail and alarm */
    HBS_FAILURE_ACTION__FAIL
} hbs_failure_action_enum ;
#define HBS_FAILURE_ACTION__NONE_STR ((const char *)("none"))
#define HBS_FAILURE_ACTION__ALARM_STR ((const char *)("alarm"))
#define HBS_FAILURE_ACTION__DEGRADE_STR ((const char *)("degrade"))
#define HBS_FAILURE_ACTION__FAIL_STR ((const char *)("fail"))
hbs_failure_action_enum
get_hbs_failure_action ( daemon_config_type & config );
/** Test Head Entry */
int daemon_run_testhead ( void );
/**

View File

@ -195,6 +195,48 @@ int timeout_config_handler ( void * user,
return (PASS);
}
/* ***********************************************************************
 *
 * Name        : get_hbs_failure_action
 *
 * Description : Convert the already loaded heartbeat failure action config
 *               string into its equivalent enumerated type.
 *               See code comments below for more detail.
 *
 * Assumptions : Both mtcAgent and hbsAgent need this conversion.
 *
 * Parameters  : config - daemon config whose hbs_failure_action string
 *                        is converted ; the struct is only read.
 *
 * Returns     : Converted enum value ; error/default is 'fail' action.
 *               A NULL or unrecognized action string yields the default.
 *
 * ***********************************************************************/
hbs_failure_action_enum get_hbs_failure_action (
    daemon_config_type & config )
{
    /* default action is 'fail' */
    hbs_failure_action_enum action_enum = HBS_FAILURE_ACTION__FAIL ;

    /* The action string is a raw character pointer that may not have
     * been loaded. Building a std::string from a null pointer, or
     * handing one to a %s conversion, is undefined behavior, so fall
     * back to the default action before touching it. */
    if ( !config.hbs_failure_action )
    {
        ilog("HBS Action : not configured ; defaulting to 'fail'\n");
        return (action_enum);
    }

    /* push the Heartbeat Failure Action character array into string
     * for easy/safe compare */
    string hbs_failure_action = config.hbs_failure_action ;

    /* look for 'none' action - hbsAgent only cares about this one
     * so that it knows to clear or not to raise any alarms for heartbeat
     * failures ; or degrades for that matter */
    if ( hbs_failure_action == HBS_FAILURE_ACTION__NONE_STR )
        action_enum = HBS_FAILURE_ACTION__NONE ;

    /* look for degrade action - alarms are still managed in this mode */
    else if ( hbs_failure_action == HBS_FAILURE_ACTION__DEGRADE_STR )
        action_enum = HBS_FAILURE_ACTION__DEGRADE ;

    /* look for 'alarm' action - no host degrade in this case */
    else if ( hbs_failure_action == HBS_FAILURE_ACTION__ALARM_STR )
        action_enum = HBS_FAILURE_ACTION__ALARM ;

    /* anything else, including an explicit 'fail', keeps the default */

    ilog("HBS Action : %s\n", config.hbs_failure_action );
    return (action_enum);
}
/* System Inventory Config Reader */
int sysinv_config_handler ( void * user,
const char * section,

View File

@ -72,6 +72,8 @@ using namespace std;
static string unexpected_pulse_list[MAX_IFACES] = { "" , "" } ;
static string arrival_histogram[MAX_IFACES] = { "" , "" } ;
static std::list<string> hostname_inventory ;
/** This heartbeat service inventory is tracked by
* the same nodeLinkClass that maintenance uses.
*
@ -88,8 +90,6 @@ int module_init ( void )
return (PASS);
}
static unsigned int my_nodetype= CGTS_NODE_NULL ;
void daemon_sigchld_hdlr ( void )
{
; /* dlog("Received SIGCHLD ... no action\n"); */
@ -107,12 +107,19 @@ daemon_config_type * daemon_get_cfg_ptr () { return &hbs_config ; }
* @see hbsBase.h for hbs_socket_type struct format.
*/
static hbs_socket_type hbs_sock ;
/** Accessor for the mtclogd message socket kept inside this
 *  module's hbs_sock structure. */
msgSock_type * get_mtclogd_sockPtr ( void )
{
    msgSock_type * mtclogd_sock_ptr = &hbs_sock.mtclogd ;
    return (mtclogd_sock_ptr);
}
/**
* Module Control Struct - The allocated struct
* @see hbsBase.h for hbs_ctrl_type struct format.
*/
static hbs_ctrl_type hbs_ctrl ;
/** Accessor for this module's heartbeat control struct. */
hbs_ctrl_type * get_hbs_ctrl_ptr ()
{
    hbs_ctrl_type * ctrl_ptr = &hbs_ctrl ;
    return ctrl_ptr ;
}
#define SCHED_MONITOR__MAIN_LOOP ((const char *) "---> scheduling latency : main loop :")
#define SCHED_MONITOR__RECEIVER ((const char *) "---> scheduling latency : rx pulses :")
void monitor_scheduling ( unsigned long long & this_time, unsigned long long & prev_time , int data, const char * label_ptr )
@ -241,25 +248,31 @@ static int hbs_config_handler ( void * user,
hbsInv.hbs_failure_threshold = atoi(value);
config_ptr->mask |= CONFIG_AGENT_HBS_FAILURE ;
}
if (MATCH("agent", "hbs_calibrate_threshold"))
if (MATCH("agent", "heartbeat_failure_action"))
{
config_ptr->hbs_calibrate_threshold = atoi(value);
}
if (MATCH("agent", "hbs_calibrate_period_factor"))
{
config_ptr->hbs_calibrate_period_factor = atoi(value);
}
if (MATCH("agent", "hbs_calibrate_minor_factor"))
{
config_ptr->hbs_calibrate_minor_factor = atoi(value);
}
if (MATCH("agent", "hbs_calibrate_degrade_factor"))
{
config_ptr->hbs_calibrate_degrade_factor = atoi(value);
}
if (MATCH("agent", "hbs_calibrate_fail_factor"))
{
config_ptr->hbs_calibrate_fail_factor = atoi(value);
hbs_failure_action_enum current_action = hbsInv.hbs_failure_action ;
/*
* 1. free previous memory from strdup on reconfig
* 2. get the new value string
* 3. convert it to an enum
* 4. if failure action is 'none' then set the clear_alarms audit bool
* telling the main loop to clear all heartbeat related alarms.
* 5. clear all stats if the action is changed from none to other.
*
* Note: The none action prevents any new alarms from being raised.
*/
if ( config_ptr->hbs_failure_action )
free(config_ptr->hbs_failure_action);
config_ptr->hbs_failure_action = strdup(value);
/* get the configured action */
hbsInv.hbs_failure_action = get_hbs_failure_action(hbs_config);
if ( current_action != hbsInv.hbs_failure_action )
{
hbs_ctrl.clear_alarms = true ;
hbsInv.hbs_clear_all_stats();
}
}
if (MATCH("agent", "multicast"))
{
@ -334,6 +347,7 @@ int daemon_configure ( void )
/* Read the ini */
hbs_config.mask = 0 ;
get_debug_options ( MTCE_CONF_FILE, &hbs_config );
if (ini_parse(MTCE_CONF_FILE, hbs_config_handler, &hbs_config) < 0)
{
elog("Can't load '%s'\n", MTCE_CONF_FILE );
@ -346,8 +360,6 @@ int daemon_configure ( void )
return (FAIL_LOAD_INI);
}
get_debug_options ( MTCE_CONF_FILE, &hbs_config );
/* Verify loaded config against an expected mask
* as an ini file fault detection method */
if ( hbs_config.mask != CONFIG_AGENT_MASK )
@ -362,15 +374,13 @@ int daemon_configure ( void )
hbsInv.hbs_minor_threshold = hbsInv.hbs_degrade_threshold ;
}
// hbsInv.recalibrate_thresholds ();
/* Log the startup settings */
ilog("Realtime Pri: RR/%i \n", hbs_config.scheduling_priority );
ilog("Pulse Period: %i msec\n", hbsInv.hbs_pulse_period );
ilog("Minor Thld: %i misses\n", hbsInv.hbs_minor_threshold );
ilog("Degrade Thld: %i misses\n", hbsInv.hbs_degrade_threshold );
ilog("Failure Thld: %i misses\n", hbsInv.hbs_failure_threshold );
ilog("Multicast: %s\n", hbs_config.multicast );
ilog("Multicast : %s\n", hbs_config.multicast );
hbs_config.mgmnt_iface = daemon_get_iface_master ( hbs_config.mgmnt_iface );
ilog("Mgmnt iface : %s\n", hbs_config.mgmnt_iface );
@ -1014,12 +1024,19 @@ int daemon_init ( string iface, string nodetype )
/* Not used by this service */
UNUSED(nodetype);
/* Initialize socket construct and pointer to it */
memset ( &hbs_sock, 0, sizeof(hbs_sock));
MEMSET_ZERO ( hbs_sock );
/* Initialize the hbs control struct */
MEMSET_ZERO ( hbs_ctrl );
/* initialize the timer */
mtcTimer_init ( hbsTimer, "controller", "heartbeat" );
/* start with no inventory */
hostname_inventory.clear();
/* Assign interface to config */
hbs_config.mgmnt_iface = (char*)iface.data() ;
@ -1032,8 +1049,8 @@ int daemon_init ( string iface, string nodetype )
hbsInv.system_type = daemon_system_type ();
/* convert node type to integer */
my_nodetype = get_host_function_mask ( nodetype ) ;
ilog ("Node Type : %s (%d)\n", nodetype.c_str(), my_nodetype );
hbs_ctrl.nodetype = get_host_function_mask ( nodetype ) ;
ilog ("Node Type : %s (%d)\n", nodetype.c_str(), hbs_ctrl.nodetype );
/* Bind signal handlers */
if ( daemon_signal_init () != PASS )
@ -1134,7 +1151,7 @@ void daemon_service_run ( void )
/* CGTS 4114: Small Footprint: Alarm 200.005 remains active after connectivity restored
*
* Clear self alarms */
hbsAlarm_clear_all ( hbsInv.my_hostname );
hbsAlarm_clear_all ( hbsInv.my_hostname, hbsInv.infra_network_provisioned );
/* add this host as inventory to hbsAgent
* Although this host is not monitored for heartbeat,
@ -1254,6 +1271,29 @@ void daemon_service_run ( void )
}
}
/* audit for forced alarms clear due to ...
*
* 1. heartbeat failure action being set to none
* 2. ... future
*
*/
if ( hbs_ctrl.clear_alarms == true )
{
if ( goenabled == true )
{
std::list<string>::iterator hostname_ptr ;
ilog ("clearing all heartbeat alarms for all hosts due to 'none' action");
for ( hostname_ptr = hostname_inventory.begin();
hostname_ptr != hostname_inventory.end() ;
hostname_ptr++ )
{
hbsAlarm_clear_all ( hostname_ptr->data(), hbsInv.infra_network_provisioned );
hbsInv.manage_heartbeat_clear ( hostname_ptr->data(), MAX_IFACES );
}
hbs_ctrl.clear_alarms = false ;
}
}
/***************** Service Sockets ********************/
/* Initialize the master fd_set and clear socket list */
@ -1356,10 +1396,15 @@ void daemon_service_run ( void )
inv.name = hostname ;
inv.nodetype = msg.parm[0];
hbsInv.add_heartbeat_host ( inv ) ;
hostname_inventory.push_back ( hostname );
ilog ("%s added to heartbeat service (%d)\n", hostname.c_str(), inv.nodetype );
/* clear any outstanding alarms on the ADD */
hbsAlarm_clear_all ( hostname );
if ( hbsInv.hbs_failure_action != HBS_FAILURE_ACTION__NONE )
{
hbsAlarm_clear_all ( hostname,
hbsInv.infra_network_provisioned );
}
}
else if ( msg.cmd == MTC_CMD_DEL_HOST )
{
@ -1367,12 +1412,16 @@ void daemon_service_run ( void )
{
hbsInv.mon_host ( hostname, (iface_enum)iface, false, false );
}
hostname_inventory.remove ( hostname );
hbsInv.del_host ( hostname );
ilog ("%s deleted from heartbeat service\n", hostname.c_str());
/* clear any outstanding alarms on the DEL */
hbsAlarm_clear_all ( hostname );
if ( hbsInv.hbs_failure_action != HBS_FAILURE_ACTION__NONE )
{
hbsAlarm_clear_all ( hostname,
hbsInv.infra_network_provisioned );
}
}
else if ( msg.cmd == MTC_CMD_STOP_HOST )
{
@ -1484,6 +1533,13 @@ void daemon_service_run ( void )
counter = 1 ;
}
else if ( hbsInv.hbs_failure_action == HBS_FAILURE_ACTION__NONE )
{
wlog_throttled (counter, 100000, "Heartbeat disabled by 'none' action\n");
usleep (50000) ;
continue ;
}
/* Send a log indicating the main loop has recognized
* a state change to enable */
else if (( hbsInv.hbs_state_change == true ) &&

View File

@ -31,10 +31,11 @@ using namespace std;
#include "hbsAlarm.h" /* for ... this module header */
#include "alarm.h" /* for ... alarm send message to mtcalarmd */
void hbsAlarm_clear_all ( string hostname )
void hbsAlarm_clear_all ( string hostname, bool infra )
{
alarm_clear ( hostname, MGMNT_HB_ALARM_ID, MGMNT_NAME );
alarm_clear ( hostname, INFRA_HB_ALARM_ID, INFRA_NAME );
if ( infra )
alarm_clear ( hostname, INFRA_HB_ALARM_ID, INFRA_NAME );
alarm_clear ( hostname , PMOND_ALARM_ID, PMON_NAME );
}

View File

@ -27,6 +27,6 @@ using namespace std;
#define INFRA_NAME ((const char *)"Infrastructure")
#define PMON_NAME ((char *)"pmond")
void hbsAlarm_clear_all ( string hostname );
void hbsAlarm_clear_all ( string hostname, bool infra );
#endif /* __HBSALARM_H__ */

View File

@ -56,6 +56,13 @@ const char rsp_msg_header [HBS_HEADER_SIZE+1] = {"cgts pulse rsp:"};
#define HBS_MAX_MSG (HBS_HEADER_SIZE+MAX_CHARS_HOSTNAME)
/* Heartbeat control structure.
 * Groups the heartbeat agent's module level control state. */
typedef struct
{
unsigned int nodetype ; /* host function mask for the node type this daemon was started with */
bool clear_alarms ; /* request flag ; when true the main loop clears all heartbeat alarms and then resets it */
} hbs_ctrl_type ;
/* A heartbeat service message
* if this structure is changed then
* hbs_pulse_request needs to be looked at

View File

@ -359,6 +359,7 @@ int mtcAlarm_critical_log ( string hostname, mtc_alarm_id_enum id ) { UNUSED(ho
int mtcAlarm_major_log ( string hostname, mtc_alarm_id_enum id ) { UNUSED(hostname); id = id ; return (PASS); }
int mtcAlarm_minor_log ( string hostname, mtc_alarm_id_enum id ) { UNUSED(hostname); id = id ; return (PASS); }
int mtcAlarm_warning_log ( string hostname, mtc_alarm_id_enum id ) { UNUSED(hostname); id = id ; return (PASS); }
int mtcAlarm_log ( string hostname, mtc_alarm_id_enum id ) { UNUSED(hostname); id = id ; return (PASS); }
int mtcAlarm_log ( string hostname, mtc_alarm_id_enum id, string str )
{ UNUSED(hostname); id = id ; UNUSED(str) ; return (PASS); }
string mtcAlarm_getId_str ( mtc_alarm_id_enum id ) { id = id ; return ("stub"); }

View File

@ -128,7 +128,7 @@ void mtcAlarm_init ( void )
"If manual or auto-recovery is consistently unable to recover host to the unlocked-enabled "
"state contact next level of support or lock and replace failing Host.");
/** Board Management Controller Access Alarm ************************************/
/** Init Board Management Controller Access Alarm Entry ******************/
ptr = &alarm_list[MTC_ALARM_ID__BM];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
@ -154,7 +154,7 @@ void mtcAlarm_init ( void )
snprintf( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH,
"Check Host's board management config and connectivity.");
/** Controller Failure Alarm ****************************************************/
/** Init Controller Failure Alarm Entry **********************************/
ptr = &alarm_list[MTC_ALARM_ID__CH_CONT];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
@ -184,7 +184,7 @@ void mtcAlarm_init ( void )
"running on this host. If lock action fails then contact next level "
"of support to investigate and recover.");
/** Compute Failure Alarm ****************************************************/
/** Init Compute Failure Alarm Entry *************************************/
ptr = &alarm_list[MTC_ALARM_ID__CH_COMP];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
@ -213,7 +213,7 @@ void mtcAlarm_init ( void )
"and Switch Activity (Swact) to it as soon as possible. If the alarm "
"persists then Lock/Unlock host to recover its local compute service.");
/** Add Event Log ****************************************************/
/** Init Event Log Entry *************************************************/
ptr = &alarm_list[MTC_LOG_ID__EVENT];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
@ -236,6 +236,103 @@ void mtcAlarm_init ( void )
ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; /* Dynamic */
snprintf ( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, "%s", "");
/** Init Command Log Entry ***********************************************/
ptr = &alarm_list[MTC_LOG_ID__COMMAND];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", COMMAND_LOG_ID);
ptr->name = "Maintenance Command" ;
ptr->minor_reason =
ptr->major_reason =
ptr->critl_reason =
ptr->clear_reason = "";
ptr->alarm.alarm_type = FM_ALARM_TYPE_UNKNOWN ;
ptr->alarm.probable_cause = FM_ALARM_CAUSE_UNKNOWN ;
ptr->alarm.inhibit_alarms = FM_FALSE ;
ptr->alarm.service_affecting = FM_FALSE ;
ptr->alarm.suppression = FM_FALSE ;
ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; /* Dynamic */
snprintf ( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, "%s", "");
/** Init Config Log Entry ***********************************************/
ptr = &alarm_list[MTC_LOG_ID__CONFIG];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", CONFIG_LOG_ID);
ptr->name = "Maintenance Config" ;
ptr->minor_reason =
ptr->major_reason =
ptr->critl_reason =
ptr->clear_reason = "";
ptr->alarm.alarm_type = FM_ALARM_TYPE_UNKNOWN ;
ptr->alarm.probable_cause = FM_ALARM_CAUSE_UNKNOWN ;
ptr->alarm.inhibit_alarms = FM_FALSE ;
ptr->alarm.service_affecting = FM_FALSE ;
ptr->alarm.suppression = FM_FALSE ;
ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; /* Dynamic */
snprintf ( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, "%s", "");
/** Init State Change Log Entry ******************************************/
ptr = &alarm_list[MTC_LOG_ID__STATECHANGE];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", STATECHANGE_LOG_ID);
ptr->name = "Maintenance State Change" ;
ptr->minor_reason =
ptr->major_reason =
ptr->critl_reason =
ptr->clear_reason = "";
ptr->alarm.alarm_type = FM_ALARM_TYPE_UNKNOWN ;
ptr->alarm.probable_cause = FM_ALARM_CAUSE_UNKNOWN ;
ptr->alarm.inhibit_alarms = FM_FALSE ;
ptr->alarm.service_affecting = FM_FALSE ;
ptr->alarm.suppression = FM_FALSE ;
ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; /* Dynamic */
snprintf ( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, "%s", "");
/** Init Service Status Log Entry ****************************************/
ptr = &alarm_list[MTC_LOG_ID__SERVICESTATUS];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", SERVICESTATUS_LOG_ID);
ptr->name = "Maintenance Service Status Change" ;
ptr->minor_reason =
ptr->major_reason =
ptr->critl_reason =
ptr->clear_reason = "";
ptr->alarm.alarm_type = FM_ALARM_TYPE_UNKNOWN ;
ptr->alarm.probable_cause = FM_ALARM_CAUSE_UNKNOWN ;
ptr->alarm.inhibit_alarms = FM_FALSE ;
ptr->alarm.service_affecting = FM_FALSE ;
ptr->alarm.suppression = FM_FALSE ;
ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; /* Dynamic */
snprintf ( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, "%s", "");
}
string _getIdentity ( mtc_alarm_id_enum id )
@ -251,6 +348,7 @@ string _getIdentity ( mtc_alarm_id_enum id )
case MTC_LOG_ID__EVENT: return (EVENT_LOG_ID);
case MTC_LOG_ID__COMMAND: return (COMMAND_LOG_ID);
case MTC_LOG_ID__STATECHANGE: return (STATECHANGE_LOG_ID);
case MTC_LOG_ID__CONFIG: return (CONFIG_LOG_ID);
default: return ("200.000");
}
}
@ -493,7 +591,7 @@ int mtcAlarm_warning_log ( string hostname, mtc_alarm_id_enum id )
}
/** Create a neutral customer log */
int mtcAlarm_log ( string hostname, mtc_alarm_id_enum id )
int mtcAlarm_log ( string hostname, mtc_alarm_id_enum id, string str )
{
if ( id < MTC_ALARM_ID__END )
{
@ -750,6 +848,39 @@ int mtcAlarm_log ( string hostname, mtc_alarm_id_enum id )
"board management controller has been 're-provisioned'" );
found = true ;
}
else if (( id == MTC_LOG_ID__CONFIG_HB_ACTION_FAIL ) ||
( id == MTC_LOG_ID__CONFIG_HB_ACTION_DEGRADE ) ||
( id == MTC_LOG_ID__CONFIG_HB_ACTION_ALARM ) ||
( id == MTC_LOG_ID__CONFIG_HB_ACTION_NONE ))
{
alarm_list[index].instc_prefix = "config=heartbeat_failure_action" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s %s",
hostname.data(),
"platform maintenance service parameter 'heartbeat failure action' changed from",
str.data());
found = true ;
}
else if ( id == MTC_LOG_ID__CONFIG_MNFA_TIMEOUT )
{
alarm_list[index].instc_prefix = "config=mnfa_timeout" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s %s",
hostname.data(),
"platform maintenance service parameter 'mnfa_timeout' changed from",
str.data());
found = true ;
}
else if ( id == MTC_LOG_ID__CONFIG_MNFA_THRESHOLD )
{
alarm_list[index].instc_prefix = "config=mnfa_threshold" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s %s",
hostname.data(),
"platform maintenance service parameter 'mnfa_threshold' changed from",
str.data());
found = true ;
}
if ( found == true )
{
@ -758,11 +889,6 @@ int mtcAlarm_log ( string hostname, mtc_alarm_id_enum id )
string identity = _getIdentity(index);
string instance = _getInstance(index);
instance.append(alarm_list[index].instc_prefix);
//wlog ("%s '%s' log (%s.%s)\n",
// hostname.c_str(),
// alarm_list[index].alarm.reason_text,
// identity.c_str(),
// instance.c_str());
/* Want to make this log a critical */
if ( id == MTC_LOG_ID__STATUSCHANGE_REINSTALL_FAILED )

View File

@ -25,52 +25,63 @@ using namespace std;
/** Maintenance Alarm Abstract Reference IDs
 *
 *  Enumerators are numbered implicitly, starting at 0 and increasing by one
 *  in declaration order ; no enumerator carries an explicit value, so new
 *  ids can be inserted into a group without renumbering the ones after it.
 *
 *  MTC_ALARM_ID__END must stay the final enumerator ; mtcAlarm_log only
 *  accepts ids that compare below it.
 */
typedef enum
{
    /* Alarm ids */
    MTC_ALARM_ID__LOCK,
    MTC_ALARM_ID__CONFIG,
    MTC_ALARM_ID__ENABLE,
    MTC_ALARM_ID__BM,
    MTC_ALARM_ID__CH_CONT, /* Combo Host Controller Failure - with Active Compute */
    MTC_ALARM_ID__CH_COMP, /* Combo Host Compute Failure - on last Controller */

    /* Log category ids */
    MTC_LOG_ID__EVENT,
    MTC_LOG_ID__COMMAND,
    MTC_LOG_ID__CONFIG,
    MTC_LOG_ID__STATECHANGE,
    MTC_LOG_ID__SERVICESTATUS,
    MTC_ALARM_ID__LAST,

    /* Event logs */
    MTC_LOG_ID__EVENT_ADD,
    MTC_LOG_ID__EVENT_RESTART,
    MTC_LOG_ID__EVENT_DISCOVERED,
    MTC_LOG_ID__EVENT_MNFA_ENTER,
    MTC_LOG_ID__EVENT_MNFA_EXIT,

    /* Command logs */
    MTC_LOG_ID__COMMAND_DELETE,
    MTC_LOG_ID__COMMAND_UNLOCK,
    MTC_LOG_ID__COMMAND_FORCE_LOCK,
    MTC_LOG_ID__COMMAND_SWACT,
    MTC_LOG_ID__COMMAND_REINSTALL,
    MTC_LOG_ID__COMMAND_BM_PROVISIONED,
    MTC_LOG_ID__COMMAND_BM_DEPROVISIONED,
    MTC_LOG_ID__COMMAND_BM_REPROVISIONED,

    /* Configuration (service parameter) change logs */
    MTC_LOG_ID__CONFIG_HB_ACTION_FAIL,
    MTC_LOG_ID__CONFIG_HB_ACTION_DEGRADE,
    MTC_LOG_ID__CONFIG_HB_ACTION_ALARM,
    MTC_LOG_ID__CONFIG_HB_ACTION_NONE,
    MTC_LOG_ID__CONFIG_HB_PERIOD,
    MTC_LOG_ID__CONFIG_HB_DEGRADE_THRESHOLD,
    MTC_LOG_ID__CONFIG_HB_FAILURE_THRESHOLD,
    MTC_LOG_ID__CONFIG_MNFA_TIMEOUT,
    MTC_LOG_ID__CONFIG_MNFA_THRESHOLD,

    /* Reboot / reset / power command logs */
    MTC_LOG_ID__COMMAND_AUTO_REBOOT,
    MTC_LOG_ID__COMMAND_MANUAL_REBOOT,
    MTC_LOG_ID__COMMAND_AUTO_RESET,
    MTC_LOG_ID__COMMAND_MANUAL_RESET,
    MTC_LOG_ID__COMMAND_AUTO_POWER_ON,
    MTC_LOG_ID__COMMAND_MANUAL_POWER_ON,
    MTC_LOG_ID__COMMAND_AUTO_POWER_OFF,
    MTC_LOG_ID__COMMAND_MANUAL_POWER_OFF,

    /* Status change logs */
    MTC_LOG_ID__STATUSCHANGE_ENABLED,
    MTC_LOG_ID__STATUSCHANGE_DISABLED,
    MTC_LOG_ID__STATUSCHANGE_ONLINE,
    MTC_LOG_ID__STATUSCHANGE_OFFLINE,
    MTC_LOG_ID__STATUSCHANGE_FAILED,
    MTC_LOG_ID__STATUSCHANGE_REINSTALL_FAILED,
    MTC_LOG_ID__STATUSCHANGE_REINSTALL_COMPLETE,

    MTC_ALARM_ID__END
} mtc_alarm_id_enum ;
@ -109,6 +120,6 @@ int mtcAlarm_minor_log ( string hostname, mtc_alarm_id_enum id );
int mtcAlarm_warning_log ( string hostname, mtc_alarm_id_enum id );
/** Create a maintenance log.
 *
 *  @param hostname  name of the host the log is raised against
 *  @param id        which log to create
 *  @param str       optional extra text ; the configuration change logs
 *                   append it to their reason text. Defaults to empty so
 *                   existing two-argument callers keep working.
 */
int mtcAlarm_log ( string hostname, mtc_alarm_id_enum id, string str = "");
#endif /* __MTCALARM_H__ */

View File

@ -947,8 +947,12 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
if ( msg.cmd == MTC_EVENT_HEARTBEAT_DEGRADE_SET )
{
/* Assert the degrade condition with the 'false' (i.e. not clear)*/
obj_ptr->manage_heartbeat_degrade ( hostname, iface, false );
if (( obj_ptr->hbs_failure_action == HBS_FAILURE_ACTION__FAIL ) ||
( obj_ptr->hbs_failure_action == HBS_FAILURE_ACTION__DEGRADE ))
{
/* Assert the degrade condition with the 'false' (i.e. not clear)*/
obj_ptr->manage_heartbeat_degrade ( hostname, iface, false );
}
}
else
{
@ -985,7 +989,23 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
}
string hostname = &msg.buf[0] ;
print_mtc_message ( hostname, MTC_CMD_RX, msg, get_iface_name_str(MGMNT_INTERFACE), false );
obj_ptr->manage_heartbeat_failure ( hostname, iface, false );
/* If heartbeat failure action is fail then call the fail handler */
if ( obj_ptr->hbs_failure_action == HBS_FAILURE_ACTION__FAIL )
obj_ptr->manage_heartbeat_failure ( hostname, iface, false );
/* If heartbeat failure action is degrade then call the degrade handler */
else if ( obj_ptr->hbs_failure_action == HBS_FAILURE_ACTION__DEGRADE )
obj_ptr->manage_heartbeat_degrade ( hostname, iface, false );
/* Otherwise the action must be alarm only or none ; both of which
* are already handled by the hbsAgent, so do nothing */
else
{
dlog ("%s heartbeat loss event dropped (%s)\n",
hostname.c_str(),
get_iface_name_str(iface));
}
}
}
else if ( msg.cmd == MTC_EVENT_PMOND_CLEAR )

View File

@ -330,15 +330,67 @@ static int mtc_ini_handler ( void * user,
{
UNUSED(user);
if (MATCH("agent", "mnfa_threshold"))
if (MATCH("agent", "heartbeat_failure_action"))
{
string cur_action = "" ;
string new_action = "" ;
/* prevent memory leak over a reconfig */
if ( mtc_config.hbs_failure_action )
{
cur_action = mtc_config.hbs_failure_action ;
free(mtc_config.hbs_failure_action);
}
new_action = mtc_config.hbs_failure_action = strdup(value);
mtcInv.hbs_failure_action = get_hbs_failure_action(mtc_config);
if (( !cur_action.empty() ) && ( cur_action != new_action))
{
mtc_alarm_id_enum alarm_id = MTC_LOG_ID__CONFIG_HB_ACTION_FAIL ;
if ( mtcInv.hbs_failure_action == HBS_FAILURE_ACTION__NONE )
alarm_id = MTC_LOG_ID__CONFIG_HB_ACTION_NONE ;
else if ( mtcInv.hbs_failure_action == HBS_FAILURE_ACTION__ALARM )
alarm_id = MTC_LOG_ID__CONFIG_HB_ACTION_ALARM ;
else if ( mtcInv.hbs_failure_action == HBS_FAILURE_ACTION__DEGRADE )
alarm_id = MTC_LOG_ID__CONFIG_HB_ACTION_DEGRADE ;
/* re-use cur_action to build the action change string from it */
cur_action.append(" to ");
cur_action.append(new_action);
mtcAlarm_log ( mtcInv.my_hostname, alarm_id, cur_action );
}
if (( mtcInv.mnfa_active == true ) &&
(( mtcInv.hbs_failure_action == HBS_FAILURE_ACTION__NONE ) ||
( mtcInv.hbs_failure_action == HBS_FAILURE_ACTION__ALARM )))
{
mtcInv.mnfa_cancel ();
}
}
else if (MATCH("agent", "mnfa_threshold"))
{
int old = mtcInv.mnfa_threshold ;
mtcInv.mnfa_threshold = atoi(value);
if (( old != 0 ) && ( old != mtcInv.mnfa_threshold ))
{
string cur_threshold = "" ;
cur_threshold.append(itos(old));
cur_threshold.append(" to ");
cur_threshold.append(itos(mtcInv.mnfa_threshold));
mtcAlarm_log ( mtcInv.my_hostname, MTC_LOG_ID__CONFIG_MNFA_THRESHOLD, cur_threshold );
}
ilog ("MNFA Threshd: %d\n", mtcInv.mnfa_threshold);
}
else if (MATCH("timeouts", "mnfa_timeout"))
{
int old = mtcInv.mnfa_timeout ;
mtcInv.mnfa_timeout = atoi(value);
if ( old != mtcInv.mnfa_timeout )
{
string cur_timeout = "" ;
cur_timeout.append(itos(old));
cur_timeout.append(" to ");
cur_timeout.append(itos(mtcInv.mnfa_timeout));
mtcAlarm_log ( mtcInv.my_hostname, MTC_LOG_ID__CONFIG_MNFA_TIMEOUT, cur_timeout );
}
if ( mtcInv.mnfa_timeout == 0 )
{
ilog ("MNFA Timeout: Never\n");

View File

@ -526,7 +526,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->cmdRsp_status = 0 ;
/* Raise Critical Enable Alarm */
alarm_enabled_failure ( node_ptr );
alarm_enabled_failure ( node_ptr, true );
/* Handle active controller failures */
if ( THIS_HOST )
@ -774,7 +774,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
}
else
{
alarm_enabled_failure ( node_ptr );
alarm_enabled_failure ( node_ptr , true );
if ( node_ptr->availStatus != MTC_AVAIL_STATUS__FAILED )
{
@ -1095,7 +1095,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
elog ("%s Timeout waiting for MTCALIVE\n", node_ptr->hostname.c_str());
/* raise an alarm for the enable failure */
alarm_enabled_failure ( node_ptr );
alarm_enabled_failure ( node_ptr , true );
/* go back and issue reboot again */
enableStageChange ( node_ptr, MTC_ENABLE__RESET_PROGRESSION );
@ -1190,7 +1190,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->mtcTimer.ring = false ;
/* raise an alarm for the enable failure */
alarm_enabled_failure ( node_ptr );
alarm_enabled_failure ( node_ptr , true );
/* go back and issue reboot again */
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
@ -1309,18 +1309,29 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
mtcTimer_reset ( node_ptr->mtcTimer );
}
plog ("%s Starting %d sec Heartbeat Soak (with%s)\n",
node_ptr->hostname.c_str(),
MTC_HEARTBEAT_SOAK_BEFORE_ENABLE,
node_ptr->hbsClient_ready ? " ready event" : "out ready event" );
/* Start Monitoring Services - heartbeat, process and hardware */
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
/* allow heartbeat to run for 10 seconds before we declare enable */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE );
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_SOAK );
if ( this->hbs_failure_action == HBS_FAILURE_ACTION__NONE )
{
/* Skip over the heartbeat soak if the failure handling is
* none because in that case heartbeating is disabled and
* the soak would just be a waste of startup time. */
enableStageChange ( node_ptr, MTC_ENABLE__STATE_CHANGE );
}
else
{
plog ("%s Starting %d sec Heartbeat Soak (with%s)\n",
node_ptr->hostname.c_str(),
MTC_HEARTBEAT_SOAK_BEFORE_ENABLE,
node_ptr->hbsClient_ready ? " ready event" : "out ready event" );
/* allow heartbeat to run for MTC_HEARTBEAT_SOAK_BEFORE_ENABLE
* seconds before we declare enable */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE );
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_SOAK );
}
break ;
}
case MTC_ENABLE__HEARTBEAT_SOAK:
@ -1524,6 +1535,15 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
case MTC_RECOVERY__START:
{
if ( this->hbs_failure_action != HBS_FAILURE_ACTION__FAIL )
{
wlog ("%s heartbeat failure recovery action is not fail\n",
node_ptr->hostname.c_str());
mtcInvApi_update_task ( node_ptr, "" );
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
break ;
}
/* Purge this hosts work queues */
mtcCmd_workQ_purge ( node_ptr );
mtcCmd_doneQ_purge ( node_ptr );
@ -1690,7 +1710,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
/* Go to the goEnabled stage */
recoveryStageChange ( node_ptr, MTC_RECOVERY__GOENABLED_TIMER );
alarm_enabled_failure(node_ptr);
alarm_enabled_failure(node_ptr, true );
break ;
}
}
@ -1728,7 +1748,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
/* Go to the goEnabled stage */
recoveryStageChange ( node_ptr, MTC_RECOVERY__GOENABLED_TIMER );
alarm_enabled_failure (node_ptr);
alarm_enabled_failure (node_ptr, true );
}
}
/* A timer ring indicates that the host is not up */
@ -1772,7 +1792,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
/* Inform the VIM that this host has failed */
mtcVimApi_state_change ( node_ptr, VIM_HOST_FAILED, 3 );
alarm_enabled_failure(node_ptr);
alarm_enabled_failure(node_ptr, true );
/* Clear all degrade flags except for the HWMON one */
clear_host_degrade_causes ( node_ptr->degrade_mask );
@ -2351,21 +2371,31 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
mtcTimer_reset ( node_ptr->mtcTimer );
}
plog ("%s Starting %d sec Heartbeat Soak (with%s)\n",
node_ptr->hostname.c_str(),
MTC_HEARTBEAT_SOAK_BEFORE_ENABLE,
node_ptr->hbsClient_ready ? " ready event" : "out ready event" );
/* Enable the heartbeat service for Graceful Recovery */
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
/* allow heartbeat to run for 10 seconds before we declare enable */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE );
if ( this->hbs_failure_action == HBS_FAILURE_ACTION__NONE )
{
/* Skip over the heartbeat soak if the failure handling is
* none because in that case heartbeating is disabled and
* the soak would just be a waste of recovery time. */
recoveryStageChange ( node_ptr, MTC_RECOVERY__STATE_CHANGE );
}
else
{
plog ("%s Starting %d sec Heartbeat Soak (with%s)\n",
node_ptr->hostname.c_str(),
MTC_HEARTBEAT_SOAK_BEFORE_ENABLE,
node_ptr->hbsClient_ready ? " ready event" : "out ready event" );
/* if heartbeat is not working then we will
* never get here and enable the host */
recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_SOAK );
/* allow heartbeat to run for 10 seconds before we declare enable */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE );
/* if heartbeat is not working then we will
* never get here and enable the host */
recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_SOAK );
}
break ;
}
case MTC_RECOVERY__HEARTBEAT_SOAK:
@ -4667,7 +4697,7 @@ int nodeLinkClass::powercycle_handler ( struct nodeLinkClass::node * node_ptr )
if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED )
{
ilog ("%s failing host for powercycle\n", node_ptr->hostname.c_str() );
alarm_enabled_failure ( node_ptr );
alarm_enabled_failure ( node_ptr , true );
/* Set node as unlocked-disabled-failed */
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,

View File

@ -31,14 +31,45 @@ void log_mnfa_pool ( std::list<string> & mnfa_awol_list )
{
std::list<string>::iterator mnfa_awol_ptr ;
string pool_list = "" ;
if ( mnfa_awol_list.size() )
{
for ( mnfa_awol_ptr = mnfa_awol_list.begin() ;
mnfa_awol_ptr != mnfa_awol_list.end() ;
mnfa_awol_ptr++ )
{
pool_list.append (" ");
pool_list.append (mnfa_awol_ptr->data());
}
ilog ("MNFA POOL:%s\n", pool_list.c_str());
}
}
/*****************************************************************************
 *
 * Name       : add_host_to_awol_list
 *
 * Description: Append a hostname to the awol list unless an equal entry is
 *              already present. The list is left untouched when the
 *              hostname is found.
 *
 * Parameters : hostname       - name to look for and, if missing, append
 *              mnfa_awol_list - list that is searched and appended to
 *
 * Returns    : true  if the hostname was appended
 *              false if it was not added because it is already in the list
 *
 *****************************************************************************/
static bool add_host_to_awol_list ( std::string hostname, std::list<std::string> & mnfa_awol_list )
{
    std::list<std::string>::const_iterator mnfa_awol_ptr ;
    for ( mnfa_awol_ptr  = mnfa_awol_list.begin() ;
          mnfa_awol_ptr != mnfa_awol_list.end() ;
          ++mnfa_awol_ptr )
    {
        if ( *mnfa_awol_ptr == hostname )
        {
            /* already in list */
            return false ;
        }
    }

    /* not found ; the new entry goes to the back of the list */
    mnfa_awol_list.push_back(hostname);
    return true ;
}
/*****************************************************************************
@ -51,6 +82,14 @@ void log_mnfa_pool ( std::list<string> & mnfa_awol_list )
*****************************************************************************/
void nodeLinkClass::mnfa_add_host ( struct nodeLinkClass::node * node_ptr , iface_enum iface )
{
if (( this->hbs_failure_action == HBS_FAILURE_ACTION__ALARM ) ||
( this->hbs_failure_action == HBS_FAILURE_ACTION__NONE ))
{
/* Do nothing for the 'alarm only' or 'none' action.
* Alarming is handled by the hbsAgent already */
return ;
}
if ( node_ptr->hbs_minor[iface] == false )
{
bool enter = false ;
@ -63,15 +102,12 @@ void nodeLinkClass::mnfa_add_host ( struct nodeLinkClass::node * node_ptr , ifac
/* if we are active then add the node to the awol list */
if ( mnfa_active == true )
{
alarm_enabled_failure (node_ptr);
/* once we are mnfa_active we need to give all the
* hbs_minor=true hosts a graceful recovery token
* mnfa_graceful_recovery = true and add to the awol list */
node_ptr->mnfa_graceful_recovery = true ;
added = true ;
mnfa_awol_list.push_back(node_ptr->hostname);
mnfa_awol_list.unique();
add_host_to_awol_list (node_ptr->hostname, mnfa_awol_list );
if ( node_ptr->task != MTC_TASK_RECOVERY_WAIT )
mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_WAIT );
}
@ -94,10 +130,7 @@ void nodeLinkClass::mnfa_add_host ( struct nodeLinkClass::node * node_ptr , ifac
get_iface_name_str(INFRA_IFACE),
node_ptr->hbs_minor_count[INFRA_IFACE]);
if ( mnfa_awol_list.size() )
{
log_mnfa_pool ( mnfa_awol_list );
}
log_mnfa_pool ( mnfa_awol_list );
if ( enter == true )
{
@ -191,28 +224,20 @@ void nodeLinkClass::mnfa_enter ( void )
* recovery token mnfa_graceful_recovery = true
* basically a get out of double reset free card */
ptr->mnfa_graceful_recovery = true ;
mnfa_awol_list.push_back(ptr->hostname);
add_host_to_awol_list (ptr->hostname, mnfa_awol_list );
if ( ptr->task != MTC_TASK_RECOVERY_WAIT )
mtcInvApi_update_task ( ptr, MTC_TASK_RECOVERY_WAIT );
alarm_enabled_failure (ptr);
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
mnfa_awol_list.unique();
if ( this->mnfa_timeout )
{
wlog ("MNFA Auto-Recovery in %d seconds\n", this->mnfa_timeout);
mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, this->mnfa_timeout);
}
if ( mnfa_awol_list.size() )
{
log_mnfa_pool ( mnfa_awol_list );
}
log_mnfa_pool ( mnfa_awol_list );
}
/****************************************************************************
@ -262,10 +287,7 @@ void nodeLinkClass::mnfa_exit ( bool force )
force ? "(Auto-Recover)" : "");
mtcAlarm_log ( active_controller_hostname , MTC_LOG_ID__EVENT_MNFA_EXIT );
if ( mnfa_awol_list.size() )
{
log_mnfa_pool ( mnfa_awol_list );
}
log_mnfa_pool ( mnfa_awol_list );
/* Loop through inventory and recover each host that
* remains in the hbs_minor state.
@ -329,3 +351,44 @@ void nodeLinkClass::mnfa_exit ( bool force )
mnfa_host_count[INFRA_IFACE] = 0 ;
mnfa_awol_list.clear();
}
/****************************************************************************
*
* Name : mnfa_cancel
*
* Description: Cancel MNFA if its active.
*
****************************************************************************/
void nodeLinkClass::mnfa_cancel ( void )
{
if ( this->mnfa_active )
{
wlog ("MNFA CANCEL --> Cancelling Multi-Node Failure Avoidance\n");
mtcTimer_reset ( this->mtcTimer_mnfa );
/* Loop through MNFA Pool.
* Clear MNFA attributes from hosts in the pool. */
std::list<string>::iterator mnfa_awol_ptr ;
for ( mnfa_awol_ptr = mnfa_awol_list.begin() ;
mnfa_awol_ptr != mnfa_awol_list.end() ;
mnfa_awol_ptr++ )
{
struct node * node_ptr = nodeLinkClass::getNode ( *(mnfa_awol_ptr) );
if ( node_ptr != NULL )
{
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_INFRA ;
node_ptr->hbs_minor[INFRA_IFACE] = false ;
node_ptr->hbs_minor[MGMNT_IFACE] = false ;
node_ptr->mnfa_graceful_recovery = false ;
mtcInvApi_update_task ( node_ptr, "" );
}
}
send_hbs_command ( this->my_hostname, MTC_RECOVER_HBS );
this->mnfa_host_count[MGMNT_IFACE] = 0 ;
this->mnfa_host_count[INFRA_IFACE] = 0 ;
this->mnfa_active = false ;
}
mnfa_awol_list.clear();
}

View File

@ -409,17 +409,24 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
mtcTimer_reset ( node_ptr->mtcTimer );
}
plog ("%s Starting %d sec Heartbeat Soak (with%s)\n",
/* Start Monitoring heartbeat */
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
if ( this->hbs_failure_action == HBS_FAILURE_ACTION__NONE )
{
enableStageChange ( node_ptr, MTC_ENABLE__STATE_CHANGE );
}
else
{
plog ("%s Starting %d sec Heartbeat Soak (with%s)\n",
name.c_str(),
MTC_HEARTBEAT_SOAK_BEFORE_ENABLE,
node_ptr->hbsClient_ready ? " ready event" : "out ready event" );
/* Start Monitoring Services - heartbeat, process and hardware */
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
/* allow heartbeat to run for 10 seconds before we declare enable */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE );
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_SOAK );
/* allow heartbeat to run for 10 seconds before we declare enable */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE );
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_SOAK );
}
break ;
}
case MTC_ENABLE__HEARTBEAT_SOAK:

View File

@ -7,12 +7,6 @@ hbs_minor_threshold = 4 ; Heartbeat minor threshold count.
; heartbeat misses that result in a
; minor notification to maintenance.
hbs_calibrate_threshold = 7 ; number of hosts before calibration kicks in
hbs_calibrate_period_factor = 200 ; x for each host over hbs_calibrate_threshold
hbs_calibrate_minor_factor = 20 ; x for each host over hbs_calibrate_threshold
hbs_calibrate_degrade_factor = 21 ; x for each host over hbs_calibrate_threshold
hbs_calibrate_fail_factor = 30 ; x for each host over hbs_calibrate_threshold
offline_period = 100 ; number of msecs to wait for each offline audit
offline_threshold = 46 ; number of back to back mtcAlive requests missed
; 100:46 will yield a typical 5 sec holdoff from