diff --git a/mtce-common/centos/build_srpm.data b/mtce-common/centos/build_srpm.data index d8075689..4b87b935 100644 --- a/mtce-common/centos/build_srpm.data +++ b/mtce-common/centos/build_srpm.data @@ -1,3 +1,3 @@ SRC_DIR="cgts-mtce-common-1.0" -TIS_PATCH_VER=136 +TIS_PATCH_VER=137 BUILD_IS_SLOW=5 diff --git a/mtce-common/cgts-mtce-common-1.0/common/alarmUtil.h b/mtce-common/cgts-mtce-common-1.0/common/alarmUtil.h index d9ca7ce4..1e1ff807 100644 --- a/mtce-common/cgts-mtce-common-1.0/common/alarmUtil.h +++ b/mtce-common/cgts-mtce-common-1.0/common/alarmUtil.h @@ -47,6 +47,7 @@ #define COMMAND_LOG_ID ((const char *)"200.021") #define STATECHANGE_LOG_ID ((const char *)"200.022") #define SERVICESTATUS_LOG_ID ((const char *)"200.023") /* log used to report service failure events against */ +#define CONFIG_LOG_ID ((const char *)"200.024") /** * TODO: This class is more of a place holder for diff --git a/mtce-common/cgts-mtce-common-1.0/common/logMacros.h b/mtce-common/cgts-mtce-common-1.0/common/logMacros.h index 568ba7be..bef9221a 100644 --- a/mtce-common/cgts-mtce-common-1.0/common/logMacros.h +++ b/mtce-common/cgts-mtce-common-1.0/common/logMacros.h @@ -44,12 +44,7 @@ typedef struct int hbs_minor_threshold ; /**< heartbeat miss minor threshold */ int hbs_degrade_threshold ; /**< heartbeat miss degrade threshold */ int hbs_failure_threshold ; /**< heartbeat miss failure threshold */ - - int hbs_calibrate_threshold ; /**< number of hosts where threshold calibration begins to take effect */ - int hbs_calibrate_period_factor ; /**< hbs_pulse_period = hbs_pulse_period * hosts */ - int hbs_calibrate_minor_factor ; /**< hbs_minor_threshold = threshold factor * hosts */ - int hbs_calibrate_degrade_factor; /**< hbs_degrade_threshold = threshold factor * hosts */ - int hbs_calibrate_fail_factor ; /**< hbs_failure_threshold = threshold factor * hosts */ + char* hbs_failure_action ; /**< action to take on host heartbeat failure */ char* mgmnt_iface ; /**< management interface 
name pointer */ char* infra_iface ; /**< infrastructure interface name pointer */ diff --git a/mtce-common/cgts-mtce-common-1.0/common/nodeClass.cpp b/mtce-common/cgts-mtce-common-1.0/common/nodeClass.cpp index eb941704..a629cf77 100755 --- a/mtce-common/cgts-mtce-common-1.0/common/nodeClass.cpp +++ b/mtce-common/cgts-mtce-common-1.0/common/nodeClass.cpp @@ -258,7 +258,7 @@ nodeLinkClass::nodeLinkClass() hbs_minor_threshold = HBS_MINOR_THRESHOLD ; hbs_degrade_threshold = HBS_DEGRADE_THRESHOLD ; hbs_failure_threshold = HBS_FAILURE_THRESHOLD ; - + hbs_failure_action = HBS_FAILURE_ACTION__FAIL ; hbs_silent_fault_detector = 0 ; hbs_silent_fault_logged = false ; @@ -653,14 +653,14 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname ) ptr->vimEvent.buf = NULL ; ptr->httpReq.buf = NULL ; - + /* log throttles */ ptr->stall_recovery_log_throttle = 0 ; ptr->stall_monitor_log_throttle = 0 ; ptr->unexpected_pulse_log_throttle = 0 ; ptr->lookup_mismatch_log_throttle = 0 ; - ptr->log_throttle = 0 ; ptr->no_work_log_throttle = 0 ; + ptr->no_rri_log_throttle = 0 ; ptr->degrade_mask = ptr->degrade_mask_save = DEGRADE_MASK_NONE ; @@ -1615,13 +1615,15 @@ int nodeLinkClass::alarm_config_clear ( struct nodeLinkClass::node * node_ptr ) } /* Generate a log and a critical alarm if the node enable failed */ -int nodeLinkClass::alarm_enabled_failure ( struct nodeLinkClass::node * node_ptr ) +int nodeLinkClass::alarm_enabled_failure ( struct nodeLinkClass::node * node_ptr, bool want_degrade ) { - if ( (node_ptr->degrade_mask & DEGRADE_MASK_ENABLE) == 0 ) + if ( want_degrade ) { - node_ptr->degrade_mask |= DEGRADE_MASK_ENABLE ; + if ( (node_ptr->degrade_mask & DEGRADE_MASK_ENABLE) == 0 ) + { + node_ptr->degrade_mask |= DEGRADE_MASK_ENABLE ; + } } - if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL ) { elog ("%s critical enable failure\n", node_ptr->hostname.c_str()); @@ -4466,7 +4468,10 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, 
iface_enum iface } else { - alarm_enabled_failure (node_ptr); + //bool want_degrade = true ; + //if ( this->hbs_failure_action == HBS_FAILURE_ACTION__ALARM ) + // want_degrade = false ; + // alarm_enabled_failure (node_ptr, want_degrade); mnfa_add_host ( node_ptr , iface ); @@ -4487,8 +4492,6 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface nodeLinkClass::set_availStatus ( hostname, MTC_AVAIL_STATUS__FAILED ); - alarm_enabled_failure (node_ptr); - if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE ) && ( node_ptr->adminAction != MTC_ADMIN_ACTION__UNLOCK )) { @@ -4526,11 +4529,31 @@ void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface ) for ( int i = 0 ; i < MAX_IFACES ; i++ ) { node_ptr->heartbeat_failed[i] = false ; + if ( i == MGMNT_IFACE ) + { + node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ; + node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ; + } + if ( i == INFRA_IFACE ) + { + node_ptr->alarms[HBS_ALARM_ID__HB_INFRA] = FM_ALARM_SEVERITY_CLEAR ; + node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_INFRA ; + } } } else { node_ptr->heartbeat_failed[iface] = false ; + if ( iface == MGMNT_IFACE ) + { + node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ; + node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ; + } + else if ( iface == INFRA_IFACE ) + { + node_ptr->alarms[HBS_ALARM_ID__HB_INFRA] = FM_ALARM_SEVERITY_CLEAR ; + node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_INFRA ; + } } } @@ -4576,7 +4599,7 @@ void nodeLinkClass::manage_heartbeat_degrade ( string hostname, iface_enum iface } mnfa_add_host ( node_ptr, iface ); - + if ( nodeLinkClass::get_operState ( hostname ) == MTC_OPER_STATE__ENABLED ) { if ( iface == MGMNT_IFACE ) @@ -7074,7 +7097,7 @@ void nodeLinkClass::manage_autorecovery ( struct nodeLinkClass::node * node_ptr } else { - alarm_enabled_failure ( node_ptr ) ; + alarm_enabled_failure ( node_ptr , true ) ; } allStateChange ( node_ptr, 
node_ptr->adminState, @@ -7155,7 +7178,7 @@ void nodeLinkClass::force_full_enable ( struct nodeLinkClass::node * node_ptr ) plog ("%s Forcing Full Enable Sequence\n", node_ptr->hostname.c_str()); /* Raise Critical Enable Alarm */ - alarm_enabled_failure ( node_ptr ); + alarm_enabled_failure ( node_ptr, true ); allStateChange ( node_ptr, node_ptr->adminState, MTC_OPER_STATE__DISABLED, MTC_AVAIL_STATUS__FAILED ); enableStageChange ( node_ptr, MTC_ENABLE__FAILURE ); @@ -7359,7 +7382,18 @@ bool nodeLinkClass::get_hbs_monitor_state ( string & hostname, int iface ) node_ptr = nodeLinkClass::getNode ( hostname ); if ( node_ptr != NULL ) { + int rri_max = this->hosts ; state = node_ptr->monitor[iface] ; + if ( state == true ) + { + wlog_throttled (node_ptr->no_rri_log_throttle, rri_max, + "%s Not Offering RRI (%d)\n", + hostname.c_str(), this->hosts ); + } + else + { + node_ptr->no_rri_log_throttle = 0 ; + } } } return (state); @@ -7539,6 +7573,31 @@ int nodeLinkClass::create_pulse_list ( iface_enum iface ) return (pulses[iface]); } +/** Clear heartbeat stats in support of failed heartbeat restart */ +void nodeLinkClass::hbs_clear_all_stats ( void ) +{ + ilog ("clearing all hearbeat stats\n"); + for ( struct node * ptr = head ; ptr != NULL ; ptr = ptr->next ) + { + for ( int iface = 0 ; iface < MAX_IFACES ; iface++ ) + { + ptr->max_count[iface] = 0 ; + ptr->hbs_count[iface] = 0 ; + ptr->hbs_misses_count[iface] = 0 ; + ptr->b2b_pulses_count[iface] = 0 ; + ptr->b2b_misses_count[iface] = 0 ; + ptr->hbs_minor_count[iface] = 0 ; + ptr->hbs_degrade_count[iface] = 0 ; + ptr->hbs_failure_count[iface] = 0 ; + ptr->hbs_minor[iface] = false ; + ptr->hbs_degrade[iface] = false ; + ptr->hbs_failure[iface] = false ; + ptr->heartbeat_failed[iface] = false ; + } + if (( ptr->next == NULL ) || ( ptr == tail )) + break ; + } +} /** Build the Reasource Reference Array */ void nodeLinkClass::build_rra ( void ) @@ -7717,7 +7776,7 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, 
iface_enum iface, bool cle * if this interfaces failed and has not yet received the * required number of back to back pulses needed for recovery */ clear_b2b_misses_count = false ; - ilog ("%s %s heartbeat failure recovery (%d of %d)\n", + dlog ("%s %s heartbeat failure recovery (%d of %d)\n", node_ptr->hostname.c_str(), get_iface_name_str(iface), ptr->b2b_pulses_count[iface], @@ -7870,8 +7929,8 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle } /** This utility will try and remove a pluse from the pulse - * linked list first by index and then by hostname. - * + * linked list first by index and then by hostname. + * * By index does not require a lookup whereas hostname does */ int nodeLinkClass::remove_pulse ( string & hostname, iface_enum iface, int index, unsigned int flags ) { @@ -7889,10 +7948,7 @@ int nodeLinkClass::remove_pulse ( string & hostname, iface_enum iface, int index { if ( hostname.compare("localhost") ) { - if ( get_hbs_monitor_state ( hostname , iface ) == true ) - { - wlog ("%s Not Offering RRI\n", hostname.c_str()); - } + get_hbs_monitor_state ( hostname , iface ) ; } else { @@ -7914,7 +7970,7 @@ void nodeLinkClass::clear_pulse_list ( iface_enum iface ) } pulse_list[iface].head_ptr = NULL ; pulse_list[iface].tail_ptr = NULL ; - + if ( ptr != NULL ) { ptr->linknum[iface] = 0 ; @@ -7929,6 +7985,15 @@ void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_p if ( this->heartbeat != true ) return ; + if ( this->hbs_failure_action == HBS_FAILURE_ACTION__NONE ) + { + dlog ("%s dropping heartbeat alarm request (%s:%s) ; action none\n", + node_ptr->hostname.c_str(), + alarmUtil_getSev_str(sev).c_str(), + get_iface_name_str(iface) ); + return ; + } + bool make_alarm_call = false ; alarm_id_enum id ; EFmAlarmStateT state = FM_ALARM_STATE_SET ; @@ -8025,7 +8090,7 @@ int nodeLinkClass::lost_pulses ( iface_enum iface ) { if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold ) { - ilog 
("%-13s %s Pulse Miss (%d) (log throttled to every %d)\n", + ilog ("%s %s Pulse Miss (%d) (log throttled to every %d)\n", pulse_ptr->hostname.c_str(), get_iface_name_str(iface), pulse_ptr->b2b_misses_count[iface], @@ -8034,7 +8099,7 @@ int nodeLinkClass::lost_pulses ( iface_enum iface ) /* Once the misses exceed 25 then throttle the logging to avoid flooding */ if ( (pulse_ptr->b2b_misses_count[iface] & 0xfff) == 0 ) { - ilog ("%-13s %s Pulse Miss (%d)\n", pulse_ptr->hostname.c_str(), + ilog ("%s %s Pulse Miss (%d)\n", pulse_ptr->hostname.c_str(), get_iface_name_str(iface), pulse_ptr->b2b_misses_count[iface] ); } @@ -8043,27 +8108,27 @@ int nodeLinkClass::lost_pulses ( iface_enum iface ) { if ( pulse_ptr->b2b_misses_count[iface] > hbs_failure_threshold ) { - ilog ("%-13s %s Pulse Miss (%3d) (in failure)\n", pulse_ptr->hostname.c_str(), + ilog ("%s %s Pulse Miss (%3d) (in failure)\n", pulse_ptr->hostname.c_str(), get_iface_name_str(iface), pulse_ptr->b2b_misses_count[iface] ); } else if ( pulse_ptr->b2b_misses_count[iface] > hbs_degrade_threshold ) { - ilog ("%-13s %s Pulse Miss (%3d) (max:%3d) (in degrade)\n", pulse_ptr->hostname.c_str(), + ilog ("%s %s Pulse Miss (%3d) (max:%3d) (in degrade)\n", pulse_ptr->hostname.c_str(), get_iface_name_str(iface), pulse_ptr->b2b_misses_count[iface], pulse_ptr->max_count[iface]); } else if ( pulse_ptr->b2b_misses_count[iface] > hbs_minor_threshold ) { - ilog ("%-13s %s Pulse Miss (%3d) (max:%3d) (in minor)\n", pulse_ptr->hostname.c_str(), + ilog ("%s %s Pulse Miss (%3d) (max:%3d) (in minor)\n", pulse_ptr->hostname.c_str(), get_iface_name_str(iface), pulse_ptr->b2b_misses_count[iface] , pulse_ptr->max_count[iface]); } else { - ilog ("%-13s %s Pulse Miss (%3d) (max:%3d)\n", pulse_ptr->hostname.c_str(), + ilog ("%s %s Pulse Miss (%3d) (max:%3d)\n", pulse_ptr->hostname.c_str(), get_iface_name_str(iface), pulse_ptr->b2b_misses_count[iface], pulse_ptr->max_count[iface]); @@ -8072,7 +8137,7 @@ int nodeLinkClass::lost_pulses ( 
iface_enum iface ) } else { - dlog ("%-13s %s Pulse Miss (%d)\n", pulse_ptr->hostname.c_str(), + dlog ("%s %s Pulse Miss (%d)\n", pulse_ptr->hostname.c_str(), get_iface_name_str(iface), pulse_ptr->b2b_misses_count[iface] ); } diff --git a/mtce-common/cgts-mtce-common-1.0/common/nodeClass.h b/mtce-common/cgts-mtce-common-1.0/common/nodeClass.h index 46b5398a..c6bd1ed7 100755 --- a/mtce-common/cgts-mtce-common-1.0/common/nodeClass.h +++ b/mtce-common/cgts-mtce-common-1.0/common/nodeClass.h @@ -542,6 +542,9 @@ private: /** Resource reference identifier, aka resource reference array index */ int rri ; + /** variable used to throttle the rri log */ + int no_rri_log_throttle ; + /** @} private_Heartbeat_variables */ /** @@ -1023,7 +1026,7 @@ private: int lazy_graceful_fs_reboot ( struct nodeLinkClass::node * node_ptr ); int alarm_enabled_clear ( struct nodeLinkClass::node * node_ptr, bool force ); - int alarm_enabled_failure ( struct nodeLinkClass::node * node_ptr ); + int alarm_enabled_failure ( struct nodeLinkClass::node * node_ptr, bool want_degrade ); int alarm_insv_clear ( struct nodeLinkClass::node * node_ptr, bool force ); int alarm_insv_failure ( struct nodeLinkClass::node * node_ptr ); @@ -1296,6 +1299,9 @@ public: /** The number of heartbeat misses that result in a failed state */ int hbs_failure_threshold ; + /** enumerated failure action code ; fail, degrade, alarm, none */ + hbs_failure_action_enum hbs_failure_action ; + /** Running Resource Reference Identifier */ int rrri ; @@ -1427,6 +1433,7 @@ public: * node failure avoidance threshold and until there are no more * in service trouble hosts */ bool mnfa_active ; + void mnfa_cancel( void ); std::list mnfa_awol_list ; void mnfa_timeout_handler ( void ); @@ -1526,6 +1533,10 @@ public: //#ifdef WANT_HBS /** Add a host to the Node list */ int add_heartbeat_host ( const node_inv_type &inv ); + + /** Clear heartbeat stats for all hosts */ + void hbs_clear_all_stats ( void ) ; + // #endif void host_print ( struct 
nodeLinkClass::node * node_ptr ); diff --git a/mtce-common/cgts-mtce-common-1.0/daemon/daemon_common.h b/mtce-common/cgts-mtce-common-1.0/daemon/daemon_common.h index e7e6c85d..61abb0ab 100755 --- a/mtce-common/cgts-mtce-common-1.0/daemon/daemon_common.h +++ b/mtce-common/cgts-mtce-common-1.0/daemon/daemon_common.h @@ -174,6 +174,23 @@ int client_timeout_handler ( void * user, const char * name, const char * value); +/* User selectable heartbeat failure actions */ +typedef enum +{ + HBS_FAILURE_ACTION__NONE = 0, /* no heartbeat tally */ + HBS_FAILURE_ACTION__ALARM = 1, /* alarm only */ + HBS_FAILURE_ACTION__DEGRADE = 2, /* degrade and alarm */ + HBS_FAILURE_ACTION__FAIL = 3, /* fail and alarm */ +} hbs_failure_action_enum ; + +#define HBS_FAILURE_ACTION__NONE_STR ((const char *)("none")) +#define HBS_FAILURE_ACTION__ALARM_STR ((const char *)("alarm")) +#define HBS_FAILURE_ACTION__DEGRADE_STR ((const char *)("degrade")) +#define HBS_FAILURE_ACTION__FAIL_STR ((const char *)("fail")) + +hbs_failure_action_enum +get_hbs_failure_action ( daemon_config_type & config ); + /** Test Head Entry */ int daemon_run_testhead ( void ); /** diff --git a/mtce-common/cgts-mtce-common-1.0/daemon/daemon_config.cpp b/mtce-common/cgts-mtce-common-1.0/daemon/daemon_config.cpp index a8b02c30..80f4c801 100644 --- a/mtce-common/cgts-mtce-common-1.0/daemon/daemon_config.cpp +++ b/mtce-common/cgts-mtce-common-1.0/daemon/daemon_config.cpp @@ -195,6 +195,48 @@ int timeout_config_handler ( void * user, return (PASS); } +/* *********************************************************************** + * + * Name : get_hbs_failure_action + * + * Description : Convert already loaded heartbeat failure action config + * string into its equivalent enumerated type. + * See code comments below for more detail. + * + * Assumptions : Both mtcAgent and hbsAgent need this conversion. 
+ * + * Returns : Converted enum value ; error/default is 'fail' action + * + * ***********************************************************************/ +hbs_failure_action_enum get_hbs_failure_action ( + daemon_config_type & config ) +{ + /* push the Heartbeat Failure Action character array into string + * for easy/safe compare */ + string hbs_failure_action = config.hbs_failure_action ; + + /* default action is 'fail' */ + hbs_failure_action_enum action_enum = HBS_FAILURE_ACTION__FAIL ; + + /* look for 'none' action - hbsAgent only cares about this one + * so that it knows to clear or not to raise any alarms for heartbeat + * failures ; or degrades for that matter */ + if ( hbs_failure_action == HBS_FAILURE_ACTION__NONE_STR ) + action_enum = HBS_FAILURE_ACTION__NONE ; + + /* look for degrade action - alarms are still managed in this mode */ + else if ( hbs_failure_action == HBS_FAILURE_ACTION__DEGRADE_STR ) + action_enum = HBS_FAILURE_ACTION__DEGRADE ; + + /* look for 'alarm' action - no host degrade in this case */ + else if ( hbs_failure_action == HBS_FAILURE_ACTION__ALARM_STR ) + action_enum = HBS_FAILURE_ACTION__ALARM ; + + ilog("HBS Action : %s\n", config.hbs_failure_action ); + return (action_enum); +} + + /* System Inventory Config Reader */ int sysinv_config_handler ( void * user, const char * section, diff --git a/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsAgent.cpp b/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsAgent.cpp index fa19c1e8..692d8a8e 100755 --- a/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsAgent.cpp +++ b/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsAgent.cpp @@ -72,6 +72,8 @@ using namespace std; static string unexpected_pulse_list[MAX_IFACES] = { "" , "" } ; static string arrival_histogram[MAX_IFACES] = { "" , "" } ; +static std::list hostname_inventory ; + /** This heartbeat service inventory is tracked by * the same nodeLinkClass that maintenance uses. 
* @@ -88,8 +90,6 @@ int module_init ( void ) return (PASS); } -static unsigned int my_nodetype= CGTS_NODE_NULL ; - void daemon_sigchld_hdlr ( void ) { ; /* dlog("Received SIGCHLD ... no action\n"); */ @@ -107,12 +107,19 @@ daemon_config_type * daemon_get_cfg_ptr () { return &hbs_config ; } * @see hbsBase.h for hbs_socket_type struct format. */ static hbs_socket_type hbs_sock ; - msgSock_type * get_mtclogd_sockPtr ( void ) { return (&hbs_sock.mtclogd); } +/** + * Module Control Struct - The allocated struct + * @see hbsBase.h for hbs_ctrl_type struct format. + */ +static hbs_ctrl_type hbs_ctrl ; +hbs_ctrl_type * get_hbs_ctrl_ptr () { return &hbs_ctrl ; } + + #define SCHED_MONITOR__MAIN_LOOP ((const char *) "---> scheduling latency : main loop :") #define SCHED_MONITOR__RECEIVER ((const char *) "---> scheduling latency : rx pulses :") void monitor_scheduling ( unsigned long long & this_time, unsigned long long & prev_time , int data, const char * label_ptr ) @@ -241,25 +248,31 @@ static int hbs_config_handler ( void * user, hbsInv.hbs_failure_threshold = atoi(value); config_ptr->mask |= CONFIG_AGENT_HBS_FAILURE ; } - if (MATCH("agent", "hbs_calibrate_threshold")) + if (MATCH("agent", "heartbeat_failure_action")) { - config_ptr->hbs_calibrate_threshold = atoi(value); - } - if (MATCH("agent", "hbs_calibrate_period_factor")) - { - config_ptr->hbs_calibrate_period_factor = atoi(value); - } - if (MATCH("agent", "hbs_calibrate_minor_factor")) - { - config_ptr->hbs_calibrate_minor_factor = atoi(value); - } - if (MATCH("agent", "hbs_calibrate_degrade_factor")) - { - config_ptr->hbs_calibrate_degrade_factor = atoi(value); - } - if (MATCH("agent", "hbs_calibrate_fail_factor")) - { - config_ptr->hbs_calibrate_fail_factor = atoi(value); + hbs_failure_action_enum current_action = hbsInv.hbs_failure_action ; + /* + * 1. free previous memory from strdup on reconfig + * 2. get the new value string + * 3. convert it to an enum + * 4. 
if failure action is 'none' then set the clear_alarms audit bool + * telling the main loop to clear all heartbeat related alarms. + * 5. clear all stats if the action is changed from none to other. + * + * Note: The none action prevents any new alarms from being raised. + */ + if ( config_ptr->hbs_failure_action ) + free(config_ptr->hbs_failure_action); + config_ptr->hbs_failure_action = strdup(value); + + /* get the configured action */ + hbsInv.hbs_failure_action = get_hbs_failure_action(hbs_config); + + if ( current_action != hbsInv.hbs_failure_action ) + { + hbs_ctrl.clear_alarms = true ; + hbsInv.hbs_clear_all_stats(); + } } if (MATCH("agent", "multicast")) { @@ -334,6 +347,7 @@ int daemon_configure ( void ) /* Read the ini */ hbs_config.mask = 0 ; + get_debug_options ( MTCE_CONF_FILE, &hbs_config ); if (ini_parse(MTCE_CONF_FILE, hbs_config_handler, &hbs_config) < 0) { elog("Can't load '%s'\n", MTCE_CONF_FILE ); @@ -346,8 +360,6 @@ int daemon_configure ( void ) return (FAIL_LOAD_INI); } - get_debug_options ( MTCE_CONF_FILE, &hbs_config ); - /* Verify loaded config against an expected mask * as an ini file fault detection method */ if ( hbs_config.mask != CONFIG_AGENT_MASK ) @@ -362,15 +374,13 @@ int daemon_configure ( void ) hbsInv.hbs_minor_threshold = hbsInv.hbs_degrade_threshold ; } - // hbsInv.recalibrate_thresholds (); - /* Log the startup settings */ ilog("Realtime Pri: RR/%i \n", hbs_config.scheduling_priority ); ilog("Pulse Period: %i msec\n", hbsInv.hbs_pulse_period ); ilog("Minor Thld: %i misses\n", hbsInv.hbs_minor_threshold ); ilog("Degrade Thld: %i misses\n", hbsInv.hbs_degrade_threshold ); ilog("Failure Thld: %i misses\n", hbsInv.hbs_failure_threshold ); - ilog("Multicast: %s\n", hbs_config.multicast ); + ilog("Multicast : %s\n", hbs_config.multicast ); hbs_config.mgmnt_iface = daemon_get_iface_master ( hbs_config.mgmnt_iface ); ilog("Mgmnt iface : %s\n", hbs_config.mgmnt_iface ); @@ -1014,12 +1024,19 @@ int daemon_init ( string iface, string 
nodetype ) /* Not used by this service */ UNUSED(nodetype); + /* Initialize socket construct and pointer to it */ - memset ( &hbs_sock, 0, sizeof(hbs_sock)); + MEMSET_ZERO ( hbs_sock ); + + /* Initialize the hbs control struct */ + MEMSET_ZERO ( hbs_ctrl ); /* initialize the timer */ mtcTimer_init ( hbsTimer, "controller", "heartbeat" ); + /* start with no inventory */ + hostname_inventory.clear(); + /* Assign interface to config */ hbs_config.mgmnt_iface = (char*)iface.data() ; @@ -1032,8 +1049,8 @@ int daemon_init ( string iface, string nodetype ) hbsInv.system_type = daemon_system_type (); /* convert node type to integer */ - my_nodetype = get_host_function_mask ( nodetype ) ; - ilog ("Node Type : %s (%d)\n", nodetype.c_str(), my_nodetype ); + hbs_ctrl.nodetype = get_host_function_mask ( nodetype ) ; + ilog ("Node Type : %s (%d)\n", nodetype.c_str(), hbs_ctrl.nodetype ); /* Bind signal handlers */ if ( daemon_signal_init () != PASS ) @@ -1134,7 +1151,7 @@ void daemon_service_run ( void ) /* CGTS 4114: Small Footprint: Alarm 200.005 remains active after connectivity restored * * Clear self alarms */ - hbsAlarm_clear_all ( hbsInv.my_hostname ); + hbsAlarm_clear_all ( hbsInv.my_hostname, hbsInv.infra_network_provisioned ); /* add this host as inventory to hbsAgent * Although this host is not monitored for heartbeat, @@ -1254,6 +1271,29 @@ void daemon_service_run ( void ) } } + /* audit for forced alarms clear due to ... + * + * 1. heartbeat failure action being set to none + * 2. ... 
future + * + */ + if ( hbs_ctrl.clear_alarms == true ) + { + if ( goenabled == true ) + { + std::list::iterator hostname_ptr ; + ilog ("clearing all heartbeat alarms for all hosts due to 'none' action"); + for ( hostname_ptr = hostname_inventory.begin(); + hostname_ptr != hostname_inventory.end() ; + hostname_ptr++ ) + { + hbsAlarm_clear_all ( hostname_ptr->data(), hbsInv.infra_network_provisioned ); + hbsInv.manage_heartbeat_clear ( hostname_ptr->data(), MAX_IFACES ); + } + hbs_ctrl.clear_alarms = false ; + } + } + /***************** Service Sockets ********************/ /* Initialize the master fd_set and clear socket list */ @@ -1356,10 +1396,15 @@ void daemon_service_run ( void ) inv.name = hostname ; inv.nodetype = msg.parm[0]; hbsInv.add_heartbeat_host ( inv ) ; + hostname_inventory.push_back ( hostname ); ilog ("%s added to heartbeat service (%d)\n", hostname.c_str(), inv.nodetype ); /* clear any outstanding alarms on the ADD */ - hbsAlarm_clear_all ( hostname ); + if ( hbsInv.hbs_failure_action != HBS_FAILURE_ACTION__NONE ) + { + hbsAlarm_clear_all ( hostname, + hbsInv.infra_network_provisioned ); + } } else if ( msg.cmd == MTC_CMD_DEL_HOST ) { @@ -1367,12 +1412,16 @@ void daemon_service_run ( void ) { hbsInv.mon_host ( hostname, (iface_enum)iface, false, false ); } - + hostname_inventory.remove ( hostname ); hbsInv.del_host ( hostname ); ilog ("%s deleted from heartbeat service\n", hostname.c_str()); /* clear any outstanding alarms on the DEL */ - hbsAlarm_clear_all ( hostname ); + if ( hbsInv.hbs_failure_action != HBS_FAILURE_ACTION__NONE ) + { + hbsAlarm_clear_all ( hostname, + hbsInv.infra_network_provisioned ); + } } else if ( msg.cmd == MTC_CMD_STOP_HOST ) { @@ -1484,6 +1533,13 @@ void daemon_service_run ( void ) counter = 1 ; } + else if ( hbsInv.hbs_failure_action == HBS_FAILURE_ACTION__NONE ) + { + wlog_throttled (counter, 100000, "Heartbeat disabled by 'none' action\n"); + usleep (50000) ; + continue ; + } + /* Send a log indicating the main loop 
has recognized * a state change to enable */ else if (( hbsInv.hbs_state_change == true ) && diff --git a/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsAlarm.cpp b/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsAlarm.cpp index 7fc96b75..da46cc89 100644 --- a/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsAlarm.cpp +++ b/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsAlarm.cpp @@ -31,10 +31,11 @@ using namespace std; #include "hbsAlarm.h" /* for ... this module header */ #include "alarm.h" /* for ... alarm send message to mtcalarmd */ -void hbsAlarm_clear_all ( string hostname ) +void hbsAlarm_clear_all ( string hostname, bool infra ) { alarm_clear ( hostname, MGMNT_HB_ALARM_ID, MGMNT_NAME ); - alarm_clear ( hostname, INFRA_HB_ALARM_ID, INFRA_NAME ); + if ( infra ) + alarm_clear ( hostname, INFRA_HB_ALARM_ID, INFRA_NAME ); alarm_clear ( hostname , PMOND_ALARM_ID, PMON_NAME ); } diff --git a/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsAlarm.h b/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsAlarm.h index 8f2f204f..25a88461 100644 --- a/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsAlarm.h +++ b/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsAlarm.h @@ -27,6 +27,6 @@ using namespace std; #define INFRA_NAME ((const char *)"Infrastructure") #define PMON_NAME ((char *)"pmond") -void hbsAlarm_clear_all ( string hostname ); +void hbsAlarm_clear_all ( string hostname, bool infra ); #endif /* __HBSALARM_H__ */ diff --git a/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsBase.h b/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsBase.h index d6725dc3..8b5cf2d3 100755 --- a/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsBase.h +++ b/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsBase.h @@ -56,6 +56,13 @@ const char rsp_msg_header [HBS_HEADER_SIZE+1] = {"cgts pulse rsp:"}; #define HBS_MAX_MSG (HBS_HEADER_SIZE+MAX_CHARS_HOSTNAME) +/* Heartbeat control structure */ +typedef struct +{ + unsigned int nodetype ; + bool clear_alarms ; +} hbs_ctrl_type ; + /* A heartbeat service message * if 
this structire is changed then * hbs_pulse_request needs to be looked at diff --git a/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsStubs.cpp b/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsStubs.cpp index a864ab3e..70b25df3 100644 --- a/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsStubs.cpp +++ b/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsStubs.cpp @@ -359,6 +359,7 @@ int mtcAlarm_critical_log ( string hostname, mtc_alarm_id_enum id ) { UNUSED(ho int mtcAlarm_major_log ( string hostname, mtc_alarm_id_enum id ) { UNUSED(hostname); id = id ; return (PASS); } int mtcAlarm_minor_log ( string hostname, mtc_alarm_id_enum id ) { UNUSED(hostname); id = id ; return (PASS); } int mtcAlarm_warning_log ( string hostname, mtc_alarm_id_enum id ) { UNUSED(hostname); id = id ; return (PASS); } -int mtcAlarm_log ( string hostname, mtc_alarm_id_enum id ) { UNUSED(hostname); id = id ; return (PASS); } +int mtcAlarm_log ( string hostname, mtc_alarm_id_enum id, string str ) +{ UNUSED(hostname); id = id ; UNUSED(str) ; return (PASS); } string mtcAlarm_getId_str ( mtc_alarm_id_enum id ) { id = id ; return ("stub"); } diff --git a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcAlarm.cpp b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcAlarm.cpp index 27b1f88a..8262da9f 100644 --- a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcAlarm.cpp +++ b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcAlarm.cpp @@ -128,7 +128,7 @@ void mtcAlarm_init ( void ) "If manual or auto-recovery is consistently unable to recover host to the unlocked-enabled " "state contact next level of support or lock and replace failing Host."); - /** Board Management Controller Access Alarm ************************************/ + /** Init Board Management Controller Access Alarm Entry ******************/ ptr = &alarm_list[MTC_ALARM_ID__BM]; memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT))); @@ -154,7 +154,7 @@ void mtcAlarm_init ( void ) snprintf( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, "Check 
Host's board management config and connectivity."); - /** Controller Failure Alarm ****************************************************/ + /** Init Controller Failure Alarm Entry **********************************/ ptr = &alarm_list[MTC_ALARM_ID__CH_CONT]; memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT))); @@ -184,7 +184,7 @@ void mtcAlarm_init ( void ) "running on this host. If lock action fails then contact next level " "of support to investigate and recover."); - /** Compute Failure Alarm ****************************************************/ + /** Init Compute Failure Alarm Entry *************************************/ ptr = &alarm_list[MTC_ALARM_ID__CH_COMP]; memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT))); @@ -213,7 +213,7 @@ void mtcAlarm_init ( void ) "and Switch Activity (Swact) to it as soon as possible. If the alarm " "persists then Lock/Unlock host to recover its local compute service."); - /** Add Event Log ****************************************************/ + /** Init Event Log Entry *************************************************/ ptr = &alarm_list[MTC_LOG_ID__EVENT]; memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT))); @@ -236,6 +236,103 @@ void mtcAlarm_init ( void ) ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; /* Dynamic */ snprintf ( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, "%s", ""); + + /** Init Command Log Entry ***********************************************/ + + ptr = &alarm_list[MTC_LOG_ID__COMMAND]; + memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT))); + snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", COMMAND_LOG_ID); + + ptr->name = "Maintenance Command" ; + + ptr->minor_reason = + ptr->major_reason = + ptr->critl_reason = + ptr->clear_reason = ""; + + ptr->alarm.alarm_type = FM_ALARM_TYPE_UNKNOWN ; + ptr->alarm.probable_cause = FM_ALARM_CAUSE_UNKNOWN ; + ptr->alarm.inhibit_alarms = FM_FALSE ; + ptr->alarm.service_affecting = FM_FALSE ; + ptr->alarm.suppression = FM_FALSE ; + + ptr->alarm.severity = 
FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */ + ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; /* Dynamic */ + + snprintf ( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, "%s", ""); + + /** Init Config Log Entry ***********************************************/ + + ptr = &alarm_list[MTC_LOG_ID__CONFIG]; + memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT))); + snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", CONFIG_LOG_ID); + + ptr->name = "Maintenance Config" ; + + ptr->minor_reason = + ptr->major_reason = + ptr->critl_reason = + ptr->clear_reason = ""; + + ptr->alarm.alarm_type = FM_ALARM_TYPE_UNKNOWN ; + ptr->alarm.probable_cause = FM_ALARM_CAUSE_UNKNOWN ; + ptr->alarm.inhibit_alarms = FM_FALSE ; + ptr->alarm.service_affecting = FM_FALSE ; + ptr->alarm.suppression = FM_FALSE ; + + ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */ + ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; /* Dynamic */ + + snprintf ( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, "%s", ""); + + /** Init State Change Log Entry ******************************************/ + + ptr = &alarm_list[MTC_LOG_ID__STATECHANGE]; + memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT))); + snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", STATECHANGE_LOG_ID); + + ptr->name = "Maintenance State Change" ; + + ptr->minor_reason = + ptr->major_reason = + ptr->critl_reason = + ptr->clear_reason = ""; + + ptr->alarm.alarm_type = FM_ALARM_TYPE_UNKNOWN ; + ptr->alarm.probable_cause = FM_ALARM_CAUSE_UNKNOWN ; + ptr->alarm.inhibit_alarms = FM_FALSE ; + ptr->alarm.service_affecting = FM_FALSE ; + ptr->alarm.suppression = FM_FALSE ; + + ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */ + ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; /* Dynamic */ + + snprintf ( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, "%s", ""); + + /** Init Service Status Log Entry ****************************************/ + + ptr = &alarm_list[MTC_LOG_ID__SERVICESTATUS]; + memset 
(&ptr->alarm, 0, (sizeof(SFmAlarmDataT))); + snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", SERVICESTATUS_LOG_ID); + + ptr->name = "Maintenance Service Status Change" ; + + ptr->minor_reason = + ptr->major_reason = + ptr->critl_reason = + ptr->clear_reason = ""; + + ptr->alarm.alarm_type = FM_ALARM_TYPE_UNKNOWN ; + ptr->alarm.probable_cause = FM_ALARM_CAUSE_UNKNOWN ; + ptr->alarm.inhibit_alarms = FM_FALSE ; + ptr->alarm.service_affecting = FM_FALSE ; + ptr->alarm.suppression = FM_FALSE ; + + ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */ + ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; /* Dynamic */ + + snprintf ( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, "%s", ""); + } string _getIdentity ( mtc_alarm_id_enum id ) @@ -251,6 +348,7 @@ string _getIdentity ( mtc_alarm_id_enum id ) case MTC_LOG_ID__EVENT: return (EVENT_LOG_ID); case MTC_LOG_ID__COMMAND: return (COMMAND_LOG_ID); case MTC_LOG_ID__STATECHANGE: return (STATECHANGE_LOG_ID); + case MTC_LOG_ID__CONFIG: return (CONFIG_LOG_ID); default: return ("200.000"); } } @@ -493,7 +591,7 @@ int mtcAlarm_warning_log ( string hostname, mtc_alarm_id_enum id ) } /** Create a neutral customer log */ -int mtcAlarm_log ( string hostname, mtc_alarm_id_enum id ) +int mtcAlarm_log ( string hostname, mtc_alarm_id_enum id, string str ) { if ( id < MTC_ALARM_ID__END ) { @@ -750,6 +848,39 @@ int mtcAlarm_log ( string hostname, mtc_alarm_id_enum id ) "board management controller has been 're-provisioned'" ); found = true ; } + else if (( id == MTC_LOG_ID__CONFIG_HB_ACTION_FAIL ) || + ( id == MTC_LOG_ID__CONFIG_HB_ACTION_DEGRADE ) || + ( id == MTC_LOG_ID__CONFIG_HB_ACTION_ALARM ) || + ( id == MTC_LOG_ID__CONFIG_HB_ACTION_NONE )) + { + alarm_list[index].instc_prefix = "config=heartbeat_failure_action" ; + snprintf ( alarm_list[index].alarm.reason_text, + FM_MAX_BUFFER_LENGTH, "%s %s %s", + hostname.data(), + "platform maintenance service parameter 'heartbeat failure action' changed from", + 
str.data()); + found = true ; + } + else if ( id == MTC_LOG_ID__CONFIG_MNFA_TIMEOUT ) + { + alarm_list[index].instc_prefix = "config=mnfa_timeout" ; + snprintf ( alarm_list[index].alarm.reason_text, + FM_MAX_BUFFER_LENGTH, "%s %s %s", + hostname.data(), + "platform maintenance service parameter 'mnfa_timeout' changed from", + str.data()); + found = true ; + } + else if ( id == MTC_LOG_ID__CONFIG_MNFA_THRESHOLD ) + { + alarm_list[index].instc_prefix = "config=mnfa_threshold" ; + snprintf ( alarm_list[index].alarm.reason_text, + FM_MAX_BUFFER_LENGTH, "%s %s %s", + hostname.data(), + "platform maintenance service parameter 'mnfa_threshold' changed from", + str.data()); + found = true ; + } if ( found == true ) { @@ -758,11 +889,6 @@ int mtcAlarm_log ( string hostname, mtc_alarm_id_enum id ) string identity = _getIdentity(index); string instance = _getInstance(index); instance.append(alarm_list[index].instc_prefix); - //wlog ("%s '%s' log (%s.%s)\n", - // hostname.c_str(), - // alarm_list[index].alarm.reason_text, - // identity.c_str(), - // instance.c_str()); /* Want to make this log a critical */ if ( id == MTC_LOG_ID__STATUSCHANGE_REINSTALL_FAILED ) diff --git a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcAlarm.h b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcAlarm.h index 93d27492..25565d4f 100644 --- a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcAlarm.h +++ b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcAlarm.h @@ -25,52 +25,63 @@ using namespace std; /** Maintenance Alarm Abstract Reference IDs */ typedef enum { - MTC_ALARM_ID__LOCK = 0, - MTC_ALARM_ID__CONFIG = 1, - MTC_ALARM_ID__ENABLE = 2, - MTC_ALARM_ID__BM = 3, - MTC_ALARM_ID__CH_CONT = 4, /* Combo Host Controller Failure - with Active Compute */ - MTC_ALARM_ID__CH_COMP = 5, /* Combo Host Compute Failure - on last Controller */ + MTC_ALARM_ID__LOCK, + MTC_ALARM_ID__CONFIG, + MTC_ALARM_ID__ENABLE, + MTC_ALARM_ID__BM, + MTC_ALARM_ID__CH_CONT, /* Combo Host Controller Failure - with Active Compute 
*/ + MTC_ALARM_ID__CH_COMP, /* Combo Host Compute Failure - on last Controller */ - MTC_LOG_ID__EVENT = 6, - MTC_LOG_ID__COMMAND = 7, - MTC_LOG_ID__STATECHANGE = 8, - MTC_ALARM_ID__LAST = 9, + MTC_LOG_ID__EVENT, + MTC_LOG_ID__COMMAND, + MTC_LOG_ID__CONFIG, + MTC_LOG_ID__STATECHANGE, + MTC_LOG_ID__SERVICESTATUS, + MTC_ALARM_ID__LAST, - MTC_LOG_ID__EVENT_ADD = 10, - MTC_LOG_ID__EVENT_RESTART = 11, - MTC_LOG_ID__EVENT_DISCOVERED = 12, - MTC_LOG_ID__EVENT_MNFA_ENTER = 13, - MTC_LOG_ID__EVENT_MNFA_EXIT = 14, + MTC_LOG_ID__EVENT_ADD, + MTC_LOG_ID__EVENT_RESTART, + MTC_LOG_ID__EVENT_DISCOVERED, + MTC_LOG_ID__EVENT_MNFA_ENTER, + MTC_LOG_ID__EVENT_MNFA_EXIT, - MTC_LOG_ID__COMMAND_DELETE = 19, - MTC_LOG_ID__COMMAND_UNLOCK = 20, - MTC_LOG_ID__COMMAND_FORCE_LOCK = 21, - MTC_LOG_ID__COMMAND_SWACT = 22, - MTC_LOG_ID__COMMAND_REINSTALL = 23, - MTC_LOG_ID__COMMAND_BM_PROVISIONED = 24, - MTC_LOG_ID__COMMAND_BM_DEPROVISIONED = 25, - MTC_LOG_ID__COMMAND_BM_REPROVISIONED = 26, + MTC_LOG_ID__COMMAND_DELETE, + MTC_LOG_ID__COMMAND_UNLOCK, + MTC_LOG_ID__COMMAND_FORCE_LOCK, + MTC_LOG_ID__COMMAND_SWACT, + MTC_LOG_ID__COMMAND_REINSTALL, + MTC_LOG_ID__COMMAND_BM_PROVISIONED, + MTC_LOG_ID__COMMAND_BM_DEPROVISIONED, + MTC_LOG_ID__COMMAND_BM_REPROVISIONED, - MTC_LOG_ID__COMMAND_AUTO_REBOOT = 30, - MTC_LOG_ID__COMMAND_MANUAL_REBOOT = 31, - MTC_LOG_ID__COMMAND_AUTO_RESET = 32, - MTC_LOG_ID__COMMAND_MANUAL_RESET = 33, - MTC_LOG_ID__COMMAND_AUTO_POWER_ON = 34, - MTC_LOG_ID__COMMAND_MANUAL_POWER_ON = 35, - MTC_LOG_ID__COMMAND_AUTO_POWER_OFF = 36, - MTC_LOG_ID__COMMAND_MANUAL_POWER_OFF = 37, + MTC_LOG_ID__CONFIG_HB_ACTION_FAIL, + MTC_LOG_ID__CONFIG_HB_ACTION_DEGRADE, + MTC_LOG_ID__CONFIG_HB_ACTION_ALARM, + MTC_LOG_ID__CONFIG_HB_ACTION_NONE, + MTC_LOG_ID__CONFIG_HB_PERIOD, + MTC_LOG_ID__CONFIG_HB_DEGRADE_THRESHOLD, + MTC_LOG_ID__CONFIG_HB_FAILURE_THRESHOLD, + MTC_LOG_ID__CONFIG_MNFA_TIMEOUT, + MTC_LOG_ID__CONFIG_MNFA_THRESHOLD, - - MTC_LOG_ID__STATUSCHANGE_ENABLED = 40, - 
MTC_LOG_ID__STATUSCHANGE_DISABLED = 41, - MTC_LOG_ID__STATUSCHANGE_ONLINE = 42, - MTC_LOG_ID__STATUSCHANGE_OFFLINE = 43, - MTC_LOG_ID__STATUSCHANGE_FAILED = 44, - MTC_LOG_ID__STATUSCHANGE_REINSTALL_FAILED = 45, - MTC_LOG_ID__STATUSCHANGE_REINSTALL_COMPLETE = 46, + MTC_LOG_ID__COMMAND_AUTO_REBOOT, + MTC_LOG_ID__COMMAND_MANUAL_REBOOT, + MTC_LOG_ID__COMMAND_AUTO_RESET, + MTC_LOG_ID__COMMAND_MANUAL_RESET, + MTC_LOG_ID__COMMAND_AUTO_POWER_ON, + MTC_LOG_ID__COMMAND_MANUAL_POWER_ON, + MTC_LOG_ID__COMMAND_AUTO_POWER_OFF, + MTC_LOG_ID__COMMAND_MANUAL_POWER_OFF, - MTC_ALARM_ID__END = 50 + MTC_LOG_ID__STATUSCHANGE_ENABLED, + MTC_LOG_ID__STATUSCHANGE_DISABLED, + MTC_LOG_ID__STATUSCHANGE_ONLINE, + MTC_LOG_ID__STATUSCHANGE_OFFLINE, + MTC_LOG_ID__STATUSCHANGE_FAILED, + MTC_LOG_ID__STATUSCHANGE_REINSTALL_FAILED, + MTC_LOG_ID__STATUSCHANGE_REINSTALL_COMPLETE, + + MTC_ALARM_ID__END } mtc_alarm_id_enum ; @@ -109,6 +120,6 @@ int mtcAlarm_minor_log ( string hostname, mtc_alarm_id_enum id ); int mtcAlarm_warning_log ( string hostname, mtc_alarm_id_enum id ); /** Create a maintenance log */ -int mtcAlarm_log ( string hostname, mtc_alarm_id_enum id ); +int mtcAlarm_log ( string hostname, mtc_alarm_id_enum id, string str = ""); #endif /* __MTCALARM_H__ */ diff --git a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcCtrlMsg.cpp b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcCtrlMsg.cpp index bcc9b5f5..b8fa12b8 100755 --- a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcCtrlMsg.cpp +++ b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcCtrlMsg.cpp @@ -947,8 +947,12 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) if ( msg.cmd == MTC_EVENT_HEARTBEAT_DEGRADE_SET ) { - /* Assert the degrade condition with the 'false' (i.e. 
not clear)*/ - obj_ptr->manage_heartbeat_degrade ( hostname, iface, false ); + if (( obj_ptr->hbs_failure_action == HBS_FAILURE_ACTION__FAIL ) || + ( obj_ptr->hbs_failure_action == HBS_FAILURE_ACTION__DEGRADE )) + { + /* Assert the degrade condition with the 'false' (i.e. not clear)*/ + obj_ptr->manage_heartbeat_degrade ( hostname, iface, false ); + } } else { @@ -985,7 +989,23 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) } string hostname = &msg.buf[0] ; print_mtc_message ( hostname, MTC_CMD_RX, msg, get_iface_name_str(MGMNT_INTERFACE), false ); - obj_ptr->manage_heartbeat_failure ( hostname, iface, false ); + + /* If heartbeat failure action is fail then call the fail handler */ + if ( obj_ptr->hbs_failure_action == HBS_FAILURE_ACTION__FAIL ) + obj_ptr->manage_heartbeat_failure ( hostname, iface, false ); + + /* If heartbeat failure action is degrade then call the degrade handler */ + else if ( obj_ptr->hbs_failure_action == HBS_FAILURE_ACTION__DEGRADE ) + obj_ptr->manage_heartbeat_degrade ( hostname, iface, false ); + + /* Otherwise the action must be alarm only or none ; both of which + * are already handled by the hbsAgent, so do nothing */ + else + { + dlog ("%s heartbeat loss event dropped (%s)\n", + hostname.c_str(), + get_iface_name_str(iface)); + } } } else if ( msg.cmd == MTC_EVENT_PMOND_CLEAR ) diff --git a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeCtrl.cpp b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeCtrl.cpp index 1299f687..f9ce94ae 100755 --- a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeCtrl.cpp +++ b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeCtrl.cpp @@ -330,15 +330,67 @@ static int mtc_ini_handler ( void * user, { UNUSED(user); - if (MATCH("agent", "mnfa_threshold")) + if (MATCH("agent", "heartbeat_failure_action")) { + string cur_action = "" ; + string new_action = "" ; + + /* prevent memory leak over a reconfig */ + if ( mtc_config.hbs_failure_action ) + { + cur_action = 
mtc_config.hbs_failure_action ; + free(mtc_config.hbs_failure_action); + } + new_action = mtc_config.hbs_failure_action = strdup(value); + mtcInv.hbs_failure_action = get_hbs_failure_action(mtc_config); + if (( !cur_action.empty() ) && ( cur_action != new_action)) + { + mtc_alarm_id_enum alarm_id = MTC_LOG_ID__CONFIG_HB_ACTION_FAIL ; + if ( mtcInv.hbs_failure_action == HBS_FAILURE_ACTION__NONE ) + alarm_id = MTC_LOG_ID__CONFIG_HB_ACTION_NONE ; + else if ( mtcInv.hbs_failure_action == HBS_FAILURE_ACTION__ALARM ) + alarm_id = MTC_LOG_ID__CONFIG_HB_ACTION_ALARM ; + else if ( mtcInv.hbs_failure_action == HBS_FAILURE_ACTION__DEGRADE ) + alarm_id = MTC_LOG_ID__CONFIG_HB_ACTION_DEGRADE ; + + /* re-use cur_action to build the action change string from it */ + cur_action.append(" to "); + cur_action.append(new_action); + mtcAlarm_log ( mtcInv.my_hostname, alarm_id, cur_action ); + } + if (( mtcInv.mnfa_active == true ) && + (( mtcInv.hbs_failure_action == HBS_FAILURE_ACTION__NONE ) || + ( mtcInv.hbs_failure_action == HBS_FAILURE_ACTION__ALARM ))) + { + mtcInv.mnfa_cancel (); + } + } + else if (MATCH("agent", "mnfa_threshold")) + { + int old = mtcInv.mnfa_threshold ; mtcInv.mnfa_threshold = atoi(value); + if (( old != 0 ) && ( old != mtcInv.mnfa_threshold )) + { + string cur_threshold = "" ; + cur_threshold.append(itos(old)); + cur_threshold.append(" to "); + cur_threshold.append(itos(mtcInv.mnfa_threshold)); + mtcAlarm_log ( mtcInv.my_hostname, MTC_LOG_ID__CONFIG_MNFA_THRESHOLD, cur_threshold ); + } ilog ("MNFA Threshd: %d\n", mtcInv.mnfa_threshold); } else if (MATCH("timeouts", "mnfa_timeout")) { int old = mtcInv.mnfa_timeout ; mtcInv.mnfa_timeout = atoi(value); + if ( old != mtcInv.mnfa_timeout ) + { + string cur_timeout = "" ; + cur_timeout.append(itos(old)); + cur_timeout.append(" to "); + cur_timeout.append(itos(mtcInv.mnfa_timeout)); + mtcAlarm_log ( mtcInv.my_hostname, MTC_LOG_ID__CONFIG_MNFA_TIMEOUT, cur_timeout ); + } if ( mtcInv.mnfa_timeout == 0 ) { ilog ("MNFA 
Timeout: Never\n"); diff --git a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeHdlrs.cpp b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeHdlrs.cpp index 13df5951..3f2c2253 100755 --- a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeHdlrs.cpp +++ b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeHdlrs.cpp @@ -526,7 +526,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->cmdRsp_status = 0 ; /* Raise Critical Enable Alarm */ - alarm_enabled_failure ( node_ptr ); + alarm_enabled_failure ( node_ptr, true ); /* Handle active controller failures */ if ( THIS_HOST ) @@ -774,7 +774,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) } else { - alarm_enabled_failure ( node_ptr ); + alarm_enabled_failure ( node_ptr , true ); if ( node_ptr->availStatus != MTC_AVAIL_STATUS__FAILED ) { @@ -1095,7 +1095,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) elog ("%s Timeout waiting for MTCALIVE\n", node_ptr->hostname.c_str()); /* raise an alarm for the enable failure */ - alarm_enabled_failure ( node_ptr ); + alarm_enabled_failure ( node_ptr , true ); /* go back and issue reboot again */ enableStageChange ( node_ptr, MTC_ENABLE__RESET_PROGRESSION ); @@ -1190,7 +1190,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->mtcTimer.ring = false ; /* raise an alarm for the enable failure */ - alarm_enabled_failure ( node_ptr ); + alarm_enabled_failure ( node_ptr , true ); /* go back and issue reboot again */ enableStageChange ( node_ptr, MTC_ENABLE__FAILURE ); @@ -1309,18 +1309,29 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) mtcTimer_reset ( node_ptr->mtcTimer ); } - plog ("%s Starting %d sec Heartbeat Soak (with%s)\n", - node_ptr->hostname.c_str(), - MTC_HEARTBEAT_SOAK_BEFORE_ENABLE, - node_ptr->hbsClient_ready ? 
" ready event" : "out ready event" ); - /* Start Monitoring Services - heartbeat, process and hardware */ - send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST ); + send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST ); - /* allow heartbeat to run for 10 seconds before we declare enable */ - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE ); - enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_SOAK ); + if ( this->hbs_failure_action == HBS_FAILURE_ACTION__NONE ) + { + /* Skip over the heartbeat soak if the failuer handlig is + * none because in that case heartbeating is disabled and + * would just be a waste of startup time. */ + enableStageChange ( node_ptr, MTC_ENABLE__STATE_CHANGE ); + } + else + { + plog ("%s Starting %d sec Heartbeat Soak (with%s)\n", + node_ptr->hostname.c_str(), + MTC_HEARTBEAT_SOAK_BEFORE_ENABLE, + node_ptr->hbsClient_ready ? " ready event" : "out ready event" ); + + /* allow heartbeat to run for MTC_HEARTBEAT_SOAK_BEFORE_ENABLE + * seconds before we declare enable */ + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE ); + enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_SOAK ); + } break ; } case MTC_ENABLE__HEARTBEAT_SOAK: @@ -1524,6 +1535,15 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) case MTC_RECOVERY__START: { + if ( this->hbs_failure_action != HBS_FAILURE_ACTION__FAIL ) + { + wlog ("%s heartbeat failure recovery action is not fail\n", + node_ptr->hostname.c_str()); + mtcInvApi_update_task ( node_ptr, "" ); + adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); + break ; + } + /* Purge this hosts work queues */ mtcCmd_workQ_purge ( node_ptr ); mtcCmd_doneQ_purge ( node_ptr ); @@ -1690,7 +1710,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) /* Go to the goEnabled stage */ recoveryStageChange ( node_ptr, MTC_RECOVERY__GOENABLED_TIMER ); - alarm_enabled_failure(node_ptr); 
+ alarm_enabled_failure(node_ptr, true ); break ; } } @@ -1728,7 +1748,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) /* Go to the goEnabled stage */ recoveryStageChange ( node_ptr, MTC_RECOVERY__GOENABLED_TIMER ); - alarm_enabled_failure (node_ptr); + alarm_enabled_failure (node_ptr, true ); } } /* A timer ring indicates that the host is not up */ @@ -1772,7 +1792,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) /* Inform the VIM that this host has failed */ mtcVimApi_state_change ( node_ptr, VIM_HOST_FAILED, 3 ); - alarm_enabled_failure(node_ptr); + alarm_enabled_failure(node_ptr, true ); /* Clear all degrade flags except for the HWMON one */ clear_host_degrade_causes ( node_ptr->degrade_mask ); @@ -2351,21 +2371,31 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) mtcTimer_reset ( node_ptr->mtcTimer ); } - plog ("%s Starting %d sec Heartbeat Soak (with%s)\n", - node_ptr->hostname.c_str(), - MTC_HEARTBEAT_SOAK_BEFORE_ENABLE, - node_ptr->hbsClient_ready ? " ready event" : "out ready event" ); - /* Enable the heartbeat service for Graceful Recovery */ - send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST ); + send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST ); - /* allow heartbeat to run for 10 seconds before we declare enable */ - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE ); + if ( this->hbs_failure_action == HBS_FAILURE_ACTION__NONE ) + { + /* Skip over the heartbeat soak if the failure handling is + * none because in that case heartbeating is disabled and + * would just be a waste of recovery time. */ + recoveryStageChange ( node_ptr, MTC_RECOVERY__STATE_CHANGE ); + } + else + { + plog ("%s Starting %d sec Heartbeat Soak (with%s)\n", + node_ptr->hostname.c_str(), + MTC_HEARTBEAT_SOAK_BEFORE_ENABLE, + node_ptr->hbsClient_ready ? 
" ready event" : "out ready event" ); - /* if heartbeat is not working then we will - * never get here and enable the host */ - recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_SOAK ); + /* allow heartbeat to run for 10 seconds before we declare enable */ + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE ); + + /* if heartbeat is not working then we will + * never get here and enable the host */ + recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_SOAK ); + } break ; } case MTC_RECOVERY__HEARTBEAT_SOAK: @@ -4667,7 +4697,7 @@ int nodeLinkClass::powercycle_handler ( struct nodeLinkClass::node * node_ptr ) if ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) { ilog ("%s failing host for powercycle\n", node_ptr->hostname.c_str() ); - alarm_enabled_failure ( node_ptr ); + alarm_enabled_failure ( node_ptr , true ); /* Set node as unlocked-disabled-failed */ allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED, diff --git a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeMnfa.cpp b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeMnfa.cpp index 4c2106bc..97e26bf0 100644 --- a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeMnfa.cpp +++ b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeMnfa.cpp @@ -31,14 +31,45 @@ void log_mnfa_pool ( std::list & mnfa_awol_list ) { std::list::iterator mnfa_awol_ptr ; string pool_list = "" ; + if ( mnfa_awol_list.size() ) + { + for ( mnfa_awol_ptr = mnfa_awol_list.begin() ; + mnfa_awol_ptr != mnfa_awol_list.end() ; + mnfa_awol_ptr++ ) + { + pool_list.append (" "); + pool_list.append (mnfa_awol_ptr->data()); + } + ilog ("MNFA POOL:%s\n", pool_list.c_str()); + } +} + +/***************************************************************************** + * + * Name : add_host_to_awol_list + * + * Description: Add a hostname to the awol list if its not already in the list + * + * Returns : true if added + * false if not added because it is already in the list. 
+ * + *****************************************************************************/ + +static bool add_host_to_awol_list ( string hostname, std::list & mnfa_awol_list ) +{ + std::list::iterator mnfa_awol_ptr ; for ( mnfa_awol_ptr = mnfa_awol_list.begin() ; mnfa_awol_ptr != mnfa_awol_list.end() ; mnfa_awol_ptr++ ) { - pool_list.append (" "); - pool_list.append (mnfa_awol_ptr->data()); + if ( *(mnfa_awol_ptr) == hostname ) + { + /* already in list */ + return false ; + } } - ilog ("MNFA POOL:%s\n", pool_list.c_str()); + mnfa_awol_list.push_back(hostname); + return true ; } /***************************************************************************** @@ -51,6 +82,14 @@ void log_mnfa_pool ( std::list & mnfa_awol_list ) *****************************************************************************/ void nodeLinkClass::mnfa_add_host ( struct nodeLinkClass::node * node_ptr , iface_enum iface ) { + if (( this->hbs_failure_action == HBS_FAILURE_ACTION__ALARM ) || + ( this->hbs_failure_action == HBS_FAILURE_ACTION__NONE )) + { + /* Do nothing for the 'alarm only' or 'none' action. 
+ * Alarming is handled by the hbsAgent already */ + return ; + } + if ( node_ptr->hbs_minor[iface] == false ) { bool enter = false ; @@ -63,15 +102,12 @@ void nodeLinkClass::mnfa_add_host ( struct nodeLinkClass::node * node_ptr , ifac /* if we are active then add the node to the awol list */ if ( mnfa_active == true ) { - alarm_enabled_failure (node_ptr); - /* once we are mnfa_active we need to give all the * hbs_minor=true hosts a graceful recovery token * mnfa_graceful_recovery = true and add to the awol list */ node_ptr->mnfa_graceful_recovery = true ; added = true ; - mnfa_awol_list.push_back(node_ptr->hostname); - mnfa_awol_list.unique(); + add_host_to_awol_list (node_ptr->hostname, mnfa_awol_list ); if ( node_ptr->task != MTC_TASK_RECOVERY_WAIT ) mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_WAIT ); } @@ -94,10 +130,7 @@ void nodeLinkClass::mnfa_add_host ( struct nodeLinkClass::node * node_ptr , ifac get_iface_name_str(INFRA_IFACE), node_ptr->hbs_minor_count[INFRA_IFACE]); - if ( mnfa_awol_list.size() ) - { - log_mnfa_pool ( mnfa_awol_list ); - } + log_mnfa_pool ( mnfa_awol_list ); if ( enter == true ) { @@ -191,28 +224,20 @@ void nodeLinkClass::mnfa_enter ( void ) * recovery token mnfa_graceful_recovery = true * basically a get out of double reset free card */ ptr->mnfa_graceful_recovery = true ; - - mnfa_awol_list.push_back(ptr->hostname); + add_host_to_awol_list (ptr->hostname, mnfa_awol_list ); if ( ptr->task != MTC_TASK_RECOVERY_WAIT ) mtcInvApi_update_task ( ptr, MTC_TASK_RECOVERY_WAIT ); - - alarm_enabled_failure (ptr); } if (( ptr->next == NULL ) || ( ptr == tail )) break ; } - mnfa_awol_list.unique(); - if ( this->mnfa_timeout ) { wlog ("MNFA Auto-Recovery in %d seconds\n", this->mnfa_timeout); mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, this->mnfa_timeout); } - if ( mnfa_awol_list.size() ) - { - log_mnfa_pool ( mnfa_awol_list ); - } + log_mnfa_pool ( mnfa_awol_list ); } 
/**************************************************************************** @@ -262,10 +287,7 @@ void nodeLinkClass::mnfa_exit ( bool force ) force ? "(Auto-Recover)" : ""); mtcAlarm_log ( active_controller_hostname , MTC_LOG_ID__EVENT_MNFA_EXIT ); - if ( mnfa_awol_list.size() ) - { - log_mnfa_pool ( mnfa_awol_list ); - } + log_mnfa_pool ( mnfa_awol_list ); /* Loop through inventory and recover each host that * remains in the hbs_minor state. @@ -329,3 +351,44 @@ void nodeLinkClass::mnfa_exit ( bool force ) mnfa_host_count[INFRA_IFACE] = 0 ; mnfa_awol_list.clear(); } + +/**************************************************************************** + * + * Name : mnfa_cancel + * + * Description: Cancel MNFA if it is active. + * + ****************************************************************************/ +void nodeLinkClass::mnfa_cancel ( void ) +{ + if ( this->mnfa_active ) + { + wlog ("MNFA CANCEL --> Cancelling Multi-Node Failure Avoidance\n"); + + mtcTimer_reset ( this->mtcTimer_mnfa ); + + /* Loop through MNFA Pool. + * Clear MNFA attributes from hosts in the pool. 
*/ + std::list::iterator mnfa_awol_ptr ; + for ( mnfa_awol_ptr = mnfa_awol_list.begin() ; + mnfa_awol_ptr != mnfa_awol_list.end() ; + mnfa_awol_ptr++ ) + { + struct node * node_ptr = nodeLinkClass::getNode ( *(mnfa_awol_ptr) ); + if ( node_ptr != NULL ) + { + node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ; + node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_INFRA ; + node_ptr->hbs_minor[INFRA_IFACE] = false ; + node_ptr->hbs_minor[MGMNT_IFACE] = false ; + node_ptr->mnfa_graceful_recovery = false ; + mtcInvApi_update_task ( node_ptr, "" ); + } + } + send_hbs_command ( this->my_hostname, MTC_RECOVER_HBS ); + this->mnfa_host_count[MGMNT_IFACE] = 0 ; + this->mnfa_host_count[INFRA_IFACE] = 0 ; + this->mnfa_active = false ; + } + mnfa_awol_list.clear(); +} diff --git a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcSubfHdlrs.cpp b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcSubfHdlrs.cpp index a6e6ed10..20d42793 100644 --- a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcSubfHdlrs.cpp +++ b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcSubfHdlrs.cpp @@ -409,17 +409,24 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) mtcTimer_reset ( node_ptr->mtcTimer ); } - plog ("%s Starting %d sec Heartbeat Soak (with%s)\n", + /* Start Monitoring heartbeat */ + send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST ); + + if ( this->hbs_failure_action == HBS_FAILURE_ACTION__NONE ) + { + enableStageChange ( node_ptr, MTC_ENABLE__STATE_CHANGE ); + } + else + { + plog ("%s Starting %d sec Heartbeat Soak (with%s)\n", name.c_str(), MTC_HEARTBEAT_SOAK_BEFORE_ENABLE, node_ptr->hbsClient_ready ? 
" ready event" : "out ready event" ); - /* Start Monitoring Services - heartbeat, process and hardware */ - send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST ); - - /* allow heartbeat to run for 10 seconds before we declare enable */ - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE ); - enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_SOAK ); + /* allow heartbeat to run for 10 seconds before we declare enable */ + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE ); + enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_SOAK ); + } break ; } case MTC_ENABLE__HEARTBEAT_SOAK: diff --git a/mtce-common/cgts-mtce-common-1.0/scripts/mtc.conf b/mtce-common/cgts-mtce-common-1.0/scripts/mtc.conf index 92d0b9db..d46e7a20 100644 --- a/mtce-common/cgts-mtce-common-1.0/scripts/mtc.conf +++ b/mtce-common/cgts-mtce-common-1.0/scripts/mtc.conf @@ -7,12 +7,6 @@ hbs_minor_threshold = 4 ; Heartbeat minor threshold count. ; heartbeat misses that result in a ; minor notification to maintenance. -hbs_calibrate_threshold = 7 ; number of hosts before calibration kicks in -hbs_calibrate_period_factor = 200 ; x for each host over hbs_calibrate_threshold -hbs_calibrate_minor_factor = 20 ; x for each host over hbs_calibrate_threshold -hbs_calibrate_degrade_factor = 21 ; x for each host over hbs_calibrate_threshold -hbs_calibrate_fail_factor = 30 ; x for each host over hbs_calibrate_threshold - offline_period = 100 ; number of msecs to wait for each offline audit offline_threshold = 46 ; number of back to back mtcAlive requests missed ; 100:46 will yield a typical 5 sec holdoff from