diff --git a/mtce-common/centos/build_srpm.data b/mtce-common/centos/build_srpm.data index 728c2f30..d8075689 100644 --- a/mtce-common/centos/build_srpm.data +++ b/mtce-common/centos/build_srpm.data @@ -1,3 +1,3 @@ SRC_DIR="cgts-mtce-common-1.0" -TIS_PATCH_VER=135 +TIS_PATCH_VER=136 BUILD_IS_SLOW=5 diff --git a/mtce-common/cgts-mtce-common-1.0/common/logMacros.h b/mtce-common/cgts-mtce-common-1.0/common/logMacros.h index 267eaa41..568ba7be 100644 --- a/mtce-common/cgts-mtce-common-1.0/common/logMacros.h +++ b/mtce-common/cgts-mtce-common-1.0/common/logMacros.h @@ -147,21 +147,6 @@ typedef struct int latency_thld ; /**< scheduling latency threshold in msec b4 log */ - /** Multi Node Failure Avoidance Controls */ - char * mnfa_threshold_type ; /**< value used in multi node failure - avoidance calculation ; - 'number' / 'percent'age of hosts */ - int mnfa_threshold_percent ; /**< number of hosts simultaneously - failing heartbeat */ - int mnfa_threshold_number ; /**< percentage of pool - simultanepously failing heartbeat*/ - int mnfa_recovery_threshold ; /**< Multi-Node-Failure Avoidance Recovery Threshold - Similar to the LOC above for graceful recovery - hosts that have LOC for longer than this time in - seconds are failed and sent into the enable_handler - FSM while those that recover before this period are - sent into the graceful recovery_handler FSM. */ - /** Configurable Timeouts ; unit is 'seconds' */ int controller_mtcalive_timeout ; /**< mtcAlive wait timeout */ int compute_mtcalive_timeout ; /**< mtcAlive wait timeout */ @@ -172,7 +157,6 @@ typedef struct int sysinv_noncrit_timeout ; /**< sysinv nonc request timeout */ int work_queue_timeout ; /**< end of action workq complete TO */ int loc_recovery_timeout ; /**< loss of comms recovery timeout */ - int mnfa_recovery_timeout ; /**< mnfa recovery timeout */ int node_reinstall_timeout ; /**< node reinstall timeout */ int dor_mode_timeout ; /**< dead office recovery timeout */ int dor_recovery_timeout_ext ; /**< dor recovery timeout extension */ diff --git a/mtce-common/cgts-mtce-common-1.0/common/nodeClass.cpp b/mtce-common/cgts-mtce-common-1.0/common/nodeClass.cpp index 4352950e..eb941704 100755 --- a/mtce-common/cgts-mtce-common-1.0/common/nodeClass.cpp +++ b/mtce-common/cgts-mtce-common-1.0/common/nodeClass.cpp @@ -218,7 +218,6 @@ nodeLinkClass::nodeLinkClass() this->controller_mtcalive_timeout = 0; this->goenabled_timeout = 0; this->loc_recovery_timeout = 0; - this->mnfa_recovery_timeout = 0; this->node_reinstall_timeout = 0; this->token_refresh_rate = 0; this->autorecovery_enabled = false ; @@ -270,16 +269,16 @@ nodeLinkClass::nodeLinkClass() active_controller_hostname.clear() ; inactive_controller_hostname.clear() ; + /* MNFA Activity Controls */ + mnfa_threshold = 2 ; /* 2 hosts */ + mnfa_timeout = 0 ; /* no timeout */ + /* Start with no failures */ mnfa_awol_list.clear(); mnfa_host_count[MGMNT_IFACE] = 0 ; mnfa_host_count[INFRA_IFACE] = 0 ; mnfa_occurances = 0 ; mnfa_active = false ; - mnfa_threshold_type = MNFA_NUMBER ; - mnfa_threshold_percent = 5 ; - mnfa_threshold_number = 3 ; - mnfa_threshold = mnfa_threshold_number ; mgmnt_link_up_and_running = false ; infra_link_up_and_running = false ; @@ -4303,16 +4302,15 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa { /* clear it - possibly temporarily */ node_ptr->hbs_minor[iface] = false ; - + /* manage counts over heartbeat failure */ if ( mnfa_host_count[iface] ) { - /* If we are mnfa_active AND now below the threshold + /* If we are mnfa_active AND now below the threshold * then trigger mnfa_exit */ - if (( --mnfa_host_count[iface] < mnfa_calculate_threshold( node_ptr->hostname ) ) && + if (( --mnfa_host_count[iface] < mnfa_threshold) && ( mnfa_active == true )) { - wlog ("%s MNFA exit with graceful recovery (%s:%d)\n", node_ptr->hostname.c_str(), get_iface_name_str(iface), @@ -4468,6 +4466,8 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface } else { + alarm_enabled_failure (node_ptr); + mnfa_add_host ( node_ptr , iface ); if ( mnfa_active == false ) @@ -4481,17 +4481,13 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface { node_ptr->heartbeat_failed[MGMNT_IFACE] = true ; } - if ( mnfa_host_count[iface] < mnfa_calculate_threshold( hostname )) + if (mnfa_host_count[iface] < this->mnfa_threshold) { - elog ("%s %s network heartbeat failure\n", hostname.c_str(), get_iface_name_str(iface)); nodeLinkClass::set_availStatus ( hostname, MTC_AVAIL_STATUS__FAILED ); - if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL ) - { - mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__ENABLE ); - node_ptr->alarms[MTC_ALARM_ID__ENABLE] = FM_ALARM_SEVERITY_CRITICAL; - } + + alarm_enabled_failure (node_ptr); if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE ) && ( node_ptr->adminAction != MTC_ADMIN_ACTION__UNLOCK )) @@ -8296,19 +8292,12 @@ void nodeLinkClass::mem_log_dor ( struct nodeLinkClass::node * node_ptr ) void nodeLinkClass::mem_log_mnfa ( void ) { char str[MAX_MEM_LOG_DATA] ; - - int temp = mnfa_threshold_number ; - if ( mnfa_threshold_type == MNFA_PERCENT ) - temp = mnfa_threshold_percent ; - - snprintf (&str[0], MAX_MEM_LOG_DATA, "%s MNFA: Mode:%s:%d State:%s Hosts:%d:%d Cases:%d Threshold:%d\n", + snprintf (&str[0], MAX_MEM_LOG_DATA, "%s MNFA: State:%s Hosts:%d:%d Threshold:%d Occurances:%d\n", my_hostname.c_str(), - mnfa_threshold_type ? "Percent" : "Number", - temp, mnfa_active ? "ACTIVE" : "inactive", mnfa_host_count[MGMNT_IFACE], mnfa_host_count[INFRA_IFACE], - mnfa_calculate_threshold( "" ), + mnfa_threshold, mnfa_occurances); mem_log (str); } @@ -8316,7 +8305,7 @@ void nodeLinkClass::mem_log_mnfa ( void ) void nodeLinkClass::mem_log_general_mtce_hosts ( void ) { char str[MAX_MEM_LOG_DATA] ; - snprintf (&str[0], MAX_MEM_LOG_DATA, "%s EnableHosts -> Cont:%d Comp:%d Stor:%d StorType:%d\n", + snprintf (&str[0], MAX_MEM_LOG_DATA, "%s EnableHosts -> Cont:%d Comp:%d Stor:%d StorType:%d\n", my_hostname.c_str(), num_controllers_enabled(), enabled_compute_nodes(), diff --git a/mtce-common/cgts-mtce-common-1.0/common/nodeClass.h b/mtce-common/cgts-mtce-common-1.0/common/nodeClass.h index 8940f39b..46b5398a 100755 --- a/mtce-common/cgts-mtce-common-1.0/common/nodeClass.h +++ b/mtce-common/cgts-mtce-common-1.0/common/nodeClass.h @@ -1149,11 +1149,6 @@ private: /** Tracks the number of times multi failure avoidance was exited */ int mnfa_occurances ; - /** true when the multi node failure count exceeds the multi - * node failure avoidance threshold and until there are no more - * in service trouble hosts */ - bool mnfa_active ; - /** Recover or exit from the muli-node failure avoidance state * This involves restarting the heartbeat on all the nodes * that remain hbs_minor and clearing any heartbneat degrade @@ -1428,6 +1423,11 @@ public: std::list hostname_inventory ; std::list::iterator host ; + /** true when the multi node failure count exceeds the multi + * node failure avoidance threshold and until there are no more + * in service trouble hosts */ + bool mnfa_active ; + std::list mnfa_awol_list ; void mnfa_timeout_handler ( void ); @@ -1722,23 +1722,17 @@ public: int inotify_shadow_file_fd ; int inotify_shadow_file_wd ; - /** The multi node failure avoidance type */ - #define MNFA_NUMBER 0 - #define MNFA_PERCENT 1 - int mnfa_threshold_type ; + /* MNFA Timeout + * + * Time in secs MNFA can remain active. + * If 0 then there is no timeout. */ + int mnfa_timeout ; - /** % of hosts that need to simultaneously fail before 'mnfa' kicks in */ - int mnfa_threshold_percent ; - - /** # of hosts that need to simultaneously fail before 'mnfa' kicks in */ - int mnfa_threshold_number ; - - /** the calculated threshold */ + /* MNFA Host Involvement Threshold + * Number of hosts simultaneously failing heartbeat + * upon which feature will activate */ int mnfa_threshold ; - /** Calculates and returns the mnfa threshold based on enabled hosts */ - int mnfa_calculate_threshold ( string hostname ); - /* collectd event handler */ int collectd_notify_handler ( string & hostname, string & resource, @@ -1997,7 +1991,6 @@ public: int sysinv_timeout ; int sysinv_noncrit_timeout ; int loc_recovery_timeout ; /**< Loss Of Communication Recovery Timeout */ - int mnfa_recovery_timeout; /**< Multi-Node-Failure Avoidance Recovery Timeout */ int work_queue_timeout ; int node_reinstall_timeout ; diff --git a/mtce-common/cgts-mtce-common-1.0/daemon/daemon_config.cpp b/mtce-common/cgts-mtce-common-1.0/daemon/daemon_config.cpp index e0dcea8e..a8b02c30 100644 --- a/mtce-common/cgts-mtce-common-1.0/daemon/daemon_config.cpp +++ b/mtce-common/cgts-mtce-common-1.0/daemon/daemon_config.cpp @@ -46,7 +46,6 @@ void daemon_config_default ( daemon_config_type* config_ptr ) config_ptr->sysinv_api_bind_ip = strdup("none"); config_ptr->mode = strdup("none"); config_ptr->fit_host = strdup("none"); - config_ptr->mnfa_threshold_type = strdup("none"); config_ptr->multicast = strdup("none"); config_ptr->debug_all = 0 ; @@ -174,14 +173,6 @@ int timeout_config_handler ( void * user, config_ptr->loc_recovery_timeout = atoi(value); ilog ("LOC Timeout: %3d secs\n", config_ptr->loc_recovery_timeout ); } - else if (MATCH("timeouts", "mnfa_recovery_timeout")) - { - config_ptr->mnfa_recovery_timeout = atoi(value); - if (( config_ptr->mnfa_recovery_timeout > 300 ) || - ( config_ptr->mnfa_recovery_timeout == 0 )) - config_ptr->mnfa_recovery_timeout = 5 ; - ilog ("MNFA Timeout: %3d secs\n", config_ptr->mnfa_recovery_timeout ); - } else if (MATCH("timeouts", "node_reinstall_timeout")) { config_ptr->node_reinstall_timeout = atoi(value); @@ -238,7 +229,7 @@ void daemon_dump_cfg ( void ) ilog ("Configuration Settings\n------------------------------\n"); if ( ptr->scheduling_priority ) { ilog ("scheduling_priority = %d\n", ptr->scheduling_priority ); } - + if ( ptr->infra_degrade_only ) { ilog ("infra_degrade_only = %s\n", ptr->infra_degrade_only ? "Yes" : "No" );} if ( ptr->need_infra_poll_audit ) { ilog ("need_infra_poll_audit = %s\n", ptr->need_infra_poll_audit ? "Yes" : "No" );} if ( ptr->active ) { ilog ("active = %s\n", ptr->active ? "Yes" : "No" );} @@ -254,7 +245,6 @@ void daemon_dump_cfg ( void ) if ( strcmp(ptr->infra_iface, "none" )) { ilog ("infra_iface = %s\n", ptr->infra_iface );} if ( strcmp(ptr->multicast, "none" )) { ilog ("multicast = %s\n", ptr->multicast );} - if ( ptr->ha_port ) { ilog ("ha_port = %d\n", ptr->ha_port );} if ( ptr->vim_cmd_port ) { ilog ("vim_cmd_port = %d\n", ptr->vim_cmd_port );} if ( ptr->vim_event_port ) { ilog ("vim_event_port = %d\n", ptr->vim_event_port );} @@ -286,7 +276,7 @@ void daemon_dump_cfg ( void ) if ( ptr->hwmon_cmd_port ) { ilog ("hwmon_cmd_port = %d\n", ptr->hwmon_cmd_port );} if ( ptr->hbs_to_mtc_event_port) { ilog ("hbs_to_mtc_event_port = %d\n", ptr->hbs_to_mtc_event_port);} if ( ptr->inv_event_port ) { ilog ("inv_event_port = %d\n", ptr->inv_event_port );} - + /* rmond */ if ( ptr->per_node ) { ilog ("per_node = %d\n", ptr->per_node );} if ( ptr->audit_period ) { ilog ("audit_period = %d\n", ptr->audit_period );} @@ -316,10 +306,6 @@ void daemon_dump_cfg ( void ) if ( ptr->stall_rec_thld ) { ilog ("stall_rec_thld = %d\n", ptr->stall_rec_thld );} /* mtcAgent */ - if ( ptr->mnfa_threshold_type ) { ilog ("mnfa_threshold_type = %s\n", ptr->mnfa_threshold_type );} - if ( ptr->mnfa_threshold_percent ) { ilog ("mnfa_threshold_percent= %d\n", ptr->mnfa_threshold_percent );} - if ( ptr->mnfa_threshold_number ) { ilog ("mnfa_threshold_number = %d\n", ptr->mnfa_threshold_number );} - if ( ptr->mnfa_recovery_threshold ) { ilog ("mnfa_recovery_threshod= %d\n", ptr->mnfa_recovery_threshold );} if ( ptr->controller_mtcalive_timeout) { ilog ("controller_mtcalive_to= %d\n", ptr->controller_mtcalive_timeout );} if ( ptr->compute_mtcalive_timeout ) { ilog ("compute_mtcalive_to = %d\n", ptr->compute_mtcalive_timeout );} if ( ptr->goenabled_timeout ) { ilog ("goenabled_timeout = %d\n", ptr->goenabled_timeout );} @@ -328,7 +314,6 @@ void daemon_dump_cfg ( void ) if ( ptr->sysinv_noncrit_timeout ) { ilog ("sysinv_noncrit_timeout= %d\n", ptr->sysinv_noncrit_timeout );} if ( ptr->work_queue_timeout ) { ilog ("work_queue_timeout = %d\n", ptr->work_queue_timeout );} if ( ptr->loc_recovery_timeout ) { ilog ("loc_recovery_timeout = %d\n", ptr->loc_recovery_timeout );} - if ( ptr->mnfa_recovery_timeout ) { ilog ("mnfa_recovery_timeout = %d\n", ptr->mnfa_recovery_timeout );} if ( ptr->node_reinstall_timeout ) { ilog ("node_reinstall_timeout= %d\n", ptr->node_reinstall_timeout );} if ( ptr->uptime_period ) { ilog ("uptime_period = %d\n", ptr->uptime_period );} if ( ptr->online_period ) { ilog ("online_period = %d\n", ptr->online_period );} diff --git a/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsStubs.cpp b/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsStubs.cpp index 503f7a9a..a864ab3e 100644 --- a/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsStubs.cpp +++ b/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsStubs.cpp @@ -270,12 +270,6 @@ int daemon_log_message ( const char * hostname, return(PASS); } - -int nodeLinkClass::mnfa_calculate_threshold ( string hostname ) -{ - UNUSED(hostname); - return(PASS) ; -} void nodeLinkClass::mnfa_add_host ( struct nodeLinkClass::node * node_ptr, iface_enum iface ) { node_ptr = node_ptr ; iface = iface ; } void nodeLinkClass::mnfa_recover_host ( struct nodeLinkClass::node * node_ptr ) diff --git a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeCtrl.cpp b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeCtrl.cpp index 88200afc..1299f687 100755 --- a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeCtrl.cpp +++ b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeCtrl.cpp @@ -295,18 +295,6 @@ static int mtc_config_handler ( void * user, config_ptr->mask |= CONFIG_AGENT_API_RETRIES ; mtcInv.api_retries = config_ptr->api_retries ; } - else if (MATCH("agent", "mnfa_threshold_type")) - { - config_ptr->mnfa_threshold_type = strdup(value); - } - else if (MATCH("agent", "mnfa_threshold_percent")) - { - config_ptr->mnfa_threshold_percent = atoi(value); - } - else if (MATCH("agent", "mnfa_threshold_number")) - { - config_ptr->mnfa_threshold_number = atoi(value); - } else if (MATCH("timeouts", "failsafe_shutdown_delay")) { config_ptr->failsafe_shutdown_delay = atoi(value); @@ -335,6 +323,55 @@ static int mtc_config_handler ( void * user, return (FAIL); } +static int mtc_ini_handler ( void * user, + const char * section, + const char * name, + const char * value) +{ + UNUSED(user); + + if (MATCH("agent", "mnfa_threshold")) + { + mtcInv.mnfa_threshold = atoi(value); + ilog ("MNFA Threshd: %d\n", mtcInv.mnfa_threshold); + } + else if (MATCH("timeouts", "mnfa_timeout")) + { + int old = mtcInv.mnfa_timeout ; + mtcInv.mnfa_timeout = atoi(value); + if ( mtcInv.mnfa_timeout == 0 ) + { + ilog ("MNFA Timeout: Never\n"); + } + else + { + ilog ("MNFA Timeout: %3d secs\n", mtcInv.mnfa_timeout ); + } + + /* handle a change in mnfa timeout while MNFA is active */ + if (( mtcInv.mnfa_active == true ) && + ( mtcInv.mnfa_timeout != old )) + { + mtcTimer_reset ( mtcInv.mtcTimer_mnfa ); + if (( old == 0 ) || mtcInv.mnfa_timeout != 0 ) + { + wlog ("MNFA Auto-Recovery in %d seconds\n", + mtcInv.mnfa_timeout); + + mtcTimer_start ( mtcInv.mtcTimer_mnfa, + mtcTimer_handler, + mtcInv.mnfa_timeout); + } + else if ( mtcInv.mnfa_timeout == 0 ) + { + ilog ("MNFA timer set to no-timeout ; previous %d sec timer cancelled", old ); + } + } + } + return (PASS); +} + + /* Read and process mtc.ini file settings into the daemon configuration */ int daemon_configure ( void ) { @@ -350,6 +387,12 @@ int daemon_configure ( void ) return (FAIL_LOAD_INI); } + if (ini_parse(MTCE_INI_FILE, mtc_ini_handler, &mtc_config) < 0) + { + elog ("Can't load '%s'\n", MTCE_INI_FILE ); + return (FAIL_LOAD_INI); + } + if (ini_parse(MTCE_INI_FILE, keystone_config_handler, &mtc_config) < 0) { elog ("Can't load '%s'\n", MTCE_INI_FILE ); @@ -406,14 +449,12 @@ int daemon_configure ( void ) mtcInv.goenabled_timeout = DEFAULT_GOENABLE_TIMEOUT ; mtcInv.loc_recovery_timeout = mtc_config.loc_recovery_timeout ; - mtcInv.mnfa_recovery_timeout = mtc_config.mnfa_recovery_timeout ; if ( mtc_config.node_reinstall_timeout ) mtcInv.node_reinstall_timeout = mtc_config.node_reinstall_timeout ; else mtcInv.node_reinstall_timeout = MTC_REINSTALL_TIMEOUT_DEFAULT ; - if ( mtc_config.dor_mode_timeout <= 0 ) { slog ("DOR Mode Timeout is invalid (%d), setting to default (%d)\n", @@ -423,25 +464,6 @@ int daemon_configure ( void ) mtc_config.dor_mode_timeout = DEFAULT_DOR_MODE_TIMEOUT ; } - /* validate and auto correct manage multi node failure avoidance thresholds */ - if (( mtc_config.mnfa_threshold_type != NULL ) && - ( !strncmp (mtc_config.mnfa_threshold_type, "percent", strlen("percent")))) - { - if ( mtc_config.mnfa_threshold_percent > 100 ) - { - mtc_config.mnfa_threshold_percent = 100 ; - } - mtcInv.mnfa_threshold_type = MNFA_PERCENT ; - ilog ("mnfAvoidance: %d%c\n", mtc_config.mnfa_threshold_percent, '%' ); - mtcInv.mnfa_threshold_percent = mtc_config.mnfa_threshold_percent ; - } - else - { - mtcInv.mnfa_threshold_type = MNFA_NUMBER ; - ilog ("mnfAvoidance: %d hosts\n", mtc_config.mnfa_threshold_number ); - mtcInv.mnfa_threshold_number = mtc_config.mnfa_threshold_number ; - } - if ( mtc_config.swact_timeout ) { if ( mtc_config.swact_timeout < (MTC_SWACT_POLL_TIMER*2)) diff --git a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeHdlrs.cpp b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeHdlrs.cpp index 1a3d1c7e..13df5951 100755 --- a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeHdlrs.cpp +++ b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeHdlrs.cpp @@ -1690,11 +1690,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) /* Go to the goEnabled stage */ recoveryStageChange ( node_ptr, MTC_RECOVERY__GOENABLED_TIMER ); - if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL ) - { - mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__ENABLE ); - node_ptr->alarms[MTC_ALARM_ID__ENABLE] = FM_ALARM_SEVERITY_CRITICAL ; - } + alarm_enabled_failure(node_ptr); break ; } } @@ -1732,11 +1728,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) /* Go to the goEnabled stage */ recoveryStageChange ( node_ptr, MTC_RECOVERY__GOENABLED_TIMER ); - if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL ) - { - mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__ENABLE ); - node_ptr->alarms[MTC_ALARM_ID__ENABLE] = FM_ALARM_SEVERITY_CRITICAL ; - } + alarm_enabled_failure (node_ptr); } } /* A timer ring indicates that the host is not up */ @@ -1780,11 +1772,8 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) /* Inform the VIM that this host has failed */ mtcVimApi_state_change ( node_ptr, VIM_HOST_FAILED, 3 ); - if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL ) - { - mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__ENABLE ); - node_ptr->alarms[MTC_ALARM_ID__ENABLE] = FM_ALARM_SEVERITY_CRITICAL ; - } + alarm_enabled_failure(node_ptr); + /* Clear all degrade flags except for the HWMON one */ clear_host_degrade_causes ( node_ptr->degrade_mask ); node_ptr->degraded_resources_list.clear(); diff --git a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeMnfa.cpp b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeMnfa.cpp index 2c8348d9..4c2106bc 100644 --- a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeMnfa.cpp +++ b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeMnfa.cpp @@ -41,48 +41,6 @@ void log_mnfa_pool ( std::list & mnfa_awol_list ) ilog ("MNFA POOL:%s\n", pool_list.c_str()); } -/******************************************************************************* - * - * Name : mnfa_calculate_threshold - * - * Description: Calculates and returns the mnfa threshold based - * on enabled hosts. - * - * Auto corrects the value to a min number. - * - * Calculate the multi-node failure avoidance handling threshold - * This is the number of hosts than need to fail simultaneously - * in order to trigger mode ; i.e. mnfa_active=true - * - *******************************************************************************/ -int nodeLinkClass::mnfa_calculate_threshold ( string hostname ) -{ - int mnfa_enabled_nodes = enabled_nodes (); - - /* Calculate the threshold */ - if ( mnfa_threshold_type == MNFA_PERCENT ) - mnfa_threshold = mnfa_enabled_nodes / mnfa_threshold_percent ; - else - mnfa_threshold = mnfa_threshold_number ; - - /* Don't allow the multi-node failure avoidance - * to ever be 1 or we would never fail a host */ - if ( mnfa_threshold < mnfa_threshold_number ) - { - ilog ("%s MNFA threshold rounded to %d from %d\n", - hostname.c_str(), - mnfa_threshold_number, - mnfa_enabled_nodes / mnfa_threshold_percent ); - mnfa_threshold = mnfa_threshold_number ; - } - - if ( mnfa_awol_list.size() ) - { - log_mnfa_pool ( mnfa_awol_list ); - } - return (mnfa_threshold); -} - /***************************************************************************** * * Name : mnfa_add_host @@ -105,6 +63,8 @@ void nodeLinkClass::mnfa_add_host ( struct nodeLinkClass::node * node_ptr , ifac /* if we are active then add the node to the awol list */ if ( mnfa_active == true ) { + alarm_enabled_failure (node_ptr); + /* once we are mnfa_active we need to give all the * hbs_minor=true hosts a graceful recovery token * mnfa_graceful_recovery = true and add to the awol list */ @@ -116,7 +76,7 @@ void nodeLinkClass::mnfa_add_host ( struct nodeLinkClass::node * node_ptr , ifac mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_WAIT ); } else if (( mnfa_active == false ) && - ( mnfa_host_count[iface] >= mnfa_calculate_threshold( node_ptr->hostname ))) + ( mnfa_host_count[iface] >= this->mnfa_threshold)) { enter = true ; } @@ -134,6 +94,11 @@ void nodeLinkClass::mnfa_add_host ( struct nodeLinkClass::node * node_ptr , ifac get_iface_name_str(INFRA_IFACE), node_ptr->hbs_minor_count[INFRA_IFACE]); + if ( mnfa_awol_list.size() ) + { + log_mnfa_pool ( mnfa_awol_list ); + } + if ( enter == true ) { mnfa_enter (); @@ -196,7 +161,7 @@ void nodeLinkClass::mnfa_recover_host ( struct nodeLinkClass::node * node_ptr ) * mnfa_graceful_recovery token * * 5. Start the MNFA Auto-Recovery timer with time based on the config - * setting mnfa_recovery_timeout + * setting mnfa_timeout * ****************************************************************************/ void nodeLinkClass::mnfa_enter ( void ) @@ -211,8 +176,7 @@ void nodeLinkClass::mnfa_enter ( void ) * previous mnfa but the failure case occurs again. If that * happens we need to cancel the timer that will issue * the period recovery command. */ - if ( mtcTimer_mnfa.tid ) - mtcTimer_stop ( mtcTimer_mnfa ); + mtcTimer_reset ( mtcTimer_mnfa ); /* Loop through inventory and recover each host that * remains in the hbs_minor state. @@ -232,6 +196,7 @@ void nodeLinkClass::mnfa_enter ( void ) if ( ptr->task != MTC_TASK_RECOVERY_WAIT ) mtcInvApi_update_task ( ptr, MTC_TASK_RECOVERY_WAIT ); + alarm_enabled_failure (ptr); } if (( ptr->next == NULL ) || ( ptr == tail )) break ; @@ -239,14 +204,20 @@ void nodeLinkClass::mnfa_enter ( void ) mnfa_awol_list.unique(); - /* Start the timer that will eventually send the MTC_RECOVER_HBS command */ - wlog ("MNFA Auto-Recovery in %d seconds\n", mnfa_recovery_timeout); - mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, mnfa_recovery_timeout); + if ( this->mnfa_timeout ) + { + wlog ("MNFA Auto-Recovery in %d seconds\n", this->mnfa_timeout); + mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, this->mnfa_timeout); + } + if ( mnfa_awol_list.size() ) + { + log_mnfa_pool ( mnfa_awol_list ); + } } /**************************************************************************** * - * Name : mnfa_enter + * Name : mnfa_exit * * Description: Perform the operations required to exit mnfa mode * These include ... @@ -266,7 +237,7 @@ void nodeLinkClass::mnfa_enter ( void ) * hosts that remain in the hbs_minor state. * * if ( force == true ) - * The mnfa_recovery_timeout has expired + * The mnfa_timeout has expired * All hosts in the awol list are forced failed and into the * enable_handler FSM. * else @@ -279,18 +250,20 @@ void nodeLinkClass::mnfa_exit ( bool force ) { if ( mnfa_active == true ) { - wlog ("MNFA EXIT <-- Exiting Multi-Node Failure Avoidance %s\n", - force ? "(Auto-Recover)" : ""); - - mtcAlarm_log ( active_controller_hostname , MTC_LOG_ID__EVENT_MNFA_EXIT ); mnfa_occurances++ ; mnfa_active = false ; - if ( force == true ) { elog ("... MNFA %d sec timeout - forcing full enable on ... \n", - mnfa_recovery_timeout); + this->mnfa_timeout); + } + wlog ("MNFA EXIT <-- Exiting Multi-Node Failure Avoidance %s\n", + force ? "(Auto-Recover)" : ""); + mtcAlarm_log ( active_controller_hostname , MTC_LOG_ID__EVENT_MNFA_EXIT ); + + if ( mnfa_awol_list.size() ) + { log_mnfa_pool ( mnfa_awol_list ); } @@ -342,8 +315,7 @@ void nodeLinkClass::mnfa_exit ( bool force ) } /* Stop the ... failure -> full enable ... window timer if it is active */ - if ( mtcTimer_mnfa.tid ) - mtcTimer_stop ( mtcTimer_mnfa ); + mtcTimer_reset ( mtcTimer_mnfa ); /* Start the timer that will eventually send the MTC_RECOVER_HBS command */ mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, MTC_MNFA_RECOVERY_TIMER ); diff --git a/mtce-common/cgts-mtce-common-1.0/scripts/mtc.conf b/mtce-common/cgts-mtce-common-1.0/scripts/mtc.conf index 890c3b11..92d0b9db 100644 --- a/mtce-common/cgts-mtce-common-1.0/scripts/mtc.conf +++ b/mtce-common/cgts-mtce-common-1.0/scripts/mtc.conf @@ -18,14 +18,6 @@ offline_threshold = 46 ; number of back to back mtcAlive requests missed ; 100:46 will yield a typical 5 sec holdoff from ; failed to offline -mnfa_threshold_type = number ; Two different types are supported - ; 'number' or 'percent' of simultaneous - ; failures that enable multi-node - ; failure avoidance handling - ; -mnfa_threshold_percent = 10 ; if ( mnfa_threshold_type == percent ) -mnfa_threshold_number = 3 ; if ( mnfa_threshold_type == number ) - inventory_port = 6385 ; The Inventory Port Number keystone_port = 5000 ; The Keystone Port Number ha_port = 7777 ; The Inventory Port Number @@ -86,13 +78,6 @@ loc_recovery_timeout = 5 ; Loss Of Communication Recovery Timeout ; the max number of seconds that a host can be in ; loss of communication state without failing the unit -mnfa_recovery_timeout = 100 ; Multi-Node-Failure Avoidance Recovery Threshold - ; Similar to the LOC above for graceful recovery - ; hosts that have LOC for longer than this time in - ; seconds are failed and sent into the enable_handler - ; FSM while those that recover before this period are - ; sent into the graceful recovery_handler FSM. - dor_mode_timeout = 20 ; The default base time in seconds for how long ; maintenance DOR mode is active. This number ; is extended by the number of enabled hosts.