diff --git a/mtce-common/centos/build_srpm.data b/mtce-common/centos/build_srpm.data index a6783bd5..d872dde9 100644 --- a/mtce-common/centos/build_srpm.data +++ b/mtce-common/centos/build_srpm.data @@ -1,3 +1,3 @@ SRC_DIR="src" -TIS_PATCH_VER=140 +TIS_PATCH_VER=141 BUILD_IS_SLOW=5 diff --git a/mtce-common/src/common/logMacros.h b/mtce-common/src/common/logMacros.h index 8d73fe1c..55a72496 100644 --- a/mtce-common/src/common/logMacros.h +++ b/mtce-common/src/common/logMacros.h @@ -166,6 +166,18 @@ typedef struct int kernwd_update_period ; /**< expect kernel watchdog to be updated */ int autorecovery_threshold ; /**< AIO stop autorecovery threshold */ + /**< Auto Recovery Thresholds */ + int ar_config_threshold ; /**< Configuration Failure Threshold */ + int ar_goenable_threshold ; /**< GoEnable Failure Threshold */ + int ar_hostservices_threshold ; /**< Host Services Failure Threshold */ + int ar_heartbeat_threshold ; /**< Heartbeat Soak Failure Threshold*/ + + /**< Auto Recovery Retry Intervals */ + int ar_config_interval ; /**< Configuration Failure Interval */ + int ar_goenable_interval ; /**< GoEnable Failure Interval */ + int ar_hostservices_interval ; /**< Host Services Failure Interval */ + int ar_heartbeat_interval ; /**< Heartbeat Soak Failure Interval */ + int debug_all ; int debug_json ; /**< Enable jlog (json string ) output if not false */ int debug_timer ; /**< Enable tlog (timer logs ) output if not false */ diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h index 9e8586cd..9d113768 100755 --- a/mtce-common/src/common/nodeBase.h +++ b/mtce-common/src/common/nodeBase.h @@ -229,28 +229,35 @@ void daemon_exit ( void ); #define MTC_TASK_INIT_FAIL "Initialization Failed, recovering" #define MTC_TASK_START_SERVICE_FAIL "Start Services Failed" #define MTC_TASK_START_SERVICE_TO "Start Services Timeout" -#define MTC_TASK_ENABLING "Enabling" -#define MTC_TASK_ENABLING_SUBF "Enabling Compute Service" -#define 
MTC_TASK_ENABLING_SUBF_FAIL "Enabling Compute Service Failed" -#define MTC_TASK_ENABLING_SUBF_TO "Enabling Compute Service Timeout" -#define MTC_TASK_ENABLE_WORK_FAIL "Enable Action Failed, re-enabling" -#define MTC_TASK_ENABLE_WORK_FAIL_ "Enable Action Failed" -#define MTC_TASK_ENABLE_WORK_TO "Enable Action Timeout, re-enabling" -#define MTC_TASK_ENABLE_WORK_TO_ "Enable Action Timeout" +#define MTC_TASK_ENABLE_WORK_FAIL "Enable Action Failed" +#define MTC_TASK_ENABLE_WORK_TO "Enable Action Timeout" #define MTC_TASK_ENABLE_FAIL_HB "Enable Heartbeat Failure, re-enabling" #define MTC_TASK_RECOVERY_FAIL "Graceful Recovery Failed, re-enabling" #define MTC_TASK_RECOVERY_WAIT "Graceful Recovery Wait" #define MTC_TASK_RECOVERED "Gracefully Recovered" + +#define MTC_TASK_ENABLING "Enabling" #define MTC_TASK_MAIN_CONFIG_FAIL "Configuration Failed, re-enabling" -#define MTC_TASK_SUBF_CONFIG_FAIL "Compute Configuration Failed, re-enabling" -#define MTC_TASK_SUBF_CONFIG_FAIL_ "Compute Configuration Failed" #define MTC_TASK_MAIN_CONFIG_TO "Configuration Timeout, re-enabling" +#define MTC_TASK_MAIN_INTEST_FAIL "In-Test Failed, re-enabling" +#define MTC_TASK_MAIN_INTEST_TO "In-Test Timeout, re-enabling" +#define MTC_TASK_MAIN_SERVICE_FAIL "Start Services Failed, re-enabling" +#define MTC_TASK_MAIN_SERVICE_TO "Start Services Timeout, re-enabling" + +#define MTC_TASK_ENABLING_SUBF "Enabling Compute Service" +#define MTC_TASK_SUBF_CONFIG_FAIL "Compute Configuration Failed, re-enabling" #define MTC_TASK_SUBF_CONFIG_TO "Compute Configuration Timeout, re-enabling" -#define MTC_TASK_SUBF_CONFIG_TO_ "Compute Configuration Timeout" -#define MTC_TASK_INTEST_FAIL "In-Test Failed, re-enabling" -#define MTC_TASK_INTEST_FAIL_ "In-Test Failed" -#define MTC_TASK_INTEST_FAIL_TO "In-Test Timeout, re-enabling" -#define MTC_TASK_INTEST_FAIL_TO_ "In-Test Timeout" +#define MTC_TASK_SUBF_INTEST_FAIL "Compute In-Test Failed, re-enabling" +#define MTC_TASK_SUBF_INTEST_TO "Compute In-Test Timeout, 
re-enabling" +#define MTC_TASK_SUBF_SERVICE_FAIL "Compute Start Services Failed, re-enabling" +#define MTC_TASK_SUBF_SERVICE_TO "Compute Start Services Timeout, re-enabling" + +#define MTC_TASK_AR_DISABLED_CONFIG "Configuration failure, threshold reached, Lock/Unlock to retry" +#define MTC_TASK_AR_DISABLED_GOENABLE "In-Test Failure, threshold reached, Lock/Unlock to retry" +#define MTC_TASK_AR_DISABLED_SERVICES "Service Failure, threshold reached, Lock/Unlock to retry" +#define MTC_TASK_AR_DISABLED_ENABLE "Enable Failure, threshold reached, Lock/Unlock to retry" +#define MTC_TASK_AR_DISABLED_HEARTBEAT "Heartbeat Failure, threshold reached, Lock/Unlock to retry" + #define MTC_TASK_RESET_FAIL "Reset Failed" #define MTC_TASK_RESET_QUEUE "Reset Failed, retrying (%d of %d)" #define MTC_TASK_POWERON_FAIL "Power-On Failed" @@ -275,8 +282,6 @@ void daemon_exit ( void ); #define MTC_TASK_RESETTING_HOST "Resetting Host, critical sensor" #define MTC_TASK_CPE_SX_UNLOCK_MSG "Unlocking, please stand-by while the system gracefully reboots" #define MTC_TASK_SELF_UNLOCK_MSG "Unlocking active controller, please stand-by while it reboots" -#define MTC_TASK_AUTO_RECOVERY "Critical failure. Auto-recovery enabled, re-enabling" -#define MTC_TASK_AUTO_RECOVERY_DISABLED "Critical failure. 
Auto-recovery disabled, threshold reached" #define MTC_TASK_FAILED_SWACT_REQ "Critical failure.Requesting SWACT to enabled standby controller" #define MTC_TASK_FAILED_NO_BACKUP "Critical failure.Please provision/enable standby controller" @@ -1176,13 +1181,6 @@ typedef enum MTC_STRESS_TEST__STAGES = 6, } mtc_stressStages_enum ; -typedef union -{ - mtc_enableStages_enum enable ; - mtc_disableStages_enum disable ; - int raw ; -} mtc_stages_union ; - typedef struct { mtc_nodeAdminAction_enum adminAction ; @@ -1210,6 +1208,24 @@ typedef enum MAX_IFACES = 2 } iface_enum ; +/* Auto recovery Disable Causes */ +typedef enum +{ + MTC_AR_DISABLE_CAUSE__CONFIG, + MTC_AR_DISABLE_CAUSE__GOENABLE, + MTC_AR_DISABLE_CAUSE__HOST_SERVICES, + MTC_AR_DISABLE_CAUSE__HEARTBEAT, + MTC_AR_DISABLE_CAUSE__LAST, + MTC_AR_DISABLE_CAUSE__NONE, +} autorecovery_disable_cause_enum ; + +/* Service Based Auto Recovery Control Structure */ +typedef struct +{ + unsigned int count ; /* running back-2-back failure count */ + bool disabled ; /* true if autorecovery is disabled */ +} autorecovery_cause_ctrl_type ; + /** Returns true if the specified admin state string is valid */ bool adminStateOk ( string admin ); diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index 04d96330..e88b84be 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -236,9 +236,6 @@ nodeLinkClass::nodeLinkClass() this->loc_recovery_timeout = 0; this->node_reinstall_timeout = 0; this->token_refresh_rate = 0; - this->autorecovery_enabled = false ; - this->autorecovery_disabled = false ; - head = tail = NULL; memory_allocs = 0 ; @@ -313,6 +310,11 @@ nodeLinkClass::nodeLinkClass() sysinv_noncrit_timeout = HTTP_SYSINV_NONC_TIMEOUT ; work_queue_timeout = MTC_WORKQUEUE_TIMEOUT ; + /* Init the auto recovery threshold and intervals to zero until + * modified by daemon config */ + memset (&ar_threshold, 0, sizeof(ar_threshold)); + memset (&ar_interval, 0, sizeof(ar_interval)); + /* 
Inservice test periods in seconds - 0 = disabled */ insv_test_period = 0 ; oos_test_period = 0 ; @@ -340,7 +342,6 @@ nodeLinkClass::nodeLinkClass() tokenEvent.buf = NULL ; unknown_host_throttle = 0 ; - invalid_arg_throttle = 0 ; testmode = 0 ; module_init( ); @@ -564,6 +565,11 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname ) ptr->was_dor_recovery_mode= false ; ptr->dor_recovery_time = 0 ; + ptr->ar_disabled = false ; + ptr->ar_cause = MTC_AR_DISABLE_CAUSE__NONE ; + memset (&ptr->ar_count, 0, sizeof(ptr->ar_count)); + ptr->ar_log_throttle = 0 ; + mtcTimer_init ( ptr->mtcTimer, hostname, "mtc timer"); /* Init node's general mtc timer */ mtcTimer_init ( ptr->insvTestTimer, hostname, "insv test timer"); mtcTimer_init ( ptr->oosTestTimer, hostname, "oos test timer"); /* Init node's oos test timer */ @@ -603,7 +609,8 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname ) ptr->subStage = MTC_SUBSTAGE__DONE ; ptr->reinstallStage = MTC_REINSTALL__DONE ; ptr->resetStage = MTC_RESET__START ; - ptr->handlerStage.enable = MTC_ENABLE__START ; /* Enable and Disable */ + ptr->enableStage = MTC_ENABLE__START ; + ptr->disableStage = MTC_DISABLE__START ; ptr->oos_test_count = 0 ; ptr->insv_test_count = 0 ; @@ -613,9 +620,8 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname ) ptr->uptime_refresh_counter = 0 ; ptr->node_unlocked_counter = 0 ; - /* Default to a healthy config until mtcAlive messages prove otherwise */ - ptr->mtce_flags = ( MTC_FLAG__I_AM_CONFIGURED | - MTC_FLAG__I_AM_HEALTHY ) ; + /* Good health needs to be learned */ + ptr->mtce_flags = 0 ; ptr->graceful_recovery_counter = 0 ; ptr->health_threshold_counter = 0 ; @@ -746,8 +752,6 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname ) ptr->adminAction = MTC_ADMIN_ACTION__NONE ; ptr->adminAction_todo_list.clear(); - ptr->handlerStage.enable = MTC_ENABLE__START; - hosts++ ; /* (re)build the Resource Reference Array */ @@ -4504,16 +4508,31 @@ void 
nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface } else { - //bool want_degrade = true ; - //if ( this->hbs_failure_action == HBS_FAILURE_ACTION__ALARM ) - // want_degrade = false ; - // alarm_enabled_failure (node_ptr, want_degrade); + /* handle auto recovery for heartbeat failure during enable */ + if ( node_ptr->ar_cause == MTC_AR_DISABLE_CAUSE__HEARTBEAT ) + return ; + else if ( node_ptr->enableStage == MTC_ENABLE__HEARTBEAT_SOAK ) + { + elog ("%s %s *** Heartbeat Loss *** (during enable soak)\n", + hostname.c_str(), + get_iface_name_str(iface)); + + if ( ar_manage ( node_ptr, + MTC_AR_DISABLE_CAUSE__HEARTBEAT, + MTC_TASK_AR_DISABLED_HEARTBEAT ) == PASS ) + { + mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLE_FAIL_HB ); + enableStageChange ( node_ptr, MTC_ENABLE__FAILURE ); + adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); + } + return ; + } mnfa_add_host ( node_ptr , iface ); if ( mnfa_active == false ) { - elog ("%s %s *** Heartbeat Loss ***\n", hostname.c_str(), get_iface_name_str(iface)); + elog ("%s %s *** Heartbeat Loss ***\n", hostname.c_str(), get_iface_name_str(iface)); if ( iface == INFRA_IFACE ) { node_ptr->heartbeat_failed[INFRA_IFACE] = true ; @@ -4546,6 +4565,7 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface { mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLE_FAIL_HB ); enableStageChange ( node_ptr, MTC_ENABLE__FAILURE ); + adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); } } } @@ -5485,7 +5505,7 @@ int nodeLinkClass::critical_process_failed( string & hostname, /* Special critical process failure handling for AIO system */ if ( THIS_HOST && ( is_inactive_controller_main_insv() == false )) { - if ( this->autorecovery_disabled == true ) + if ( node_ptr->ar_disabled == true ) { dlog ("%s bypassing persistent critical process failure\n", node_ptr->hostname.c_str()); @@ -5510,7 +5530,7 @@ int nodeLinkClass::critical_process_failed( string & hostname, dlog ("%s adminState:%s 
EnableStage:%s\n", node_ptr->hostname.c_str(), adminAction_enum_to_str(node_ptr->adminAction).c_str(), - get_enableStages_str(node_ptr->handlerStage.enable).c_str()); + get_enableStages_str(node_ptr->enableStage).c_str()); } return (PASS); } @@ -5843,7 +5863,7 @@ int nodeLinkClass::set_enableStage ( string & hostname, nodeLinkClass::node * node_ptr = getNode ( hostname ) ; if ( node_ptr != NULL ) { - node_ptr->handlerStage.enable = stage ; + node_ptr->enableStage = stage ; return (PASS); } return (FAIL); @@ -5867,7 +5887,7 @@ mtc_enableStages_enum nodeLinkClass::get_enableStage ( string & hostname) nodeLinkClass::node * node_ptr = getNode ( hostname ) ; if ( node_ptr != NULL ) { - return ( node_ptr->handlerStage.enable ) ; + return ( node_ptr->enableStage ) ; } return (MTC_ENABLE__STAGES); } @@ -6124,16 +6144,15 @@ int nodeLinkClass::adminActionChange ( struct nodeLinkClass::node * node_ptr, node_ptr->node_unlocked_counter++ ; } - if ( is_controller ( node_ptr ) ) - autorecovery_clear (node_ptr->hostname); + ar_enable (node_ptr); - node_ptr->handlerStage.enable = MTC_ENABLE__START ; + node_ptr->enableStage = MTC_ENABLE__START ; break ; } case MTC_ADMIN_ACTION__LOCK: case MTC_ADMIN_ACTION__FORCE_LOCK: { - node_ptr->handlerStage.disable = MTC_DISABLE__START ; + node_ptr->disableStage = MTC_DISABLE__START ; break ; } case MTC_ADMIN_ACTION__RESET: @@ -6423,26 +6442,26 @@ int nodeLinkClass::enableStageChange ( struct nodeLinkClass::node * node_ptr, { /* TODO: Consider converting stage to strings ... */ if (( newHdlrStage >= MTC_ENABLE__STAGES ) || - ( node_ptr->handlerStage.enable >= MTC_ENABLE__STAGES )) + ( node_ptr->enableStage >= MTC_ENABLE__STAGES )) { slog ("%s has invalid Enable stage (%d:%d)\n", node_ptr->hostname.c_str(), - node_ptr->handlerStage.enable, + node_ptr->enableStage, newHdlrStage ); - node_ptr->handlerStage.enable = MTC_ENABLE__FAILURE ; + node_ptr->enableStage = MTC_ENABLE__FAILURE ; /* TODO: cause failed or degraded state ? 
*/ return (FAIL); } - else if ( node_ptr->handlerStage.enable != newHdlrStage ) + else if ( node_ptr->enableStage != newHdlrStage ) { clog ("%s %s -> %s\n", node_ptr->hostname.c_str(), - get_enableStages_str(node_ptr->handlerStage.enable).c_str(), + get_enableStages_str(node_ptr->enableStage).c_str(), get_enableStages_str(newHdlrStage).c_str()); - node_ptr->handlerStage.enable = newHdlrStage ; + node_ptr->enableStage = newHdlrStage ; return (PASS); } else @@ -6450,7 +6469,7 @@ int nodeLinkClass::enableStageChange ( struct nodeLinkClass::node * node_ptr, /* No state change */ dlog1 ("%s %s -> %s\n", node_ptr->hostname.c_str(), - get_enableStages_str(node_ptr->handlerStage.enable).c_str(), + get_enableStages_str(node_ptr->enableStage).c_str(), get_enableStages_str(newHdlrStage).c_str()); return (PASS); } @@ -6461,15 +6480,15 @@ int nodeLinkClass::disableStageChange ( struct nodeLinkClass::node * node_ptr, mtc_disableStages_enum newHdlrStage ) { /* TODO: Consider converting stage to strings ... */ - if (( newHdlrStage >= MTC_DISABLE__STAGES ) || - ( node_ptr->handlerStage.disable >= MTC_DISABLE__STAGES )) + if (( newHdlrStage >= MTC_DISABLE__STAGES ) || + ( node_ptr->disableStage >= MTC_DISABLE__STAGES )) { - slog ("%s has invalid disable stage (%d:%d)\n", + slog ("%s has invalid disable stage (%d:%d)\n", node_ptr->hostname.c_str(), - node_ptr->handlerStage.disable, + node_ptr->disableStage, newHdlrStage ); - node_ptr->handlerStage.disable = MTC_DISABLE__DISABLED ; + node_ptr->disableStage = MTC_DISABLE__DISABLED ; /* TODO: cause failed or degraded state ? 
*/ return (FAIL); @@ -6478,10 +6497,10 @@ int nodeLinkClass::disableStageChange ( struct nodeLinkClass::node * node_ptr, { clog ("%s %s -> %s\n", node_ptr->hostname.c_str(), - get_disableStages_str(node_ptr->handlerStage.disable).c_str(), + get_disableStages_str(node_ptr->disableStage).c_str(), get_disableStages_str(newHdlrStage).c_str()); - node_ptr->handlerStage.disable = newHdlrStage ; + node_ptr->disableStage = newHdlrStage ; return (PASS); } } @@ -7053,61 +7072,131 @@ struct nodeLinkClass::node * nodeLinkClass::get_insvTestTimer ( timer_t tid ) return static_cast(NULL); } - /***************************************************************************** * - * Name : autorecovery_clear + * Name : ar_enable * - * Assumptions: Applies when simplex. - * - * Description: Removes the auto recovery count file if it exists. - * - * Auto recovery count is tracked/preserved in a host named auto recovery - * counter file /tmp/hostname_ar_count. - * - *****************************************************************************/ - -void autorecovery_clear ( string hostname ) -{ - string ar_file = TMP_DIR_PATH + hostname + AUTO_RECOVERY_FILE_SUFFIX ; - if ( daemon_is_file_present (ar_file.data())) - { - wlog ("%s clearing autorecovery counter\n", hostname.c_str()); - daemon_remove_file (ar_file.data()); - } -} - -/***************************************************************************** - * - * Name : manage_autorecovery - * - * Assumptions: Applies to the active controller only while simplex. - * - * Description: Issues an immediate lazy reboot if the autorecovery threshold - * is reached. Otherwise it disables autorecovery and returns - * do we don't get a rolling boot loop. + * Description: Clears all auto recovery state for the specified host and + * removes the auto recovery count file if it exists. * * Auto recovery count is tracked/preserved in a host named auto recovery * counter file /etc/mtc/tmp/hostname_ar_count. 
* - * in the event of a persistent autorecovery failure that results in a - * disable then the active controller goes enabled-degraded with a horizon - * status that indicates the active controller has a critical failure but - * auto recovery is disabled. The enable alarm is raised. - * *****************************************************************************/ -void nodeLinkClass::manage_autorecovery ( struct nodeLinkClass::node * node_ptr ) +void nodeLinkClass::ar_enable ( struct nodeLinkClass::node * node_ptr ) { - /* manage removing the auto recovery threshold count file */ - if ( ( THIS_HOST ) && - ( this->autorecovery_enabled == true ) && - ( this->autorecovery_disabled == false ) && - ( is_inactive_controller_main_insv() == false )) + string ar_file = TMP_DIR_PATH + node_ptr->hostname + AUTO_RECOVERY_FILE_SUFFIX ; + if ( daemon_is_file_present (ar_file.data())) { - int value = 0 ; - string ar_file = TMP_DIR_PATH + node_ptr->hostname + AUTO_RECOVERY_FILE_SUFFIX ; - int threshold = daemon_get_cfg_ptr()->autorecovery_threshold ; + wlog ("%s clearing autorecovery file counter\n", node_ptr->hostname.c_str()); + daemon_remove_file (ar_file.data()); + } + + if (( node_ptr->ar_disabled ) || + ( node_ptr->ar_cause != MTC_AR_DISABLE_CAUSE__NONE )) + { + wlog ("%s re-enabling autorecovery\n", node_ptr->hostname.c_str()); + } + + node_ptr->ar_disabled = false ; + node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__NONE ; + memset (&node_ptr->ar_count, 0, sizeof(node_ptr->ar_count)); + + node_ptr->ar_log_throttle = 0 ; +} + +/***************************************************************************** + * + * Name : ar_manage + * + * Purpose : Manage Auto Recovery state. + * + * Description: the following checks and operations are performed ... + * + * Pre Checks: + * + * Validate auto recovery cause code + * Return if already in ar_disabled state. Unlikely but safe guard. 
+ * + * Manage Auto Recovery: + * + * Case 1: Failed active controller with no enabled inactive controller. + * + * Requires persistent count file and self reboot until threshold + * is reached. + * + * Issues an immediate lazy reboot if the autorecovery threshold + * is not reached. Otherwise it disables autorecovery and returns + * so we don't get a rolling boot loop. + * + * Auto recovery count is tracked/preserved in a host named auto + * recovery counter file /etc/mtc/tmp/hostname_ar_count. + * + * Case 2: All other cases + * + * Case 2a: No auto recovery thresholding of active controller in non AIO SX + * where the user can't lock and unlock the active controller. + * + * Maintain auto recovery count and set ar_disabled for the host when + * the threshold is reached. + * + * Parameters: + * + * node_ptr nodeLinkClass ptr of failing host. + * + * cause autorecovery_disable_cause_enum failure cause code. + * + * string host status string to display when auto recovery + * threshold is reached and autorecovery is disabled. + * + * Returns: + * + * FAIL tells the caller to break from its FSM at earliest opportunity + * because auto recovery threshold is reached and auto recovery + * is disabled. + * + * PASS tells the caller that the threshold is not reached and to + * continue handling the failure. 
+ * + ******************************************************************************/ + +int nodeLinkClass::ar_manage ( struct nodeLinkClass::node * node_ptr, + autorecovery_disable_cause_enum cause, + string ar_disable_banner ) +{ + int rc = FAIL ; + + /* Auto recovery only applies for hosts that are unlocked + * and not already in ar_disabled state */ + if (( node_ptr->adminState != MTC_ADMIN_STATE__UNLOCKED ) || + ( node_ptr->ar_disabled )) + { + return (rc); + } + + /* check for invalid call case */ + if ( cause >= MTC_AR_DISABLE_CAUSE__LAST ) + { + slog ("%s called with invalid auto recovery cause (%d)", + node_ptr->hostname.c_str(), cause ); + return (rc); + } + + /* update cause code */ + if ( node_ptr->ar_cause != cause ) + node_ptr->ar_cause = cause ; + + + /* Case 1 check */ + if ( ( THIS_HOST ) && ( is_inactive_controller_main_insv() == false )) + { + /* manage the auto recovery threshold count file */ + unsigned int value = 0 ; + + string ar_file = TMP_DIR_PATH + + node_ptr->hostname + + AUTO_RECOVERY_FILE_SUFFIX ; if ( daemon_is_file_present (ar_file.data())) { @@ -7119,48 +7208,75 @@ void nodeLinkClass::manage_autorecovery ( struct nodeLinkClass::node * node_ptr /* Save the new value in the file */ daemon_log_value ( ar_file.data(), value ); + value = daemon_get_file_int ( ar_file.data() ); + /* set rc to reflect what the caller should do */ - if ( value > threshold ) + if ( value > this->ar_threshold[node_ptr->ar_cause] ) { elog ("%s auto recovery threshold exceeded (%d)\n", - node_ptr->hostname.c_str(), threshold ); + node_ptr->hostname.c_str(), + this->ar_threshold[node_ptr->ar_cause] ); - this->autorecovery_disabled = true ; - - if ( this->system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX ) - { - alarm_compute_failure ( node_ptr , FM_ALARM_SEVERITY_CRITICAL ) ; - } - else - { - alarm_enabled_failure ( node_ptr , true ) ; - } + node_ptr->ar_disabled = true ; + adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); allStateChange ( node_ptr, 
node_ptr->adminState, MTC_OPER_STATE__ENABLED, MTC_AVAIL_STATUS__DEGRADED ); - mtcInvApi_update_task ( node_ptr, - MTC_TASK_AUTO_RECOVERY_DISABLED ); + mtcInvApi_update_task ( node_ptr, ar_disable_banner ); - return ; + return (rc); } - wlog ("%s auto recovery (try %d of %d)\n", - node_ptr->hostname.c_str(), value , threshold ); + wlog ("%s auto recovery (try %d of %d) (%d)", + node_ptr->hostname.c_str(), + value, + this->ar_threshold[node_ptr->ar_cause], + node_ptr->ar_cause); - mtcInvApi_update_states_now ( node_ptr, - "unlocked", - "disabled", - "failed", - "disabled", - "failed" ); - - mtcInvApi_update_task_now ( node_ptr, - MTC_TASK_AUTO_RECOVERY ); + mtcInvApi_update_states_now ( node_ptr, "unlocked", + "disabled", "failed", + "disabled", "failed" ); lazy_graceful_fs_reboot ( node_ptr ); } + else /* Case 2 */ + { + send_hbs_command ( node_ptr->hostname, MTC_CMD_STOP_HOST ); + mtcInvApi_update_states ( node_ptr, "unlocked", "disabled", "failed" ); + + if (( NOT_THIS_HOST ) && + ( this->system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX )) + { + if ( ++node_ptr->ar_count[node_ptr->ar_cause] >= + this->ar_threshold [node_ptr->ar_cause] ) + { + elog ("%s auto recovery threshold exceeded (%d)\n", + node_ptr->hostname.c_str(), + this->ar_threshold[node_ptr->ar_cause] ); + node_ptr->ar_disabled = true ; + adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); + mtcInvApi_update_task ( node_ptr, ar_disable_banner ); + rc = FAIL ; + } + else + { + wlog ("%s auto recovery (try %d of %d) (%d)", + node_ptr->hostname.c_str(), + node_ptr->ar_count[node_ptr->ar_cause], + this->ar_threshold[node_ptr->ar_cause], + node_ptr->ar_cause); + rc = PASS ; + } + } + else + { + wlog ("%s auto recovery\n", node_ptr->hostname.c_str()); + rc = PASS ; + } + } + return (rc); } /**************************************************************************** @@ -7198,13 +7314,8 @@ void nodeLinkClass::report_dor_recovery ( struct nodeLinkClass::node * node_ptr, void nodeLinkClass::force_full_enable 
( struct nodeLinkClass::node * node_ptr ) { - /* don't do a full enable if active controller in simplex mode */ - if ( THIS_HOST && SIMPLEX ) - { - wlog ("%s avoiding full enable of simplex system\n", node_ptr->hostname.c_str()); - wlog ("%s ... lock and unlock host to force recovery\n", node_ptr->hostname.c_str()); + if ( node_ptr->ar_disabled == true ) return ; - } if ( node_ptr->was_dor_recovery_mode ) { @@ -8599,13 +8710,14 @@ void nodeLinkClass::mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr ) void nodeLinkClass::mem_log_stage ( struct nodeLinkClass::node * node_ptr ) { char str[MAX_MEM_LOG_DATA] ; - snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAdd:%d Offline:%d: Swact:%d Recovery:%d Able:%d\n", + snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAdd:%d Offline:%d: Swact:%d Recovery:%d Enable:%d Disable:%d\n", node_ptr->hostname.c_str(), - node_ptr->addStage, - node_ptr->offlineStage, - node_ptr->swactStage, + node_ptr->addStage, + node_ptr->offlineStage, + node_ptr->swactStage, node_ptr->recoveryStage, - node_ptr->handlerStage.raw); + node_ptr->enableStage, + node_ptr->disableStage); mem_log (str); } diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h index e5c39172..da61e779 100755 --- a/mtce/src/common/nodeClass.h +++ b/mtce/src/common/nodeClass.h @@ -254,13 +254,9 @@ private: mtc_nodeOperState_enum operState_dport ; /**< Data Port Operational State */ mtc_nodeAvailStatus_enum availStatus_dport; /**< Data Port Availability Status */ - - /** Maintains the current handler stage. - * This is a union of all handler types such as enable, - * disable, degrade etc. 
See nodeBase.h for list of union members */ - mtc_stages_union handlerStage; - /* Individual FSM handler stages */ + mtc_enableStages_enum enableStage ; + mtc_disableStages_enum disableStage ; mtc_offlineStages_enum offlineStage ; mtc_onlineStages_enum onlineStage ; mtc_swactStages_enum swactStage ; @@ -380,6 +376,24 @@ private: /** when true requests the task for this host be cleared at first opportunity */ bool clear_task ; + /******* Auto Recovery Control Structure and member Functions ********/ + + /* reason/cause based host level enable failure counter */ + unsigned int ar_count[MTC_AR_DISABLE_CAUSE__LAST] ; + + /* The last enable failure reason/cause. + * Note: MTC_AR_DISABLE_CAUSE__NONE is no failure (default) */ + autorecovery_disable_cause_enum ar_cause ; + + /* when true indicates that a host has reached its enable failure + * threshold and is left in the unlocked-disabled state */ + bool ar_disabled ; + + /* throttles the ar_disabled log to periodically indicate auto + * recovery disabled state but avoid flooding that same message. */ + #define AR_LOG_THROTTLE_THRESHOLD (100000) + unsigned int ar_log_throttle ; + /** Host's mtc timer struct. Use to time handler stages. * * reset -> reset command response @@ -870,9 +884,10 @@ private: int update_dport_states ( struct nodeLinkClass::node * node_ptr, int event ); - /* manage deciding to return or issue an immediate reboot if the - * auto recovery threshold is exceeded. */ - void manage_autorecovery ( struct nodeLinkClass::node * node_ptr ); + /* manage auto recovery */ + int ar_manage ( struct nodeLinkClass::node * node_ptr, + autorecovery_disable_cause_enum cause, + string ar_disable_banner ); /** *********************************************************************** * @@ -1041,6 +1056,12 @@ private: void clear_main_failed_bools ( struct nodeLinkClass::node * node_ptr ); void clear_hostservices_ctls ( struct nodeLinkClass::node * node_ptr ); + /* Enables/Clears dynamic auto recovery state. start fresh ! 
+ * called in disabled_handler (lock) and in the DONE stages + * of the enable handler. */ + void ar_enable ( struct nodeLinkClass::node * node_ptr ); + + /** Find the node that has this timerID in its general mtc timer */ struct nodeLinkClass::node * get_mtcTimer_timer ( timer_t tid ); struct nodeLinkClass::node * get_mtcConfig_timer ( timer_t tid ); @@ -2005,22 +2026,36 @@ public: int compute_mtcalive_timeout; int controller_mtcalive_timeout ; int goenabled_timeout ; + + /** /etc/mtc.conf configurable audit intervals */ int swact_timeout ; int sysinv_timeout ; int sysinv_noncrit_timeout ; - int loc_recovery_timeout ; /**< Loss Of Communication Recovery Timeout */ - int work_queue_timeout ; + int loc_recovery_timeout ; /**< Loss Of Communication Recovery Timeout */ + int work_queue_timeout ; int node_reinstall_timeout ; - /** /etc/mtc.ini configurable audit intervals */ int insv_test_period ; int oos_test_period ; int uptime_period ; int online_period ; int token_refresh_rate; + /* Service specific max failures before autorecovery is disabled. + * + * ... values for each service are loaded from mtc config + * file at daemon startup + */ + unsigned int ar_threshold[MTC_AR_DISABLE_CAUSE__LAST] ; + + /* service specific secs between autorecovery retries. + * + * ... 
values for each service are loaded from mtc config + * file at daemon startup + */ + unsigned int ar_interval[MTC_AR_DISABLE_CAUSE__LAST] ; + int unknown_host_throttle ; - int invalid_arg_throttle ; }; /** @@ -2052,7 +2087,6 @@ const char * get_adminAction_str ( mtc_nodeAdminAction_enum action ); string bmc_get_ip ( string hostname, string mac , string & current_bm_ip ); void clear_host_degrade_causes ( unsigned int & degrade_mask ); bool sensor_monitoring_supported ( string hostname ); -void autorecovery_clear ( string hostname ); void log_mnfa_pool ( std::list & mnfa_awol_list ); #endif /* __INCLUDE_NODECLASS_H__ */ diff --git a/mtce/src/maintenance/mtcNodeCtrl.cpp b/mtce/src/maintenance/mtcNodeCtrl.cpp index b82ee661..75c26853 100644 --- a/mtce/src/maintenance/mtcNodeCtrl.cpp +++ b/mtce/src/maintenance/mtcNodeCtrl.cpp @@ -316,6 +316,25 @@ static int mtc_config_handler ( void * user, mtcInv.offline_threshold = atoi(value); ilog ("OfflineThrsh: %d\n", mtcInv.offline_threshold ); } + + else if (MATCH("agent", "ar_config_threshold")) + mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__CONFIG] = atoi(value); + else if (MATCH("agent", "ar_goenable_threshold")) + mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__GOENABLE] = atoi(value); + else if (MATCH("agent", "ar_hostservices_threshold")) + mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__HOST_SERVICES] = atoi(value); + else if (MATCH("agent", "ar_heartbeat_threshold")) + mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__HEARTBEAT] = atoi(value); + + else if (MATCH("agent", "ar_config_interval")) + mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__CONFIG] = atoi(value); + else if (MATCH("agent", "ar_goenable_interval")) + mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__GOENABLE] = atoi(value); + else if (MATCH("agent", "ar_hostservices_interval")) + mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__HOST_SERVICES] = atoi(value); + else if (MATCH("agent", "ar_heartbeat_interval")) + mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__HEARTBEAT] = atoi(value); + else { return (PASS); 
@@ -635,6 +654,20 @@ int daemon_configure ( void ) ilog("hwmond : %d (port)\n", mtc_config.hwmon_cmd_port ); ilog("auth_host : %s \n", mtc_config.keystone_auth_host ); + /* log system wide service based auto recovery control values */ + ilog("AR Config : %d (threshold) %d sec (retry interval)", + mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__CONFIG], + mtcInv.ar_interval [MTC_AR_DISABLE_CAUSE__CONFIG]); + ilog("AR GoEnable : %d (threshold) %d sec (retry interval)", + mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__GOENABLE], + mtcInv.ar_interval [MTC_AR_DISABLE_CAUSE__GOENABLE]); + ilog("AR Host Svcs: %d (threshold) %d sec (retry interval)", + mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__HOST_SERVICES], + mtcInv.ar_interval [MTC_AR_DISABLE_CAUSE__HOST_SERVICES]); + ilog("AR Heartbeat: %d (threshold) %d sec (retry interval)", + mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__HEARTBEAT], + mtcInv.ar_interval [MTC_AR_DISABLE_CAUSE__HEARTBEAT]); + /* Get this Controller Activity State */ mtc_config.active = daemon_get_run_option ("active") ; ilog ("Controller : %s\n", diff --git a/mtce/src/maintenance/mtcNodeFsm.cpp b/mtce/src/maintenance/mtcNodeFsm.cpp index 20fad599..19eba3ee 100755 --- a/mtce/src/maintenance/mtcNodeFsm.cpp +++ b/mtce/src/maintenance/mtcNodeFsm.cpp @@ -118,7 +118,8 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr ) * the insv_test_handler gets run as soon as a host's main function is enabled. 
**************************************************************************** */ - if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && + if (( node_ptr->ar_disabled == false ) && + ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && ( node_ptr->operState == MTC_OPER_STATE__ENABLED ) && ((node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) || (node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED ))) @@ -265,7 +266,7 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr ) { flog ("%s -> Running SubFunction Enable handler (%d)\n", node_ptr->hostname.c_str(), - node_ptr->handlerStage.enable ); + node_ptr->enableStage ); nodeLinkClass::enable_subf_handler ( node_ptr ); } diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index 73f85b1d..6bd4c42a 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -446,6 +446,15 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) { int rc = PASS ; + if ( node_ptr->ar_disabled == true ) + { + wlog_throttled ( node_ptr->ar_log_throttle, + AR_LOG_THROTTLE_THRESHOLD, + "%s auto recovery disabled cause:%d", + node_ptr->hostname.c_str(), node_ptr->ar_cause ); + return (RETRY); ; + } + if ( THIS_HOST ) { /****************************************************************** @@ -476,7 +485,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) } } - switch ( (int)node_ptr->handlerStage.enable ) + switch ( (int)node_ptr->enableStage ) { case MTC_ENABLE__FAILURE: { @@ -539,7 +548,8 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) if ( is_inactive_controller_main_insv() == true ) { wlog ("%s has critical failure\n", node_ptr->hostname.c_str()); - wlog ("%s ... requesting swact to in-service inactive controller\n", node_ptr->hostname.c_str()); + wlog ("%s ... 
requesting swact to peer controller", + node_ptr->hostname.c_str()); mtcInvApi_update_task_now ( node_ptr, MTC_TASK_FAILED_SWACT_REQ ); @@ -587,19 +597,6 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) } else { - this->autorecovery_enabled = true ; - - /* use thresholded auto recovery for simplext failure case */ - manage_autorecovery ( node_ptr ); - - if ( this->autorecovery_disabled == false ) - { - wlog ("%s has critical failure.\n", node_ptr->hostname.c_str()); - wlog ("%s ... downgrading to degrade with auto recovery disabled\n", node_ptr->hostname.c_str()); - wlog ("%s ... to avoid disabling only enabled controller\n", node_ptr->hostname.c_str()); - this->autorecovery_disabled = true ; - } - if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true )) { /* Raise Critical Compute Function Alarm */ @@ -620,7 +617,22 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) MTC_AVAIL_STATUS__FAILED ); } - if ( degrade_only == true ) + /* if we get here in controller simplex mode then go degraded + * if we are not already degraded. Otherwise, fail. */ + if ( THIS_HOST && ( is_inactive_controller_main_insv() == false )) + { + if (( node_ptr->adminState != MTC_ADMIN_STATE__UNLOCKED ) || + ( node_ptr->operState != MTC_OPER_STATE__ENABLED ) || + ( node_ptr->availStatus != MTC_AVAIL_STATUS__DEGRADED)) + { + allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED, + MTC_OPER_STATE__ENABLED, + MTC_AVAIL_STATUS__DEGRADED ); + } + /* adminAction state is already changed to NONE. */ + } + + else if ( degrade_only == true ) { allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED, MTC_OPER_STATE__ENABLED, @@ -636,25 +648,28 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) /* Inform the VIM of the failure */ mtcVimApi_state_change ( node_ptr, VIM_HOST_FAILED, 3 ); - /* if we get here in controller simplex mode then go degraded - * if we are not already degraded. Otherwise, fail. 
*/ - if ( THIS_HOST && ( is_inactive_controller_main_insv() == false )) + /* handle thresholded auto recovery retry delay interval */ + if ( node_ptr->ar_cause < MTC_AR_DISABLE_CAUSE__LAST ) { - /* autorecovery must be disabled */ - if (( node_ptr->adminState != MTC_ADMIN_STATE__UNLOCKED ) || - ( node_ptr->operState != MTC_OPER_STATE__ENABLED ) || - ( node_ptr->availStatus != MTC_AVAIL_STATUS__DEGRADED)) + unsigned int interval = this->ar_interval[node_ptr->ar_cause] ; + if ( interval ) { - allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED, - MTC_OPER_STATE__ENABLED, - MTC_AVAIL_STATUS__DEGRADED ); + /* Wait this failure cause's retry delay */ + mtcTimer_start ( node_ptr->mtcTimer, + mtcTimer_handler, + interval ); + + wlog ("%s waiting %d secs before enable sequence retry (%d)", + node_ptr->hostname.c_str(), + interval, node_ptr->ar_cause ); } - /* adminAction state is already changed to NONE. */ + else + node_ptr->mtcTimer.ring = true ; } else - { - enableStageChange ( node_ptr, MTC_ENABLE__FAILURE_WAIT ); - } + node_ptr->mtcTimer.ring = true ; + + enableStageChange ( node_ptr, MTC_ENABLE__FAILURE_WAIT ); break; } @@ -717,6 +732,8 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) mtcCmd_workQ_purge ( node_ptr ); mtcCmd_doneQ_purge ( node_ptr ); + node_ptr->mtce_flags = 0 ; + /* Assert the mtc alive gate */ node_ptr->mtcAlive_gate = true ; @@ -739,8 +756,9 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) /* enable auto recovery if the inactive controller * is out of service */ - if (( is_controller (node_ptr) ) && ( NOT_THIS_HOST )) - this->autorecovery_enabled = true ; + //if (( is_controller (node_ptr) ) && ( NOT_THIS_HOST )) + // node_ptr->ar_disabled = false ; + // this->autorecovery_enabled = true ; /* fall through */ @@ -757,20 +775,6 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) get_availStatus_str(node_ptr->availStatus).c_str()); mtcInvApi_update_task ( node_ptr, 
"" ); - - /* Special case */ - // alarm_enabled_clear ( node_ptr, false ); - - //mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__CONFIG ); - //node_ptr->alarms[MTC_ALARM_ID__CONFIG] = FM_ALARM_SEVERITY_CLEAR ; - - //allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED, - // MTC_OPER_STATE__ENABLED, - // MTC_AVAIL_STATUS__DEGRADED ); - - // adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); - - // return (PASS); } else { @@ -986,6 +990,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->mtcAlive_online = false ; node_ptr->mtcAlive_offline = true ; node_ptr->goEnabled = false ; + node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__NONE ; clear_service_readies ( node_ptr ); @@ -1038,17 +1043,24 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) mtcTimer_reset ( node_ptr->mtcTimer ); /* Check to see if the host is/got configured correctly */ - if ( (node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED) == 0 ) + if ((( !node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED )) || + (( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY ))) { - elog ("%s configuration incomplete or failed (oob:%x:%x)\n", + elog ("%s configuration failed or incomplete (oob:%x)\n", node_ptr->hostname.c_str(), - node_ptr->mtce_flags, - MTC_FLAG__I_AM_CONFIGURED); + node_ptr->mtce_flags) /* raise an alarm for the failure of the config */ alarm_config_failure ( node_ptr ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_CONFIG_FAIL ); enableStageChange ( node_ptr, MTC_ENABLE__FAILURE ); + + /* handle auto recovery for this failure */ + if ( ar_manage ( node_ptr, + MTC_AR_DISABLE_CAUSE__CONFIG, + MTC_TASK_AR_DISABLED_CONFIG ) != PASS ) + break ; } else { @@ -1152,6 +1164,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) } case MTC_ENABLE__GOENABLED_WAIT: { + bool goenable_failed = false ; /* The healthy code comes from the host in the mtcAlive message. 
* This 'if' clause was introduced to detected failure of host * without having to wait for the GOENABLED phase to timeout. @@ -1162,27 +1175,22 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) * be gracefully recovered to enabled in that case. Instead * we want to recover the card through a reset as quickly as * possible. */ - if ( node_ptr->health == NODE_UNHEALTHY ) - { - elog ("%s is UNHEALTHY\n", node_ptr->hostname.c_str()); - mtcTimer_reset ( node_ptr->mtcTimer ); - this->force_full_enable ( node_ptr ); - } /* search for the Go Enable message */ - else if ( node_ptr->goEnabled_failed == true ) + if (( node_ptr->health == NODE_UNHEALTHY ) || + (( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY)) || + ( node_ptr->goEnabled_failed == true )) { elog ("%s got GOENABLED Failed\n", node_ptr->hostname.c_str()); mtcTimer_reset ( node_ptr->mtcTimer ); - mtcInvApi_update_task ( node_ptr, MTC_TASK_INTEST_FAIL ); - enableStageChange ( node_ptr, MTC_ENABLE__FAILURE ); + goenable_failed = true ; + mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_INTEST_FAIL ); } /* search for the Go Enable message */ else if ( node_ptr->goEnabled == true ) { mtcTimer_reset ( node_ptr->mtcTimer ); plog ("%s got GOENABLED\n", node_ptr->hostname.c_str()); - // plog ("%s main configured OK\n", node_ptr->hostname.c_str()); /* O.K. clearing the state now that we got it */ node_ptr->goEnabled = false ; @@ -1194,26 +1202,28 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) } else if ( mtcTimer_expired ( node_ptr->mtcTimer )) { - elog ("%s has GOENABLED Timeout\n", node_ptr->hostname.c_str()); - ilog ("%s ... 
the out-of-service tests took too long to complete\n", - node_ptr->hostname.c_str()); - - mtcInvApi_update_task ( node_ptr, MTC_TASK_INTEST_FAIL_TO_ ); + elog ("%s has GOENABLED Timeout", node_ptr->hostname.c_str()); node_ptr->mtcTimer.ring = false ; - - /* raise an alarm for the enable failure */ - alarm_enabled_failure ( node_ptr , true ); - - /* go back and issue reboot again */ - enableStageChange ( node_ptr, MTC_ENABLE__FAILURE ); - - /* no longer In-Test ; we are 'Failed' again" */ - availStatusChange ( node_ptr, MTC_AVAIL_STATUS__FAILED ); + goenable_failed = true ; + mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_INTEST_TO ); } else { ; /* wait some more */ } + + if ( goenable_failed ) + { + alarm_enabled_failure ( node_ptr, true ); + + enableStageChange ( node_ptr, MTC_ENABLE__FAILURE ); + + /* handle auto recovery for this failure */ + if ( ar_manage ( node_ptr, + MTC_AR_DISABLE_CAUSE__GOENABLE, + MTC_TASK_AR_DISABLED_GOENABLE ) != PASS ) + break ; + } break ; } @@ -1224,14 +1234,20 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) plog ("%s Starting Host Services\n", node_ptr->hostname.c_str()); if ( this->launch_host_services_cmd ( node_ptr, start ) != PASS ) { - node_ptr->hostservices_failed = true ; - elog ("%s %s failed ; launch\n", node_ptr->hostname.c_str(), node_ptr->host_services_req.name.c_str()); - mtcInvApi_update_task ( node_ptr, MTC_TASK_START_SERVICE_FAIL ); + node_ptr->hostservices_failed = true ; + alarm_enabled_failure ( node_ptr, true ); enableStageChange ( node_ptr, MTC_ENABLE__FAILURE ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_SERVICE_FAIL ); + + /* handle auto recovery for this failure */ + if ( ar_manage ( node_ptr, + MTC_AR_DISABLE_CAUSE__HOST_SERVICES, + MTC_TASK_AR_DISABLED_SERVICES ) != PASS ) + break ; } else { @@ -1261,6 +1277,10 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) else if ( rc != PASS ) { node_ptr->hostservices_failed = true ; + 
alarm_enabled_failure ( node_ptr, true ); + enableStageChange ( node_ptr, MTC_ENABLE__FAILURE ); + + /* distinguish 'timeout' from other 'execution' failures */ if ( rc == FAIL_TIMEOUT ) { @@ -1269,7 +1289,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->host_services_req.name.c_str()); mtcInvApi_update_task ( node_ptr, - MTC_TASK_START_SERVICE_TO ); + MTC_TASK_MAIN_SERVICE_TO ); } else { @@ -1279,9 +1299,14 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) rc); mtcInvApi_update_task ( node_ptr, - MTC_TASK_START_SERVICE_FAIL ); + MTC_TASK_MAIN_SERVICE_FAIL ); } - enableStageChange ( node_ptr, MTC_ENABLE__FAILURE ); + + /* handle auto recovery for this failure */ + if ( ar_manage ( node_ptr, + MTC_AR_DISABLE_CAUSE__HOST_SERVICES, + MTC_TASK_AR_DISABLED_SERVICES ) != PASS ) + break ; } else /* success path */ { @@ -1321,8 +1346,6 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) mtcTimer_reset ( node_ptr->mtcTimer ); } - /* Start Monitoring Services - heartbeat, process and hardware */ - send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST ); if ( this->hbs_failure_action == HBS_FAILURE_ACTION__NONE ) { @@ -1338,11 +1361,13 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) MTC_HEARTBEAT_SOAK_BEFORE_ENABLE, node_ptr->hbsClient_ready ? 
" ready event" : "out ready event" ); - /* allow heartbeat to run for MTC_HEARTBEAT_SOAK_BEFORE_ENABLE * seconds before we declare enable */ mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE ); enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_SOAK ); + + /* Start Monitoring Services - heartbeat, process and hardware */ + send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST ); } break ; } @@ -1351,6 +1376,11 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) if ( node_ptr->mtcTimer.ring == true ) { plog ("%s heartbeating\n", node_ptr->hostname.c_str() ); + + /* handle auto recovery ear for thsi potential cause */ + node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__NONE ; + node_ptr->ar_count[MTC_AR_DISABLE_CAUSE__HEARTBEAT] = 0 ; + /* if heartbeat is not working then we will * never get here and enable the host */ enableStageChange ( node_ptr, MTC_ENABLE__STATE_CHANGE ); @@ -1490,7 +1520,6 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) } else { - node_ptr->enabled_count++ ; /* Inform the VIM that this host is enabled */ @@ -1505,6 +1534,8 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); node_ptr->health_threshold_counter = 0 ; + + ar_enable ( node_ptr ); } break ; @@ -2103,7 +2134,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) { elog ("%s got GOENABLED Failed\n", node_ptr->hostname.c_str()); mtcTimer_reset ( node_ptr->mtcTimer ); - mtcInvApi_update_task ( node_ptr, MTC_TASK_INTEST_FAIL ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_INTEST_FAIL ); this->force_full_enable ( node_ptr ); } @@ -2121,6 +2152,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) else if ( node_ptr->mtcTimer.ring == true ) { elog ("%s has GOENABLED Timeout\n", node_ptr->hostname.c_str()); + mtcInvApi_update_task ( node_ptr, 
MTC_TASK_MAIN_INTEST_TO ); node_ptr->mtcTimer.ring = false ; @@ -2640,7 +2672,7 @@ int nodeLinkClass::disable_handler ( struct nodeLinkClass::node * node_ptr ) { int rc = PASS ; - switch ( (int)node_ptr->handlerStage.disable ) + switch ( (int)node_ptr->disableStage ) { case MTC_DISABLE__START: { @@ -2657,6 +2689,7 @@ int nodeLinkClass::disable_handler ( struct nodeLinkClass::node * node_ptr ) clear_subf_failed_bools ( node_ptr ); clear_hostservices_ctls ( node_ptr ); + enableStageChange ( node_ptr, MTC_ENABLE__START ) ; disableStageChange ( node_ptr, MTC_DISABLE__DIS_SERVICES_WAIT) ; stop_offline_handler ( node_ptr ); @@ -2758,7 +2791,7 @@ int nodeLinkClass::disable_handler ( struct nodeLinkClass::node * node_ptr ) /* If the stage is still MTC_DISABLE__DIS_SERVICES_WAIT then the * host should already be powered on so lets send the stop * services command */ - if ( node_ptr->handlerStage.disable == MTC_DISABLE__DIS_SERVICES_WAIT ) + if ( node_ptr->disableStage == MTC_DISABLE__DIS_SERVICES_WAIT ) { bool start = false ; if ( this->launch_host_services_cmd ( node_ptr, start ) != PASS ) @@ -3042,6 +3075,9 @@ int nodeLinkClass::disable_handler ( struct nodeLinkClass::node * node_ptr ) recovery_ctrl_init ( node_ptr->hwmon_reset ); recovery_ctrl_init ( node_ptr->hwmon_powercycle ); + /* re-enable auto recovery */ + ar_enable ( node_ptr ); + /* Load configured mtcAlive and goEnabled timers */ LOAD_NODETYPE_TIMERS ; @@ -3055,7 +3091,7 @@ int nodeLinkClass::disable_handler ( struct nodeLinkClass::node * node_ptr ) default: { elog ("%s Bad Case (%d)\n", node_ptr->hostname.c_str(), - node_ptr->handlerStage.disable ); + node_ptr->disableStage ); rc = FAIL_BAD_CASE ; } } @@ -3273,14 +3309,20 @@ int nodeLinkClass::online_handler ( struct nodeLinkClass::node * node_ptr ) int rc = PASS ; /* don't need to manage the offline or online state - * for the following availability states */ - if (( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) || + * for the following states + * 
... auto recovery state + * ... enable stages + * ... availability states */ + if (( node_ptr->ar_disabled == true ) || + ( node_ptr->enableStage == MTC_ENABLE__FAILURE ) || + ( node_ptr->enableStage == MTC_ENABLE__FAILURE_WAIT ) || + ( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) || ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED ) || ( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFDUTY ) || ( node_ptr->availStatus == MTC_AVAIL_STATUS__INTEST ) || ( node_ptr->availStatus == MTC_AVAIL_STATUS__NOT_INSTALLED )) { - return (PASS); + return (rc); } switch ( (int)node_ptr->onlineStage ) @@ -4648,6 +4690,8 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr ) recovery_ctrl_init ( node_ptr->hwmon_reset ); recovery_ctrl_init ( node_ptr->hwmon_powercycle ); + ar_enable ( node_ptr ); + mtcInvApi_force_task ( node_ptr, "" ); break ; } @@ -5537,25 +5581,69 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) case MTC_ADD__CLEAR_TASK: { - if ( is_controller(node_ptr) ) + /* Check for hosts that were in the auto recovery disabled state */ + if ( !node_ptr->task.empty () ) { - if ( node_ptr->mtcTimer.ring == true ) + if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && + (( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_CONFIG)) || + ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_GOENABLE))|| + ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_SERVICES))|| + ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_HEARTBEAT)))) { - if ( !node_ptr->task.empty () ) + if ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_CONFIG )) { - mtcInvApi_force_task ( node_ptr, "" ); + node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__CONFIG ; + alarm_config_failure ( node_ptr ); } + else if ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_GOENABLE )) + { + node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__GOENABLE ; + alarm_enabled_failure ( node_ptr, true ); + } + else if ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_SERVICES )) + { + node_ptr->ar_cause = 
MTC_AR_DISABLE_CAUSE__HOST_SERVICES ; + alarm_enabled_failure ( node_ptr, true ); + } + else if ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_HEARTBEAT )) + { + node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__HEARTBEAT ; + } + node_ptr->ar_disabled = true ; + + if ( THIS_HOST ) + mtcInvApi_update_states ( node_ptr, "unlocked", "enabled", "degraded" ); + else + mtcInvApi_update_states ( node_ptr, "unlocked", "disabled", "failed" ); + + node_ptr->addStage = MTC_ADD__START; + node_ptr->add_completed = true ; + + adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); + plog ("%s Host Add Completed ; auto recovery disabled state (uptime:%d)\n", + node_ptr->hostname.c_str(), node_ptr->uptime ); + break ; } else { - break ; + if ( is_controller(node_ptr) ) + { + if ( node_ptr->mtcTimer.ring == true ) + { + mtcInvApi_force_task ( node_ptr, "" ); + } + else + { + break ; + } + } + else + { + /* do it immediately for all other server types */ + mtcInvApi_force_task ( node_ptr, "" ); + } } } - else - { - /* do it immediately for all otyher server types */ - mtcInvApi_force_task ( node_ptr, "" ); - } /* default retries counter to zero before START_SERVICES */ node_ptr->retries = 0 ; node_ptr->addStage = MTC_ADD__START_SERVICES ; @@ -6017,55 +6105,6 @@ int nodeLinkClass::oos_test_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->start_services_running_main); } } -#endif - - - /* Avoid forcing the states to the database when on the first & second pass. - * This is because it is likely we just read all the states and - * if coming out of a DOR or a SWACT we don't need to un-necessarily - * produce that extra sysinv traffic.
- * Also, no point forcing the states while there is an admin action - * or enable or graceful recovery going on as well because state changes - * are being done in the FSM already */ - if (( node_ptr->oos_test_count > 1 ) && - ( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE ) && - ( !node_ptr->handlerStage.raw ) && - ( !node_ptr->recoveryStage )) - { - /* Change the oper and avail states in the database */ - allStateChange ( node_ptr, node_ptr->adminState, - node_ptr->operState, - node_ptr->availStatus ); - } - -#ifdef WANT_CLEAR_ALARM_AUDIT - - /* TODO: Obsolete with new Alarm Strategy */ - /* Self Correct Stuck Failure Alarms */ - if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && - ( node_ptr->operState == MTC_OPER_STATE__ENABLED ) && - (( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) || - ( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED ))) - { - if ( node_ptr->alarms[MTC_ALARM_ID__CONFIG] != FM_ALARM_SEVERITY_CLEAR ) - { - mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__CONFIG ); - node_ptr->alarms[MTC_ALARM_ID__CONFIG] = FM_ALARM_SEVERITY_CLEAR ; - } - alarm_enabled_clear ( node_ptr , false); - } -#endif - /* Make sure the locked status on the host itself is set */ - if (( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ) && - ( node_ptr->operState == MTC_OPER_STATE__DISABLED ) && - ( node_ptr->availStatus == MTC_AVAIL_STATUS__ONLINE ) && - ( !(node_ptr->mtce_flags & MTC_FLAG__I_AM_LOCKED) )) - { - ilog ("%s setting 'locked' status\n", node_ptr->hostname.c_str()); - - /* Tell the host that it is locked */ - send_mtc_cmd ( node_ptr->hostname , MTC_MSG_LOCKED, MGMNT_INTERFACE); - } if (( daemon_is_file_present ( MTC_CMD_FIT__GOENABLE_AUDIT )) && ( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && @@ -6078,6 +6117,43 @@ int nodeLinkClass::oos_test_handler ( struct nodeLinkClass::node * node_ptr ) send_mtc_cmd ( node_ptr->hostname, MTC_REQ_SUBF_GOENABLED, MGMNT_INTERFACE ); } } +#endif + + if ( node_ptr->ar_disabled == true ) + 
{ + elog ( "%s auto recovery disabled cause:%d", + node_ptr->hostname.c_str(), + node_ptr->ar_cause ); + } + + /* Avoid forcing the states to the database when on the first & second pass. + * This is because it is likely we just read all the states and + * if coming out of a DOR or a SWACT we don't need to un-necessarily + * produce that extra sysinv traffic. + * Also, no point forcing the states while there is an admin action + * or enable or graceful recovery going on as well because state changes + * are being done in the FSM already */ + if (( node_ptr->oos_test_count > 1 ) && + ( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE ) && + ( !node_ptr->enableStage ) && ( !node_ptr->recoveryStage )) + { + /* Change the oper and avail states in the database */ + allStateChange ( node_ptr, node_ptr->adminState, + node_ptr->operState, + node_ptr->availStatus ); + } + + /* Make sure the locked status on the host itself is set */ + if (( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ) && + ( node_ptr->operState == MTC_OPER_STATE__DISABLED ) && + ( node_ptr->availStatus == MTC_AVAIL_STATUS__ONLINE ) && + ( !(node_ptr->mtce_flags & MTC_FLAG__I_AM_LOCKED) )) + { + ilog ("%s setting 'locked' status\n", node_ptr->hostname.c_str()); + + /* Tell the host that it is locked */ + send_mtc_cmd ( node_ptr->hostname , MTC_MSG_LOCKED, MGMNT_INTERFACE); + } break ; } @@ -6140,7 +6216,8 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) } /* manage degrade state and alarms */ if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && - ( node_ptr->operState == MTC_OPER_STATE__ENABLED )) + ( node_ptr->operState == MTC_OPER_STATE__ENABLED ) && + ( node_ptr->ar_disabled == false )) { /************************************************************ * Manage In-Service Alarms * @@ -6170,7 +6247,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) } case MTC_INSV_TEST__RUN: { - #ifdef WANT_FIT_TESTING daemon_load_fit (); @@ 
-6229,22 +6305,20 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) * controller autorecovery. Otherwise enable it but in this case * don't change the disable bool as that is used to gate auto * recovery once the threshoild is reached */ - if ( is_controller ( node_ptr ) && NOT_THIS_HOST ) - { - if (( this->autorecovery_enabled == true ) && - ( node_ptr->operState == MTC_OPER_STATE__ENABLED )) - { - autorecovery_clear ( CONTROLLER_0 ); - autorecovery_clear ( CONTROLLER_1 ); - this->autorecovery_enabled = false ; - this->autorecovery_disabled = false ; - } - else if (( this->autorecovery_enabled == false ) && - ( node_ptr->operState != MTC_OPER_STATE__ENABLED )) - { - this->autorecovery_enabled = true ; - } - } +// if ( is_controller ( node_ptr ) && NOT_THIS_HOST ) +// { +// if (( node_ptr->ar_disabled == false ) && +// ( node_ptr->operState == MTC_OPER_STATE__ENABLED )) +// { +// autorecovery_clear ( CONTROLLER_0 ); +// autorecovery_clear ( CONTROLLER_1 ); +// } + //else if (( node_ptr->ar_disabled == true ) && + // ( node_ptr->operState != MTC_OPER_STATE__ENABLED )) + //{ + // node_ptr->ar_disabled = false ; + //} + // } /* Monitor the health of the host - no pass file */ if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && @@ -6440,7 +6514,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) * **/ if (( node_ptr->operState_subf == MTC_OPER_STATE__DISABLED ) && - ( this->autorecovery_disabled == false ) && + ( node_ptr->ar_disabled == false ) && ( node_ptr->start_services_needed == false )) { if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE_SUBF ) && @@ -6527,7 +6601,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) if (( daemon_is_file_present ( CONFIG_COMPLETE_FILE )) && ( daemon_is_file_present ( CONFIG_FAIL_FILE ))) { - wlog_throttled ( node_ptr->health_threshold_counter, (MTC_UNHEALTHY_THRESHOLD*3), "%s is UNHEALTHY\n", node_ptr->hostname.c_str()); 
+ wlog_throttled ( node_ptr->health_threshold_counter, (MTC_UNHEALTHY_THRESHOLD*10), "%s is UNHEALTHY\n", node_ptr->hostname.c_str()); if ( node_ptr->health_threshold_counter >= MTC_UNHEALTHY_THRESHOLD ) { node_ptr->degrade_mask |= DEGRADE_MASK_CONFIG ; diff --git a/mtce/src/maintenance/mtcSubfHdlrs.cpp b/mtce/src/maintenance/mtcSubfHdlrs.cpp index 20d42793..9a25c701 100644 --- a/mtce/src/maintenance/mtcSubfHdlrs.cpp +++ b/mtce/src/maintenance/mtcSubfHdlrs.cpp @@ -43,20 +43,17 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) { int rc = PASS ; + if ( node_ptr->ar_disabled == true ) + { + enableStageChange ( node_ptr, MTC_ENABLE__START ); + return (rc); + } + /* Setup the log prefix */ string name = node_ptr->hostname ; name.append("-compute"); - bool simplex = false ; - if (( SIMPLEX ) || - (( THIS_HOST ) && - (( this->is_inactive_controller_main_insv() == false ) || - ( this->is_inactive_controller_subf_insv() == false )))) - { - simplex = true ; - } - - switch ( (int)node_ptr->handlerStage.enable ) + switch ( (int)node_ptr->enableStage ) { case MTC_ENABLE__FAILURE_WAIT: { @@ -77,29 +74,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) workQueue_purge ( node_ptr ); doneQueue_purge ( node_ptr ); - enableStageChange ( node_ptr, MTC_ENABLE__START ); - - /* avoid failing this controller if there is no inactive to - * take over and avoid thrashing back and forth if the sub - * function on the inactive is disabled */ - if ( simplex ) - { - /* if autorecovery is enabled then handle it that way. 
*/ - if ( this->autorecovery_enabled == true ) - { - adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); - enableStageChange ( node_ptr, MTC_ENABLE__START ); - - manage_autorecovery ( node_ptr ); - } - - wlog ("%s is ENABLED-degraded (failed subfunction)\n", name.c_str()); - } - else - { - /* if there is another controller enabled then just force a full enable of this one */ - force_full_enable ( node_ptr ) ; - } + force_full_enable ( node_ptr ) ; break ; } @@ -139,22 +114,26 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) enableStageChange ( node_ptr, MTC_ENABLE__GOENABLED_TIMER ); alarm_config_clear ( node_ptr ); } - else if ( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY ) + + if ((( !(node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED) )) || + (( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY ))) { mtcTimer_reset (node_ptr->mtcTimer); - elog ("%s configuration failed (oob:%x:%x)\n", - name.c_str(), - node_ptr->mtce_flags, - MTC_FLAG__I_AM_NOT_HEALTHY); + + elog ("%s configuration failed or incomplete (oob:%x)\n", + name.c_str(), node_ptr->mtce_flags); alarm_config_failure ( node_ptr ); - if ( simplex ) - mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_CONFIG_FAIL_ ); - else - mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_CONFIG_FAIL ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_CONFIG_FAIL ); enableStageChange ( node_ptr, MTC_ENABLE__SUBF_FAILED ); + + /* handle auto recovery for this failure */ + if ( ar_manage ( node_ptr, + MTC_AR_DISABLE_CAUSE__CONFIG, + MTC_TASK_AR_DISABLED_CONFIG ) != PASS ) + break ; } /* timeout handling */ @@ -166,12 +145,15 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) alarm_config_failure ( node_ptr ); - if ( simplex ) - mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_CONFIG_TO_ ); - else - mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_CONFIG_TO ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_CONFIG_TO ); enableStageChange ( node_ptr,
MTC_ENABLE__SUBF_FAILED ); + + /* handle auto recovery for this failure */ + if ( ar_manage ( node_ptr, + MTC_AR_DISABLE_CAUSE__CONFIG, + MTC_TASK_AR_DISABLED_CONFIG ) != PASS ) + break ; } else { @@ -227,19 +209,19 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) } case MTC_ENABLE__GOENABLED_WAIT: { + bool goenable_failed = false ; + /* search for the Go Enable message */ - if ( node_ptr->goEnabled_failed_subf == true ) + if (( node_ptr->health == NODE_UNHEALTHY ) || + ( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY) || + ( node_ptr->goEnabled_failed_subf == true )) { mtcTimer_reset ( node_ptr->mtcTimer ); elog ("%s one or more out-of-service tests failed\n", name.c_str()); - mtcInvApi_update_task ( node_ptr, simplex ? MTC_TASK_INTEST_FAIL_ : MTC_TASK_INTEST_FAIL ); - - /* Need thresholded auto recovery for this failure mode */ - if ( this->system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX ) - this->autorecovery_enabled = true ; - + mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_INTEST_FAIL ); enableStageChange ( node_ptr, MTC_ENABLE__SUBF_FAILED ); + goenable_failed = true ; } /* search for the Go Enable message */ @@ -248,6 +230,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) mtcTimer_reset ( node_ptr->mtcTimer ); alarm_enabled_clear ( node_ptr, false ); + alarm_compute_clear ( node_ptr, true ); plog ("%s passed out-of-service tests\n", name.c_str()); @@ -275,18 +258,25 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) { elog ("%s out-of-service test execution timeout\n", name.c_str()); - mtcInvApi_update_task ( node_ptr, simplex ? 
MTC_TASK_INTEST_FAIL_TO_ : MTC_TASK_INTEST_FAIL_TO ); - - /* Need thresholded auto recovery for this failure mode */ - if ( this->system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX ) - this->autorecovery_enabled = true ; - + mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_INTEST_TO ); enableStageChange ( node_ptr, MTC_ENABLE__SUBF_FAILED ); + goenable_failed = true ; } else { ; /* wait some more */ } + + if ( goenable_failed == true ) + { + alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_CRITICAL ); + + /* handle auto recovery for this failure */ + if ( ar_manage ( node_ptr, + MTC_AR_DISABLE_CAUSE__GOENABLE, + MTC_TASK_AR_DISABLED_GOENABLE ) != PASS ) + break ; + } break ; } case MTC_ENABLE__HOST_SERVICES_START: @@ -312,17 +302,20 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) else if ( launch_host_services_cmd ( node_ptr, start, subf ) != PASS ) { - node_ptr->hostservices_failed_subf = true ; - wlog ("%s %s failed ; launch\n", name.c_str(), node_ptr->host_services_req.name.c_str()); - /* Need thresholded auto recovery for this failure mode */ - if ( this->system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX ) - this->autorecovery_enabled = true ; - + node_ptr->hostservices_failed_subf = true ; + alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_CRITICAL ); enableStageChange ( node_ptr, MTC_ENABLE__SUBF_FAILED ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_SERVICE_FAIL ); + + /* handle auto recovery for this failure */ + if ( ar_manage ( node_ptr, + MTC_AR_DISABLE_CAUSE__HOST_SERVICES, + MTC_TASK_AR_DISABLED_SERVICES ) != PASS ) + break ; } else { @@ -343,11 +336,12 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) } else if ( rc != PASS ) { - /* Need thresholded auto recovery for this failure mode */ - if ( this->system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX ) - this->autorecovery_enabled = true ; - node_ptr->hostservices_failed_subf = true ; + alarm_compute_failure ( node_ptr, 
FM_ALARM_SEVERITY_CRITICAL ); + + enableStageChange ( node_ptr, MTC_ENABLE__SUBF_FAILED ); + + if ( rc == FAIL_TIMEOUT ) { elog ("%s %s failed ; timeout\n", @@ -355,7 +349,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->host_services_req.name.c_str()); /* Report "Enabling Compute Service Timeout" to sysinv/horizon */ - mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLING_SUBF_TO ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_SERVICE_TO ); } else { @@ -365,9 +359,14 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) rc); /* Report "Enabling Compute Service Failed" to sysinv/horizon */ - mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLING_SUBF_FAIL ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_SERVICE_FAIL ); } - enableStageChange ( node_ptr, MTC_ENABLE__SUBF_FAILED ); + + /* handle auto recovery for this failure */ + if ( ar_manage ( node_ptr, + MTC_AR_DISABLE_CAUSE__HOST_SERVICES, + MTC_TASK_AR_DISABLED_SERVICES ) != PASS ) + break ; } else /* success path */ { @@ -409,8 +408,6 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) mtcTimer_reset ( node_ptr->mtcTimer ); } - /* Start Monitoring heartbeat */ - send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST ); if ( this->hbs_failure_action == HBS_FAILURE_ACTION__NONE ) { @@ -426,6 +423,9 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) /* allow heartbeat to run for 10 seconds before we declare enable */ mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE ); enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_SOAK ); + + /* Start Monitoring heartbeat */ + send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST ); } break ; } @@ -434,6 +434,11 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) if ( node_ptr->mtcTimer.ring == true ) { plog ("%s heartbeating\n", name.c_str() ); + + /* 
handle auto recovery clear for this potential cause */ + node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__NONE ; + node_ptr->ar_count[MTC_AR_DISABLE_CAUSE__HEARTBEAT] = 0 ; + /* if heartbeat is not working then we will * never get here and enable the host */ enableStageChange ( node_ptr, MTC_ENABLE__STATE_CHANGE ); @@ -472,7 +477,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) { elog ("%s enable failed ; Enable workQueue timeout, purging ...\n", name.c_str()); - mtcInvApi_update_task ( node_ptr, simplex ? MTC_TASK_ENABLE_WORK_TO_ : MTC_TASK_ENABLE_WORK_TO ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLE_WORK_TO ); fail = true ; } @@ -480,7 +485,7 @@ { elog ("%s enable failed ; Enable doneQueue has failed commands\n", name.c_str()); - mtcInvApi_update_task ( node_ptr, simplex ? MTC_TASK_ENABLE_WORK_FAIL_ : MTC_TASK_ENABLE_WORK_FAIL ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLE_WORK_FAIL ); fail = true ; } @@ -656,6 +661,8 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->dor_recovery_mode = false ; this->dor_mode_active = false ; + ar_enable ( node_ptr ); + mtcInvApi_force_task ( node_ptr, "" ); break ; } diff --git a/mtce/src/scripts/mtc.conf b/mtce/src/scripts/mtc.conf index 8c0730d9..898e8a3b 100644 --- a/mtce/src/scripts/mtc.conf +++ b/mtce/src/scripts/mtc.conf @@ -34,6 +34,35 @@ autorecovery_threshold = 3 ; The number of times maintenance will try to ; while there is no backup controllers to fail ; over to before giving up. + +; Service specific Auto Recovery failure thresholds. +; +; ar_<service>_threshold = <count> +; +; If a host fails to enable due to a particular service failure, for example +; configuration, goenabled etc., then the mtcAgent will stop retrying after +; the particular service's threshold is reached. While at threshold, auto +; recovery for the specified host is disabled. 
The host sits there in the +; unlocked-disabled-failed state with the WEBGUI and horizon host status +; showing that auto recovery is disabled. A lock/unlock is then required +; to trigger another enable attempt and a reset of the auto recovery counter. +ar_config_threshold = 2 +ar_goenable_threshold = 2 +ar_hostservices_threshold = 2 +ar_heartbeat_threshold = 2 + +; Service specific Auto Recovery retry interval. +; +; ar_<service>_interval = <seconds> +; +; When a host fails to enable due to a particular service reason, then +; the mtcAgent will use the service specific interval value specified +; to wait before it retries the enable sequence again. +ar_config_interval = 30 +ar_goenable_interval = 30 +ar_hostservices_interval = 30 +ar_heartbeat_interval = 600 + api_retries = 10 ; number of API retries b4 failure [client] ; Client Configuration