From 5f232f64867ee8468476b198cf5aaa759e36b6cd Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Tue, 4 Sep 2018 20:54:19 -0400 Subject: [PATCH] Mtce: Make Heartbeat Failure Action Configurable The current maintenance heartbeat failure action handling is to Fail and Gracefully Recover the host. This means that maintenance will ensure that a heartbeat failed host is rebooted/reset before it is recovered but will avoid rebooting it a second time if its recovered uptime indicates that it has already rebooted. This update expands that single action handling behavior to support three new actions. In doing so it adds a new configuration service parameter called heartbeat_failure_action. The customer can configure this new parameter with any one of the following 4 actions in order of decreasing impact. fail - Host is failed and gracefully recovered. - Current Network specific alarms continue to be raised/cleared. Note: Prior to this update this was standard system behavior. degrade - Host is only degraded while it is failing heartbeat. - Current Network specific alarms continue to be raised/cleared. - heartbeat degrade reason is cleared as are the alarms when heartbeat responses resume. alarm - The only indication of a heartbeat failure is by alarm. - Same set of alarms as in above action cases - Only in this case no degrade, no failure, no reboot/reset none - Heartbeat is disabled; no multicast heartbeat message is sent. - All existing heartbeat alarms are cleared. - The heartbeat soak as part of the enable sequence is bypassed. The selected action is a system wide setting. The selected setting also applies to Multi-Node Failure Avoidance. The default action is the legacy action Fail. This update also 1. Removes redundant inservice failure alarm for MNFA case in support of degrade only action. Keeping it would make that alarm handling case unnecessarily complicated. 2. No longer used 'hbs calibration' code is removed (cleanup). 3. 
Small amount of heartbeat logging cleanup. Test Plan: PASS: fail: Verify MNFA and recovery PASS: fail: Verify Single Host heartbeat failure and recovery PASS: fail: Verify Single Host heartbeat failure and recovery (from none) PASS: degrade: Verify MNFA and recovery PASS: degrade: Verify Single Host heartbeat failure and recovery PASS: degrade: Verify Single Host heartbeat failure and recovery (from alarm) PASS: alarm: Verify MNFA and recovery PASS: alarm: Verify Single Host heartbeat failure and recovery PASS: alarm: Verify Single Host heartbeat failure and recovery (from degrade) PASS: none: Verify heartbeat disable, fail ignore and no recovery PASS: none: Verify Single Host heartbeat ignore and no recovery PASS: none: Verify Single Host heartbeat ignore and no recovery (from fail) PASS: Verify action change behavior from none to alarm with active MNFA PASS: Verify action change behavior from alarm to degrade with active MNFA PASS: Verify action change behavior from degrade to none with active MNFA PASS: Verify action change behavior from none to fail with active MNFA PASS: Verify action change behavior from fail to none with active MNFA PASS: Verify action change behavior from degrade to fail then MNFA timeout PASS: Verify all heartbeat action change customer logs PASS: Verify heartbeat stats clear over action change PASS: Verify LO DOR (several large labs - compute and storage systems) PASS: Verify recovery from failure of active controller PASS: Verify 3 host failure behavior with MNFA threshold at 3 (action:fail) PASS: Verify 2 host failure behavior with MNFA threshold at 3 (action:fail) Change-Id: I198505fb7a923cc760b12082acff1e5bac929ef2 Signed-off-by: Eric MacDonald --- puppet-manifests/centos/build_srpm.data | 2 +- .../src/hieradata/controller.yaml | 1 + .../src/modules/platform/manifests/mtce.pp | 1 + .../puppet-mtce/centos/build_srpm.data | 2 +- .../src/mtce/templates/mtc_ini.erb | 13 +++++++++ sysinv/sysinv/centos/build_srpm.data | 2 +- 
.../sysinv/sysinv/sysinv/common/constants.py | 2 ++ .../sysinv/sysinv/common/service_parameter.py | 29 +++++++++++++++++++ .../sysinv/sysinv/sysinv/conductor/manager.py | 5 ++++ 9 files changed, 54 insertions(+), 3 deletions(-) diff --git a/puppet-manifests/centos/build_srpm.data b/puppet-manifests/centos/build_srpm.data index 65999a7567..44049d7bb8 100644 --- a/puppet-manifests/centos/build_srpm.data +++ b/puppet-manifests/centos/build_srpm.data @@ -1,2 +1,2 @@ SRC_DIR="src" -TIS_PATCH_VER=62 +TIS_PATCH_VER=63 diff --git a/puppet-manifests/src/hieradata/controller.yaml b/puppet-manifests/src/hieradata/controller.yaml index fc1c7d3c11..52c7a0adac 100644 --- a/puppet-manifests/src/hieradata/controller.yaml +++ b/puppet-manifests/src/hieradata/controller.yaml @@ -49,6 +49,7 @@ CONFIG_ADMIN_PROJECT_DOMAIN_NAME: Default platform::mtce::agent::params::compute_boot_timeout: 720 platform::mtce::agent::params::controller_boot_timeout: 1200 platform::mtce::agent::params::heartbeat_period: 100 +platform::mtce::agent::params::heartbeat_failure_action: 'fail' platform::mtce::agent::params::heartbeat_failure_threshold: 10 platform::mtce::agent::params::heartbeat_degrade_threshold: 6 platform::mtce::agent::params::mnfa_threshold: 2 diff --git a/puppet-manifests/src/modules/platform/manifests/mtce.pp b/puppet-manifests/src/modules/platform/manifests/mtce.pp index a9e0f1f276..b146abf56a 100644 --- a/puppet-manifests/src/modules/platform/manifests/mtce.pp +++ b/puppet-manifests/src/modules/platform/manifests/mtce.pp @@ -12,6 +12,7 @@ class platform::mtce::params ( $controller_boot_timeout = undef, $heartbeat_degrade_threshold = undef, $heartbeat_failure_threshold = undef, + $heartbeat_failure_action = undef, $heartbeat_period = undef, $mtce_multicast = undef, $mnfa_threshold = undef, diff --git a/puppet-modules-wrs/puppet-mtce/centos/build_srpm.data b/puppet-modules-wrs/puppet-mtce/centos/build_srpm.data index 38be379ea2..f0a35f7823 100644 --- 
a/puppet-modules-wrs/puppet-mtce/centos/build_srpm.data +++ b/puppet-modules-wrs/puppet-mtce/centos/build_srpm.data @@ -1,3 +1,3 @@ SRC_DIR="src" COPY_LIST="$SRC_DIR/LICENSE" -TIS_PATCH_VER=7 +TIS_PATCH_VER=8 diff --git a/puppet-modules-wrs/puppet-mtce/src/mtce/templates/mtc_ini.erb b/puppet-modules-wrs/puppet-mtce/src/mtce/templates/mtc_ini.erb index bac8185700..54fd506444 100644 --- a/puppet-modules-wrs/puppet-mtce/src/mtce/templates/mtc_ini.erb +++ b/puppet-modules-wrs/puppet-mtce/src/mtce/templates/mtc_ini.erb @@ -16,6 +16,19 @@ heartbeat_period = <%= @heartbeat_period %> ; Heartbeat period in milliseconds heartbeat_failure_threshold = <%= @heartbeat_failure_threshold %> ; Heartbeat failure threshold count. heartbeat_degrade_threshold = <%= @heartbeat_degrade_threshold %> ; Heartbeat degrade threshold count. +; Heartbeat Loss / Failure Action Selection. +; The action to take on host heartbeat failure. +; Supported actions are +; fail = fail host and raise network specific heartbeat alarms +; degrade = degrade host and raise network specific heartbeat alarms +; alarm = raise network specific heartbeat alarms only +; none = no action and no alarms +; Selected action applies to all hosts in the system +; Default is fail +; To modify execute: +; system service-parameter-modify platform maintenance heartbeat_failure_action= +heartbeat_failure_action = <%= @heartbeat_failure_action %> + ; Multi-Node Failure Avoidance (MNFA) Activation and Deactivation threshold. ; The minimum number of hosts that fail heartbeat within the ; heartbeat_failure_threshold upon which Maintenance activates MNFA Mode. 
diff --git a/sysinv/sysinv/centos/build_srpm.data b/sysinv/sysinv/centos/build_srpm.data index a7d006a129..e6a2d4a1d7 100644 --- a/sysinv/sysinv/centos/build_srpm.data +++ b/sysinv/sysinv/centos/build_srpm.data @@ -1,2 +1,2 @@ SRC_DIR="sysinv" -TIS_PATCH_VER=280 +TIS_PATCH_VER=281 diff --git a/sysinv/sysinv/sysinv/sysinv/common/constants.py b/sysinv/sysinv/sysinv/sysinv/common/constants.py index e3d49b7a4f..692b8b9bb5 100644 --- a/sysinv/sysinv/sysinv/sysinv/common/constants.py +++ b/sysinv/sysinv/sysinv/sysinv/common/constants.py @@ -942,6 +942,7 @@ SERVICE_PARAM_NAME_SYSINV_FIREWALL_RULES_ID = 'firewall_rules_id' SERVICE_PARAM_PLAT_MTCE_COMPUTE_BOOT_TIMEOUT = 'compute_boot_timeout' SERVICE_PARAM_PLAT_MTCE_CONTROLLER_BOOT_TIMEOUT = 'controller_boot_timeout' SERVICE_PARAM_PLAT_MTCE_HBS_PERIOD = 'heartbeat_period' +SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_ACTION = 'heartbeat_failure_action' SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_THRESHOLD = 'heartbeat_failure_threshold' SERVICE_PARAM_PLAT_MTCE_HBS_DEGRADE_THRESHOLD = 'heartbeat_degrade_threshold' SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD = 'mnfa_threshold' @@ -950,6 +951,7 @@ SERVICE_PARAM_PLAT_MTCE_MNFA_TIMEOUT = 'mnfa_timeout' SERVICE_PARAM_PLAT_MTCE_COMPUTE_BOOT_TIMEOUT_DEFAULT = 720 SERVICE_PARAM_PLAT_MTCE_CONTROLLER_BOOT_TIMEOUT_DEFAULT = 1200 SERVICE_PARAM_PLAT_MTCE_HBS_PERIOD_DEFAULT = 100 +SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_ACTION_DEFAULT = 'fail' SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_THRESHOLD_DEFAULT = 10 SERVICE_PARAM_PLAT_MTCE_HBS_DEGRADE_THRESHOLD_DEFAULT = 6 SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD_DEFAULT = 2 diff --git a/sysinv/sysinv/sysinv/sysinv/common/service_parameter.py b/sysinv/sysinv/sysinv/sysinv/common/service_parameter.py index 1c7968df35..153a5da3f4 100644 --- a/sysinv/sysinv/sysinv/sysinv/common/service_parameter.py +++ b/sysinv/sysinv/sysinv/sysinv/common/service_parameter.py @@ -545,6 +545,27 @@ def _validate_hbs_period(name, value): SERVICE_PARAM_PLAT_MTCE_HBS_PERIOD_MAX) +def 
_validate_hbs_failure_action(name, value): + error = False + try: + if str(value) != SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_ACTION_FAIL and \ + str(value) != SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_ACTION_DEGRADE and \ + str(value) != SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_ACTION_ALARM and \ + str(value) != SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_ACTION_NONE: + error = True + + except ValueError: + error = True + + if error is True: + raise wsme.exc.ClientSideError(_( + "Action must be one of '%s', '%s', '%s' or '%s'" % + (SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_ACTION_FAIL, + SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_ACTION_DEGRADE, + SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_ACTION_ALARM, + SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_ACTION_NONE))) + + def _validate_hbs_failure_threshold(name, value): _validate_range(name, value, SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_THRESHOLD_MIN, @@ -1331,6 +1352,7 @@ PLATFORM_MTCE_PARAMETER_MANDATORY = [ constants.SERVICE_PARAM_PLAT_MTCE_COMPUTE_BOOT_TIMEOUT, constants.SERVICE_PARAM_PLAT_MTCE_CONTROLLER_BOOT_TIMEOUT, constants.SERVICE_PARAM_PLAT_MTCE_HBS_PERIOD, + constants.SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_ACTION, constants.SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_THRESHOLD, constants.SERVICE_PARAM_PLAT_MTCE_HBS_DEGRADE_THRESHOLD, constants.SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD, @@ -1349,6 +1371,10 @@ SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_THRESHOLD_MIN = 10 SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_THRESHOLD_MAX = 100 SERVICE_PARAM_PLAT_MTCE_HBS_DEGRADE_THRESHOLD_MIN = 4 SERVICE_PARAM_PLAT_MTCE_HBS_DEGRADE_THRESHOLD_MAX = 100 +SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_ACTION_FAIL = 'fail' +SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_ACTION_DEGRADE = 'degrade' +SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_ACTION_ALARM = 'alarm' +SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_ACTION_NONE = 'none' SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD_MIN = 2 SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD_MAX = 100 SERVICE_PARAM_PLAT_MTCE_MNFA_TIMEOUT_MIN = 100 @@ -1361,6 +1387,8 @@ PLATFORM_MTCE_PARAMETER_VALIDATOR = { 
_validate_controller_boot_timeout, constants.SERVICE_PARAM_PLAT_MTCE_HBS_PERIOD: _validate_hbs_period, + constants.SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_ACTION: + _validate_hbs_failure_action, constants.SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_THRESHOLD: _validate_hbs_failure_threshold, constants.SERVICE_PARAM_PLAT_MTCE_HBS_DEGRADE_THRESHOLD: @@ -1375,6 +1403,7 @@ PLATFORM_MTCE_PARAMETER_RESOURCE = { constants.SERVICE_PARAM_PLAT_MTCE_COMPUTE_BOOT_TIMEOUT: 'platform::mtce::params::compute_boot_timeout', constants.SERVICE_PARAM_PLAT_MTCE_CONTROLLER_BOOT_TIMEOUT: 'platform::mtce::params::controller_boot_timeout', constants.SERVICE_PARAM_PLAT_MTCE_HBS_PERIOD: 'platform::mtce::params::heartbeat_period', + constants.SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_ACTION: 'platform::mtce::params::heartbeat_failure_action', constants.SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_THRESHOLD: 'platform::mtce::params::heartbeat_failure_threshold', constants.SERVICE_PARAM_PLAT_MTCE_HBS_DEGRADE_THRESHOLD: 'platform::mtce::params::heartbeat_degrade_threshold', constants.SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD: 'platform::mtce::params::mnfa_threshold', diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py index adbfacd04e..61eab01206 100644 --- a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py +++ b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py @@ -476,6 +476,11 @@ class ConductorManager(service.PeriodicService): 'name': constants.SERVICE_PARAM_PLAT_MTCE_HBS_PERIOD, 'value': constants.SERVICE_PARAM_PLAT_MTCE_HBS_PERIOD_DEFAULT, }, + {'service': constants.SERVICE_TYPE_PLATFORM, + 'section': constants.SERVICE_PARAM_SECTION_PLATFORM_MAINTENANCE, + 'name': constants.SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_ACTION, + 'value': constants.SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_ACTION_DEFAULT, + }, {'service': constants.SERVICE_TYPE_PLATFORM, 'section': constants.SERVICE_PARAM_SECTION_PLATFORM_MAINTENANCE, 'name': 
constants.SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_THRESHOLD,