From f19dd0498fb7e5af740837b1e2d93e618e53525b Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Wed, 29 Aug 2018 15:12:12 -0400 Subject: [PATCH] Mtce: Make Multi-Node Failure Avoidance Configurable The maintenance system implements a high availability (HA) feature designed to detect the simultaneous heartbeat failure of a group of hosts and avoid failing all those hosts until heartbeat resumes or after a set period of time. This feature is called Multi-Node Failure Avoidance, aka MNFA, and currently has the hosts threshold set to 3 and timeout set to 100 secs. This update implements enhancements to that existing feature by making the 'number-of-hosts threshold' and 'timeout period' customer configurable service parameters. The new service parameters are listed under platform:maintenance and are displayed with the following command > system service-parameter-list mnfa_threshold: This new label and value is added to the puppet managed /etc/mtc.ini and represents the number of hosts that are required to fail heartbeat as a group, within the heartbeat failure window (heartbeat_failure_threshold), after which maintenance activates MNFA Mode. This update changes the default number of failing hosts from 3 to 2 while allowing a configurable range from 2 to 100. mnfa_timeout: This new label and value is added to the puppet managed /etc/mtc.ini. While MNFA mode is active, it will remain active until the number of failing hosts drops below the mnfa_threshold or this timer expires. The MNFA mode deactivates on the first occurrence of either case. Upon deactivation the remaining failed hosts are no longer treated as a failure group but instead are all Gracefully Recovered individually. A value of zero imposes no timeout making the deactivation criteria solely host based. This update changes the default 100 second timer to 0 (no timeout) while permitting a valid timeout range from 100 to 86400 secs (1 day). 
DocImpact Story: 2003576 Task: 24903 Change-Id: I2fb737a4cd3c235845b064449949fcada303d6b2 Signed-off-by: Eric MacDonald --- .../src/hieradata/controller.yaml | 2 + .../src/modules/platform/manifests/mtce.pp | 2 + .../puppet-mtce/centos/build_srpm.data | 2 +- .../src/mtce/templates/mtc_ini.erb | 27 +++++++++++++- .../sysinv/sysinv/sysinv/common/constants.py | 4 ++ .../sysinv/sysinv/common/service_parameter.py | 37 +++++++++++++++++++ .../sysinv/sysinv/sysinv/conductor/manager.py | 10 +++++ 7 files changed, 81 insertions(+), 3 deletions(-) diff --git a/puppet-manifests/src/hieradata/controller.yaml b/puppet-manifests/src/hieradata/controller.yaml index e0460142a1..b13adab7ae 100644 --- a/puppet-manifests/src/hieradata/controller.yaml +++ b/puppet-manifests/src/hieradata/controller.yaml @@ -51,6 +51,8 @@ platform::mtce::agent::params::controller_boot_timeout: 1200 platform::mtce::agent::params::heartbeat_period: 100 platform::mtce::agent::params::heartbeat_failure_threshold: 10 platform::mtce::agent::params::heartbeat_degrade_threshold: 6 +platform::mtce::agent::params::mnfa_threshold: 2 +platform::mtce::agent::params::mnfa_timeout: 0 # influxdb configuration for collectd platform::influxdb::params::bind_address: ':25826' diff --git a/puppet-manifests/src/modules/platform/manifests/mtce.pp b/puppet-manifests/src/modules/platform/manifests/mtce.pp index 39add87b25..a9e0f1f276 100644 --- a/puppet-manifests/src/modules/platform/manifests/mtce.pp +++ b/puppet-manifests/src/modules/platform/manifests/mtce.pp @@ -14,6 +14,8 @@ class platform::mtce::params ( $heartbeat_failure_threshold = undef, $heartbeat_period = undef, $mtce_multicast = undef, + $mnfa_threshold = undef, + $mnfa_timeout = undef, ) { } diff --git a/puppet-modules-wrs/puppet-mtce/centos/build_srpm.data b/puppet-modules-wrs/puppet-mtce/centos/build_srpm.data index b781aa56d3..38be379ea2 100644 --- a/puppet-modules-wrs/puppet-mtce/centos/build_srpm.data +++ 
b/puppet-modules-wrs/puppet-mtce/centos/build_srpm.data @@ -1,3 +1,3 @@ SRC_DIR="src" COPY_LIST="$SRC_DIR/LICENSE" -TIS_PATCH_VER=6 +TIS_PATCH_VER=7 diff --git a/puppet-modules-wrs/puppet-mtce/src/mtce/templates/mtc_ini.erb b/puppet-modules-wrs/puppet-mtce/src/mtce/templates/mtc_ini.erb index 30a781692a..bac8185700 100644 --- a/puppet-modules-wrs/puppet-mtce/src/mtce/templates/mtc_ini.erb +++ b/puppet-modules-wrs/puppet-mtce/src/mtce/templates/mtc_ini.erb @@ -1,4 +1,4 @@ -; Packstack managed Maintenance Configuration +; Puppet Managed Maintenance Configuration [agent] ; Agent Configuration keystone_auth_username = <%= @auth_username %> ; mtce auth username keystone_auth_pw = <%= @auth_pw %> ; mtce auth password @@ -16,8 +16,31 @@ heartbeat_period = <%= @heartbeat_period %> ; Heartbeat period in milliseconds heartbeat_failure_threshold = <%= @heartbeat_failure_threshold %> ; Heartbeat failure threshold count. heartbeat_degrade_threshold = <%= @heartbeat_degrade_threshold %> ; Heartbeat degrade threshold count. +; Multi-Node Failure Avoidance (MNFA) Activation and Deactivation threshold. +; The minimum number of hosts that fail heartbeat within the +; heartbeat_failure_threshold upon which Maintenance activates MNFA Mode. +; Once the number of failing hosts drops below this threshold then maintenance +; deactivates MNFA mode while remaining failing hosts are Gracefully Recovered. +; Default value is 2 +; Minimum value is 2 +; To modify execute: +; system service-parameter-modify platform maintenance mnfa_threshold= +mnfa_threshold = <%= @mnfa_threshold %> + [timeouts] compute_boot_timeout = <%= @compute_boot_timeout %> ; The max time (seconds) that Mtce waits for the mtcAlive controller_boot_timeout = <%= @controller_boot_timeout %> ; message after which it will time out and fail the host. - +; Multi-Node Failure Avoidance (MNFA) Lifecycle Timer. +; MNFA Activation starts a timer with this timeout value. +; See mnfa_threshold above. 
+; Maintenance automatically Deactivates MNFA mode if the number of hosts that +; are failing heartbeat doesn't drop below mnfa_threshold before timer expires. +; Timer is in seconds. +; A zero value means infinite lifecycle or until the number of +; heartbeat failing hosts drops below the mnfa_threshold before expiry. +; Default value is 0 +; Minimum non-zero value is 100 ; maximum is 86400 +; To modify execute: +; system service-parameter-modify platform maintenance mnfa_timeout= +mnfa_timeout = <%= @mnfa_timeout %> diff --git a/sysinv/sysinv/sysinv/sysinv/common/constants.py b/sysinv/sysinv/sysinv/sysinv/common/constants.py index 2a5f238335..e3d49b7a4f 100644 --- a/sysinv/sysinv/sysinv/sysinv/common/constants.py +++ b/sysinv/sysinv/sysinv/sysinv/common/constants.py @@ -944,12 +944,16 @@ SERVICE_PARAM_PLAT_MTCE_CONTROLLER_BOOT_TIMEOUT = 'controller_boot_timeout' SERVICE_PARAM_PLAT_MTCE_HBS_PERIOD = 'heartbeat_period' SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_THRESHOLD = 'heartbeat_failure_threshold' SERVICE_PARAM_PLAT_MTCE_HBS_DEGRADE_THRESHOLD = 'heartbeat_degrade_threshold' +SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD = 'mnfa_threshold' +SERVICE_PARAM_PLAT_MTCE_MNFA_TIMEOUT = 'mnfa_timeout' SERVICE_PARAM_PLAT_MTCE_COMPUTE_BOOT_TIMEOUT_DEFAULT = 720 SERVICE_PARAM_PLAT_MTCE_CONTROLLER_BOOT_TIMEOUT_DEFAULT = 1200 SERVICE_PARAM_PLAT_MTCE_HBS_PERIOD_DEFAULT = 100 SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_THRESHOLD_DEFAULT = 10 SERVICE_PARAM_PLAT_MTCE_HBS_DEGRADE_THRESHOLD_DEFAULT = 6 +SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD_DEFAULT = 2 +SERVICE_PARAM_PLAT_MTCE_MNFA_TIMEOUT_DEFAULT = 0 # Nova Service Parameters SERVICE_PARAM_SECTION_NOVA_PCI_ALIAS = 'pci_alias' diff --git a/sysinv/sysinv/sysinv/sysinv/common/service_parameter.py b/sysinv/sysinv/sysinv/sysinv/common/service_parameter.py index 67c4765ae7..1c7968df35 100644 --- a/sysinv/sysinv/sysinv/sysinv/common/service_parameter.py +++ b/sysinv/sysinv/sysinv/sysinv/common/service_parameter.py @@ -93,6 +93,18 @@ def 
_validate_range(name, value, min, max): "Parameter '%s' must be an integer value." % name)) +def _validate_zero_or_range(name, value, min, max): + try: + if int(value) != 0: + if int(value) < min or int(value) > max: + raise wsme.exc.ClientSideError(_( + "Parameter '%s' must be zero or between %d and %d.") + % (name, min, max)) + except ValueError: + raise wsme.exc.ClientSideError(_( + "Parameter '%s' must be an integer value." % name)) + + def _validate_ldap_url(name, value): url = urlparse.urlparse(value) @@ -545,6 +557,19 @@ def _validate_hbs_degrade_threshold(name, value): SERVICE_PARAM_PLAT_MTCE_HBS_DEGRADE_THRESHOLD_MAX) +def _validate_mnfa_threshold(name, value): + _validate_range(name, value, + SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD_MIN, + SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD_MAX) + + +def _validate_mnfa_timeout(name, value): + # accept zero (no timeout) or a reasonable/tested specific range + _validate_zero_or_range(name, value, + SERVICE_PARAM_PLAT_MTCE_MNFA_TIMEOUT_MIN, + SERVICE_PARAM_PLAT_MTCE_MNFA_TIMEOUT_MAX) + + # Validate range of Performance Monitoring Event 'time to live" value def _validate_event_time_to_live_range(name, value): _validate_range(name, value, @@ -1308,6 +1333,8 @@ PLATFORM_MTCE_PARAMETER_MANDATORY = [ constants.SERVICE_PARAM_PLAT_MTCE_HBS_PERIOD, constants.SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_THRESHOLD, constants.SERVICE_PARAM_PLAT_MTCE_HBS_DEGRADE_THRESHOLD, + constants.SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD, + constants.SERVICE_PARAM_PLAT_MTCE_MNFA_TIMEOUT, ] PLATFORM_SYSINV_PARAMETER_PROTECTED = ['firewall_rules_id'] @@ -1322,6 +1349,10 @@ SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_THRESHOLD_MIN = 10 SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_THRESHOLD_MAX = 100 SERVICE_PARAM_PLAT_MTCE_HBS_DEGRADE_THRESHOLD_MIN = 4 SERVICE_PARAM_PLAT_MTCE_HBS_DEGRADE_THRESHOLD_MAX = 100 +SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD_MIN = 2 +SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD_MAX = 100 +SERVICE_PARAM_PLAT_MTCE_MNFA_TIMEOUT_MIN = 100 
+SERVICE_PARAM_PLAT_MTCE_MNFA_TIMEOUT_MAX = 86400 PLATFORM_MTCE_PARAMETER_VALIDATOR = { constants.SERVICE_PARAM_PLAT_MTCE_COMPUTE_BOOT_TIMEOUT: @@ -1334,6 +1365,10 @@ PLATFORM_MTCE_PARAMETER_VALIDATOR = { _validate_hbs_failure_threshold, constants.SERVICE_PARAM_PLAT_MTCE_HBS_DEGRADE_THRESHOLD: _validate_hbs_degrade_threshold, + constants.SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD: + _validate_mnfa_threshold, + constants.SERVICE_PARAM_PLAT_MTCE_MNFA_TIMEOUT: + _validate_mnfa_timeout, } PLATFORM_MTCE_PARAMETER_RESOURCE = { @@ -1342,6 +1377,8 @@ PLATFORM_MTCE_PARAMETER_RESOURCE = { constants.SERVICE_PARAM_PLAT_MTCE_HBS_PERIOD: 'platform::mtce::params::heartbeat_period', constants.SERVICE_PARAM_PLAT_MTCE_HBS_FAILURE_THRESHOLD: 'platform::mtce::params::heartbeat_failure_threshold', constants.SERVICE_PARAM_PLAT_MTCE_HBS_DEGRADE_THRESHOLD: 'platform::mtce::params::heartbeat_degrade_threshold', + constants.SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD: 'platform::mtce::params::mnfa_threshold', + constants.SERVICE_PARAM_PLAT_MTCE_MNFA_TIMEOUT: 'platform::mtce::params::mnfa_timeout', } # Panko Event TTL range from 1 hour to 1 year diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py index a82425ca5f..bfbae00992 100644 --- a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py +++ b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py @@ -486,6 +486,16 @@ class ConductorManager(service.PeriodicService): 'name': constants.SERVICE_PARAM_PLAT_MTCE_HBS_DEGRADE_THRESHOLD, 'value': constants.SERVICE_PARAM_PLAT_MTCE_HBS_DEGRADE_THRESHOLD_DEFAULT, }, + {'service': constants.SERVICE_TYPE_PLATFORM, + 'section': constants.SERVICE_PARAM_SECTION_PLATFORM_MAINTENANCE, + 'name': constants.SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD, + 'value': constants.SERVICE_PARAM_PLAT_MTCE_MNFA_THRESHOLD_DEFAULT, + }, + {'service': constants.SERVICE_TYPE_PLATFORM, + 'section': constants.SERVICE_PARAM_SECTION_PLATFORM_MAINTENANCE, + 'name': 
constants.SERVICE_PARAM_PLAT_MTCE_MNFA_TIMEOUT, + 'value': constants.SERVICE_PARAM_PLAT_MTCE_MNFA_TIMEOUT_DEFAULT, + }, {'service': constants.SERVICE_TYPE_PANKO, 'section': constants.SERVICE_PARAM_SECTION_PANKO_DATABASE, 'name': constants.SERVICE_PARAM_NAME_PANKO_DATABASE_EVENT_TIME_TO_LIVE,