Filter out skippable alarms in precheck

Subcloud online checks are now skipped if the subcloud is
already in the 'migrated' state. The precheck also skips the
upgrade alarm, as well as the host lock alarm, once the
upgrade has started.
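
A minimal sketch of the filtering idea described above (the helper
name and structure are illustrative, not part of this commit; the
actual logic is in the PreCheckState diff below):

    # Illustrative sketch, not the dcmanager implementation.
    UPGRADE_IN_PROGRESS_ALARM = '900.005'
    HOST_ADMINISTRATIVELY_LOCKED_ALARM = '200.001'
    ALARM_IGNORE_LIST = [UPGRADE_IN_PROGRESS_ALARM]

    def blocking_alarms(alarms, upgrade_started, host_locked):
        """Return the management-affecting alarms that should still block
        the precheck, assuming alarm objects expose alarm_id and
        mgmt_affecting as returned by the FM client."""
        ignore = list(ALARM_IGNORE_LIST)
        if upgrade_started and host_locked:
            # A locked controller is expected mid-upgrade, so its alarm
            # is also ignorable once the upgrade has started.
            ignore.append(HOST_ADMINISTRATIVELY_LOCKED_ALARM)
        return [alarm for alarm in alarms
                if alarm.alarm_id not in ignore
                and alarm.mgmt_affecting == "True"]

An empty result means all remaining alarms are skippable and the
precheck can proceed.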

In addition, the commit includes code to handle a bmc_password
with a None value, a case which the previous commit
(69a7449988) did not handle.
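
A hedged sketch of that None handling (encode_bmc_password is a
hypothetical helper, not a function added by this commit; the real
fix is inline in the UpgradingSimplexState diff below):

    # Illustrative helper; the commit's actual change is inline in
    # UpgradingSimplexState.
    from base64 import b64encode

    def encode_bmc_password(bmc_password):
        """Return the base64-encoded BMC password, or None when barbican
        has no password stored, so the caller can fall back to the value
        persisted in the dcmanager database."""
        if not bmc_password:
            return None
        if isinstance(bmc_password, str):
            bmc_password = bmc_password.encode('utf-8')
        return b64encode(bmc_password)

With this shape, encode_bmc_password(None) simply returns None rather
than raising the TypeError that b64encode(None) would.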

Closes-Bug: 1924774
Change-Id: Ifdfe1e44a34c2c2561c299a82d7178dce6063daf
Signed-off-by: Tee Ngo <Tee.Ngo@windriver.com>
(cherry picked from commit a9157c51d5)
Tee Ngo 2021-04-16 11:00:26 -04:00
parent afde5fa53f
commit de0fef663a
7 changed files with 255 additions and 81 deletions


@@ -56,3 +56,14 @@ class FmClient(base.DriverBase):
LOG.error("get_alarm_summary exception={}".format(e))
raise e
return alarms
def get_alarms(self):
"""Get this region alarms"""
try:
LOG.debug("get_alarms region %s" % self.region_name)
alarms = self.fm.alarm.list(include_suppress=True)
except Exception as e:
LOG.error("get_alarms exception={}".format(e))
raise e
return alarms


@@ -9,6 +9,7 @@ import six
from oslo_log import log as logging
from dccommon.drivers.openstack.barbican import BarbicanClient
from dccommon.drivers.openstack.fm import FmClient
from dccommon.drivers.openstack.patching_v1 import PatchingClient
from dccommon.drivers.openstack.sdk_platform import OpenStackDriver
from dccommon.drivers.openstack.sysinv_v1 import SysinvClient
@@ -101,6 +102,10 @@ class BaseState(object):
return SysinvClient(region_name, keystone_client.session)
def get_fm_client(self, region_name):
keystone_client = self.get_keystone_client(region_name)
return FmClient(region_name, keystone_client.session)
def get_patching_client(self, region_name=consts.DEFAULT_REGION_NAME):
keystone_client = self.get_keystone_client(region_name)
return PatchingClient(region_name, keystone_client.session)


@@ -3,6 +3,7 @@
#
# SPDX-License-Identifier: Apache-2.0
#
import copy
import re
from dccommon.drivers.openstack.sysinv_v1 import HOST_FS_NAME_SCRATCH
@@ -26,6 +27,11 @@ VALID_ACTIVATION_STATES = [consts.DEPLOY_STATE_MIGRATED, ]
MIN_SCRATCH_SIZE_REQUIRED_GB = 16
UPGRADE_IN_PROGRESS_ALARM = '900.005'
HOST_ADMINISTRATIVELY_LOCKED_ALARM = '200.001'
ALARM_IGNORE_LIST = [UPGRADE_IN_PROGRESS_ALARM, ]
class PreCheckState(BaseState):
"""This State performs entry checks and skips to the appropriate state"""
@@ -34,73 +40,85 @@ class PreCheckState(BaseState):
super(PreCheckState, self).__init__(
next_state=consts.STRATEGY_STATE_INSTALLING_LICENSE, region_name=region_name)
def _perform_subcloud_online_checks(self, strategy_step, subcloud_sysinv_client):
subcloud_type = self.get_sysinv_client(
strategy_step.subcloud.name).get_system().system_mode
upgrades = self.get_sysinv_client(strategy_step.subcloud.name).get_upgrades()
def _perform_subcloud_online_checks(self, strategy_step, subcloud_sysinv_client,
subcloud_fm_client, host, alarm_ignore_list):
# For duplex upgrade, we skip health checks if an upgrade is in progress.
# check system health
#
# Sample output #1
# ================
# Some non-management affecting alarms, all other checks passed
#
# System Health:
# All hosts are provisioned: [OK]
# All hosts are unlocked/enabled: [OK]
# All hosts have current configurations: [OK]
# All hosts are patch current: [OK]
# Ceph Storage Healthy: [OK]
# No alarms: [Fail]
# [1] alarms found, [0] of which are management affecting
# All kubernetes nodes are ready: [OK]
# All kubernetes control plane pods are ready: [OK]
#
# Sample output #2
# ================
# Multiple failed checks, management affecting alarms
#
# System Health:
# All hosts are provisioned: [OK]
# All hosts are unlocked/enabled: [OK]
# All hosts have current configurations: [OK]
# All hosts are patch current: [OK]
# Ceph Storage Healthy: [Fail]
# No alarms: [Fail]
# [7] alarms found, [2] of which are management affecting
# All kubernetes nodes are ready: [OK]
# All kubernetes control plane pods are ready: [OK]
if (len(upgrades) != 0 and subcloud_type == consts.SYSTEM_MODE_DUPLEX):
self.info_log(strategy_step, "Health check skipped for non-simplex subclouds.")
# TODO(teewrs): Update the sysinv API to allow a list of ignored alarms
# to be passed to the health check API. This would be much more efficient
# than having to retrieve the alarms in a separate step.
system_health = subcloud_sysinv_client.get_system_health()
fails = re.findall("\[Fail\]", system_health)
failed_alarm_check = re.findall("No alarms: \[Fail\]", system_health)
no_mgmt_alarms = re.findall("\[0\] of which are management affecting",
system_health)
# The health conditions acceptable for upgrade are:
# a) subcloud is completely healthy (i.e. no failed checks)
# b) subcloud only fails alarm check and it only has non-management
# affecting alarm(s)
# c) the management alarm(s) that subcloud has once upgrade has started
# are upgrade alarm itself and host locked alarm
if ((len(fails) == 0) or
(len(fails) == 1 and failed_alarm_check and no_mgmt_alarms)):
self.info_log(strategy_step, "Health check passed.")
elif ((len(fails) > 1) or (len(fails) == 1 and not failed_alarm_check)):
# Multiple failures or kubernetes related failure which has not been
# converted into an alarm condition.
details = "System health check failed due to multiple failures. " \
"Please run 'system health-query' command on the " \
"subcloud for more details."
self.error_log(strategy_step, "\n" + system_health)
raise PreCheckFailedException(
subcloud=strategy_step.subcloud.name,
details=details,
)
else:
# check system health
#
# Sample output #1
# ================
# Some non-management affecting alarms, all other checks passed
#
# System Health:
# All hosts are provisioned: [OK]
# All hosts are unlocked/enabled: [OK]
# All hosts have current configurations: [OK]
# All hosts are patch current: [OK]
# Ceph Storage Healthy: [OK]
# No alarms: [Fail]
# [1] alarms found, [0] of which are management affecting
# All kubernetes nodes are ready: [OK]
# All kubernetes control plane pods are ready: [OK]
#
# Sample output #2
# ================
# Multiple failed checks, management affecting alarms
#
# System Health:
# All hosts are provisioned: [OK]
# All hosts are unlocked/enabled: [OK]
# All hosts have current configurations: [OK]
# All hosts are patch current: [OK]
# Ceph Storage Healthy: [Fail]
# No alarms: [Fail]
# [7] alarms found, [2] of which are management affecting
# All kubernetes nodes are ready: [OK]
# All kubernetes control plane pods are ready: [OK]
system_health = subcloud_sysinv_client.get_system_health()
fails = re.findall("\[Fail\]", system_health)
failed_alarm_check = re.findall("No alarms: \[Fail\]", system_health)
no_mgmt_alarms = re.findall("\[0\] of which are management affecting",
system_health)
# The only 2 health conditions acceptable for simplex upgrade are:
# a) subcloud is completely healthy (i.e. no failed checks)
# b) subcloud only fails alarm check and it only has non-management
# affecting alarm(s)
if ((len(fails) == 0) or
(len(fails) == 1 and failed_alarm_check and no_mgmt_alarms)):
self.info_log(strategy_step, "Health check passed.")
else:
details = "System health check failed. " \
"Please run 'system health-query' " \
"command on the subcloud for more details."
self.error_log(strategy_step, "\n" + system_health)
raise PreCheckFailedException(
subcloud=strategy_step.subcloud.name,
details=details,
)
alarms = subcloud_fm_client.get_alarms()
for alarm in alarms:
if alarm.alarm_id not in alarm_ignore_list:
if alarm.mgmt_affecting == "True":
details = "System health check failed due to alarm %s. " \
"Please run 'system health-query' " \
"command on the subcloud for more details." % alarm.alarm_id
self.error_log(strategy_step, "\n" + system_health)
raise PreCheckFailedException(
subcloud=strategy_step.subcloud.name,
details=details,
)
# check scratch
host = subcloud_sysinv_client.get_host("controller-0")
scratch_fs = subcloud_sysinv_client.get_host_filesystem(
host.uuid, HOST_FS_NAME_SCRATCH)
if scratch_fs.size < MIN_SCRATCH_SIZE_REQUIRED_GB:
@@ -125,12 +143,12 @@ class PreCheckState(BaseState):
subcloud_sysinv_client = None
try:
subcloud_sysinv_client = self.get_sysinv_client(strategy_step.subcloud.name)
subcloud_fm_client = self.get_fm_client(strategy_step.subcloud.name)
except Exception:
# if getting the token times out, the orchestrator may have
# restarted and subcloud may be offline; so will attempt
# to use the persisted values
message = ("_perform_subcloud_online_checks subcloud %s "
"failed to get subcloud client" %
message = ("Subcloud %s failed to get subcloud client" %
strategy_step.subcloud.name)
self.error_log(strategy_step, message)
raise ManualRecoveryRequiredException(
@@ -160,7 +178,24 @@ class PreCheckState(BaseState):
self.override_next_state(consts.STRATEGY_STATE_UPGRADING_SIMPLEX)
return self.next_state
self._perform_subcloud_online_checks(strategy_step, subcloud_sysinv_client)
# We skip the subcloud online check if either the subcloud deploy status is
# "migrated" or the subcloud is a duplex subcloud and upgrade has started.
upgrades = subcloud_sysinv_client.get_upgrades()
if (subcloud.deploy_status == consts.DEPLOY_STATE_MIGRATED or
(len(upgrades) != 0 and subcloud_type == consts.SYSTEM_MODE_DUPLEX)):
self.info_log(strategy_step,
"Online subcloud checks skipped.")
else:
alarm_ignore_list = copy.copy(ALARM_IGNORE_LIST)
if (host.administrative == consts.ADMIN_LOCKED and len(upgrades) != 0):
alarm_ignore_list.append(HOST_ADMINISTRATIVELY_LOCKED_ALARM)
self._perform_subcloud_online_checks(strategy_step,
subcloud_sysinv_client,
subcloud_fm_client,
host,
alarm_ignore_list)
# If the subcloud has completed data migration and is online,
# advance directly to activating upgrade step. Otherwise, start
# from installing license step.


@@ -281,11 +281,16 @@ class UpgradingSimplexState(BaseState):
bmc_password = None
if subcloud_barbican_client:
bmc_password = subcloud_barbican_client.get_host_bmc_password(host.uuid)
if bmc_password:
# If the host is configured to store bmc in its barbican database,
# encode the password. Otherwise leave it as None and it will be
# replaced with the value retrieved from the dcmanager database.
bmc_password = b64encode(bmc_password)
volatile_data_install.update({
'bmc_address': host.bm_ip,
'bmc_username': host.bm_username,
'bmc_password': b64encode(bmc_password),
'bmc_password': bmc_password,
'install_type': install_type,
'boot_device': host.boot_device,
'rootfs_device': host.rootfs_device,


@@ -205,6 +205,11 @@ class FakePatchingClient(object):
pass
class FakeFmClient(object):
def __init__(self):
pass
class FakeSystem(object):
def __init__(self,
obj_id=1,
@@ -226,3 +231,11 @@ class FakeUpgrade(object):
self.from_release = from_release
self.to_release = to_release
self.links = []
class FakeAlarm(object):
def __init__(self,
alarm_id='12.34',
mgmt_affecting='False'):
self.alarm_id = alarm_id
self.mgmt_affecting = mgmt_affecting


@@ -7,10 +7,12 @@ import mock
from dcmanager.common import consts
from dcmanager.tests.unit.orchestrator.states.fakes import FakeAlarm
from dcmanager.tests.unit.orchestrator.states.fakes import FakeController
from dcmanager.tests.unit.orchestrator.states.fakes import FakeHostFilesystem
from dcmanager.tests.unit.orchestrator.states.fakes import FakeSubcloud
from dcmanager.tests.unit.orchestrator.states.fakes import FakeSystem
from dcmanager.tests.unit.orchestrator.states.fakes import FakeUpgrade
from dcmanager.tests.unit.orchestrator.states.upgrade.test_base \
import TestSwUpgradeState
@@ -66,6 +68,22 @@ SYSTEM_HEALTH_RESPONSE_MULTIPLE_FAILED_HEALTH_CHECKS = \
"All kubernetes control plane pods are ready: [Fail]\n" \
"Kubernetes control plane pods not ready: kube-apiserver-controller-0"
SYSTEM_HEALTH_RESPONSE_K8S_FAILED_HEALTH_CHECKS = \
"System Health:\n" \
"All hosts are provisioned: [OK]\n" \
"All hosts are unlocked/enabled: [OK]\n" \
"All hosts have current configurations: [OK]\n" \
"All hosts are patch current: [OK]\n" \
"Ceph Storage Healthy: [OK]\n" \
"No alarms: [OK]\n" \
"All kubernetes nodes are ready: [Fail]\n" \
"All kubernetes control plane pods are ready: [OK]"
UPGRADE_STARTED = FakeUpgrade(state='started')
UPGRADE_ALARM = FakeAlarm('900.005', 'True')
HOST_LOCKED_ALARM = FakeAlarm('200.001', 'True')
class TestSwUpgradePreCheckStage(TestSwUpgradeState):
@@ -92,6 +110,7 @@ class TestSwUpgradePreCheckStage(TestSwUpgradeState):
system_values.system_mode = consts.SYSTEM_MODE_SIMPLEX
self.sysinv_client.get_system.return_value = system_values
self.sysinv_client.get_upgrades = mock.MagicMock()
self.fm_client.get_alarms = mock.MagicMock()
def test_upgrade_pre_check_subcloud_online_fresh(self):
"""Test pre check step where the subcloud is online and running N load
@@ -172,34 +191,28 @@ class TestSwUpgradePreCheckStage(TestSwUpgradeState):
availability_status=consts.AVAILABILITY_ONLINE,
deploy_status=consts.DEPLOY_STATE_MIGRATED)
self.sysinv_client.get_host_filesystem.side_effect = \
[CONTROLLER_0_HOST_FS_SCRATCH_MIN_SIZED]
self.sysinv_client.get_system_health.return_value = \
SYSTEM_HEALTH_RESPONSE_SUCCESS
# invoke the strategy state operation on the orch thread
self.worker.perform_state_action(self.strategy_step)
# verify the DB query was invoked
self.mock_db_query.assert_called()
# verify the get system health API call was invoked
self.sysinv_client.get_system_health.assert_called()
# verify the get system health API call was not invoked
self.sysinv_client.get_system_health.assert_not_called()
# verify the get host filesystem API call was invoked
self.sysinv_client.get_host_filesystem.assert_called()
# verify the get host filesystem API call was not invoked
self.sysinv_client.get_host_filesystem.assert_not_called()
# Verify the expected next state happened (activating upgrade)
self.assert_step_updated(self.strategy_step.subcloud_id,
consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
def test_upgrade_pre_check_subcloud_online_management_alarm(self):
"""Test pre check step where the subcloud is online with a mgmt alarm
def test_upgrade_pre_check_subcloud_online_host_locked_upgrade_started_mgmt_alarms(self):
"""Test pre check step where the subcloud is online, locked and upgrade has started.
The pre-check should raise an exception and transition to the failed
state when the subcloud is not ready for upgrade due to the management
affecting alarm.
The pre-check should move to the next step as the upgrade alarm can
be ignored and the host locked alarm can also be ignored if upgrade has
started.
"""
# online subcloud running N+1 load
@@ -207,9 +220,20 @@ class TestSwUpgradePreCheckStage(TestSwUpgradeState):
availability_status=consts.AVAILABILITY_ONLINE,
deploy_status=consts.DEPLOY_STATE_DONE)
# subcloud is locked
self.sysinv_client.get_host.side_effect = [CONTROLLER_0_LOCKED]
# upgrade has started
self.sysinv_client.get_upgrades.return_value = [UPGRADE_STARTED, ]
self.sysinv_client.get_system_health.return_value = \
SYSTEM_HEALTH_RESPONSE_MGMT_AFFECTING_ALARM
self.fm_client.get_alarms.return_value = [UPGRADE_ALARM, HOST_LOCKED_ALARM, ]
self.sysinv_client.get_host_filesystem.side_effect = \
[CONTROLLER_0_HOST_FS_SCRATCH_MIN_SIZED]
# invoke the strategy state operation on the orch thread
self.worker.perform_state_action(self.strategy_step)
@@ -219,6 +243,50 @@ class TestSwUpgradePreCheckStage(TestSwUpgradeState):
# verify the get system health API call was invoked
self.sysinv_client.get_system_health.assert_called()
# verify the get alarms API call was invoked
self.fm_client.get_alarms.assert_called()
# verify the get host filesystem API call was invoked
self.sysinv_client.get_host_filesystem.assert_called()
# Verify the expected next state happened (installing license)
self.assert_step_updated(self.strategy_step.subcloud_id,
consts.STRATEGY_STATE_INSTALLING_LICENSE)
def test_upgrade_pre_check_subcloud_online_host_locked_no_upgrade_mgmt_alarms(self):
"""Test pre check step where subcloud is online, locked and upgrade has not started.
The pre-check should raise an exception and transition to the failed
state as host locked alarm cannot be skipped if upgrade has
not been started.
"""
# online subcloud running N+1 load
self.mock_db_query.return_value = FakeSubcloud(
availability_status=consts.AVAILABILITY_ONLINE,
deploy_status=consts.DEPLOY_STATE_DONE)
# subcloud is locked
self.sysinv_client.get_host.side_effect = [CONTROLLER_0_LOCKED]
self.sysinv_client.get_system_health.return_value = \
SYSTEM_HEALTH_RESPONSE_MGMT_AFFECTING_ALARM
self.fm_client.get_alarms.return_value = [HOST_LOCKED_ALARM, ]
# self.fm_client.get_alarms.return_value = \
# invoke the strategy state operation on the orch thread
self.worker.perform_state_action(self.strategy_step)
# verify the DB query was invoked
self.mock_db_query.assert_called()
# verify the get system health API call was invoked
self.sysinv_client.get_system_health.assert_called()
# verify the get alarms API call was invoked
self.fm_client.get_alarms.assert_called()
# Verify the exception caused the state to go to failed
self.assert_step_updated(self.strategy_step.subcloud_id,
consts.STRATEGY_STATE_FAILED)
@@ -252,6 +320,35 @@ class TestSwUpgradePreCheckStage(TestSwUpgradeState):
self.assert_step_updated(self.strategy_step.subcloud_id,
consts.STRATEGY_STATE_FAILED)
def test_upgrade_pre_check_subcloud_online_failed_health_checks_no_alarms(self):
"""Test pre check step where the subcloud is online but is unhealthy
The pre-check should raise an exception and transition to the failed
state when the subcloud is not ready for upgrade due to some failure
other than platform alarms.
"""
# online subcloud running N+1 load
self.mock_db_query.return_value = FakeSubcloud(
availability_status=consts.AVAILABILITY_ONLINE,
deploy_status=consts.DEPLOY_STATE_DONE)
self.sysinv_client.get_system_health.return_value = \
SYSTEM_HEALTH_RESPONSE_K8S_FAILED_HEALTH_CHECKS
# invoke the strategy state operation on the orch thread
self.worker.perform_state_action(self.strategy_step)
# verify the DB query was invoked
self.mock_db_query.assert_called()
# verify the get system health API call was invoked
self.sysinv_client.get_system_health.assert_called()
# Verify the exception caused the state to go to failed
self.assert_step_updated(self.strategy_step.subcloud_id,
consts.STRATEGY_STATE_FAILED)
def test_upgrade_pre_check_subcloud_online_scratch_undersized(self):
"""Test pre check step where the subcloud is online undersized scratch


@@ -30,6 +30,7 @@ from dcmanager.tests import base
from dcmanager.tests.unit.common import fake_strategy
from dcmanager.tests.unit.common import fake_subcloud
from dcmanager.tests.unit.fakes import FakeVimClient
from dcmanager.tests.unit.orchestrator.states.fakes import FakeFmClient
from dcmanager.tests.unit.orchestrator.states.fakes import FakeKeystoneClient
from dcmanager.tests.unit.orchestrator.states.fakes import FakePatchingClient
from dcmanager.tests.unit.orchestrator.states.fakes import FakeSysinvClient
@@ -84,6 +85,13 @@ class TestSwUpdate(base.DCManagerTestCase):
self.mock_vim_client.return_value = self.vim_client
self.addCleanup(p.stop)
# Mock the fm client defined in the base state class
self.fm_client = FakeFmClient()
p = mock.patch.object(BaseState, 'get_fm_client')
self.mock_fm_client = p.start()
self.mock_fm_client.return_value = self.fm_client
self.addCleanup(p.stop)
def setup_orch_worker(self, strategy_type):
worker = None
mock_strategy_lock = mock.Mock()