Filter out skippable alarms in precheck

Subcloud online checks are now skipped if the subcloud is
already in the 'migrated' state. The precheck also skips the
upgrade alarm, as well as the host lock alarm, once the
upgrade has started.
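
A minimal sketch of the filtering idea described above (the helper
name and structure are illustrative, not part of this commit; the
actual logic is in the PreCheckState diff below):

    # Illustrative sketch, not the dcmanager implementation.
    UPGRADE_IN_PROGRESS_ALARM = '900.005'
    HOST_ADMINISTRATIVELY_LOCKED_ALARM = '200.001'
    ALARM_IGNORE_LIST = [UPGRADE_IN_PROGRESS_ALARM]

    def blocking_alarms(alarms, upgrade_started, host_locked):
        """Return the management-affecting alarms that should still block
        the precheck, assuming alarm objects expose alarm_id and
        mgmt_affecting as returned by the FM client."""
        ignore = list(ALARM_IGNORE_LIST)
        if upgrade_started and host_locked:
            # A locked controller is expected mid-upgrade, so its alarm
            # is also ignorable once the upgrade has started.
            ignore.append(HOST_ADMINISTRATIVELY_LOCKED_ALARM)
        return [alarm for alarm in alarms
                if alarm.alarm_id not in ignore
                and alarm.mgmt_affecting == "True"]

An empty result means all remaining alarms are skippable and the
precheck can proceed.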

In addition, the commit includes code to handle a bmc_password
with a None value, a case which the previous commit
(69a7449988) did not handle.
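
A hedged sketch of that None handling (encode_bmc_password is a
hypothetical helper, not a function added by this commit; the real
fix is inline in the UpgradingSimplexState diff below):

    # Illustrative helper; the commit's actual change is inline in
    # UpgradingSimplexState.
    from base64 import b64encode

    def encode_bmc_password(bmc_password):
        """Return the base64-encoded BMC password, or None when barbican
        has no password stored, so the caller can fall back to the value
        persisted in the dcmanager database."""
        if not bmc_password:
            return None
        if isinstance(bmc_password, str):
            bmc_password = bmc_password.encode('utf-8')
        return b64encode(bmc_password)

With this shape, encode_bmc_password(None) simply returns None rather
than raising the TypeError that b64encode(None) would.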

Closes-Bug: 1924774
Change-Id: Ifdfe1e44a34c2c2561c299a82d7178dce6063daf
Signed-off-by: Tee Ngo <Tee.Ngo@windriver.com>
(cherry picked from commit a9157c51d5)
Tee Ngo 2021-04-16 11:00:26 -04:00
parent afde5fa53f
commit de0fef663a
7 changed files with 255 additions and 81 deletions


@@ -56,3 +56,14 @@ class FmClient(base.DriverBase):
LOG.error("get_alarm_summary exception={}".format(e))
raise e
return alarms
def get_alarms(self):
"""Get this region alarms"""
try:
LOG.debug("get_alarms region %s" % self.region_name)
alarms = self.fm.alarm.list(include_suppress=True)
except Exception as e:
LOG.error("get_alarms exception={}".format(e))
raise e
return alarms


@@ -9,6 +9,7 @@ import six
from oslo_log import log as logging
from dccommon.drivers.openstack.barbican import BarbicanClient
from dccommon.drivers.openstack.fm import FmClient
from dccommon.drivers.openstack.patching_v1 import PatchingClient
from dccommon.drivers.openstack.sdk_platform import OpenStackDriver
from dccommon.drivers.openstack.sysinv_v1 import SysinvClient
@@ -101,6 +102,10 @@ class BaseState(object):
return SysinvClient(region_name, keystone_client.session)
def get_fm_client(self, region_name):
keystone_client = self.get_keystone_client(region_name)
return FmClient(region_name, keystone_client.session)
def get_patching_client(self, region_name=consts.DEFAULT_REGION_NAME):
keystone_client = self.get_keystone_client(region_name)
return PatchingClient(region_name, keystone_client.session)


@@ -3,6 +3,7 @@
#
# SPDX-License-Identifier: Apache-2.0
#
import copy
import re
from dccommon.drivers.openstack.sysinv_v1 import HOST_FS_NAME_SCRATCH
@@ -26,6 +27,11 @@ VALID_ACTIVATION_STATES = [consts.DEPLOY_STATE_MIGRATED, ]
MIN_SCRATCH_SIZE_REQUIRED_GB = 16
UPGRADE_IN_PROGRESS_ALARM = '900.005'
HOST_ADMINISTRATIVELY_LOCKED_ALARM = '200.001'
ALARM_IGNORE_LIST = [UPGRADE_IN_PROGRESS_ALARM, ]
class PreCheckState(BaseState):
"""This State performs entry checks and skips to the appropriate state"""
@@ -34,73 +40,85 @@ class PreCheckState(BaseState):
super(PreCheckState, self).__init__(
next_state=consts.STRATEGY_STATE_INSTALLING_LICENSE, region_name=region_name)
def _perform_subcloud_online_checks(self, strategy_step, subcloud_sysinv_client):
subcloud_type = self.get_sysinv_client(
strategy_step.subcloud.name).get_system().system_mode
upgrades = self.get_sysinv_client(strategy_step.subcloud.name).get_upgrades()
def _perform_subcloud_online_checks(self, strategy_step, subcloud_sysinv_client,
subcloud_fm_client, host, alarm_ignore_list):
# For duplex upgrade, we skip health checks if an upgrade is in progress.
# check system health
#
# Sample output #1
# ================
# Some non-management affecting alarms, all other checks passed
#
# System Health:
# All hosts are provisioned: [OK]
# All hosts are unlocked/enabled: [OK]
# All hosts have current configurations: [OK]
# All hosts are patch current: [OK]
# Ceph Storage Healthy: [OK]
# No alarms: [Fail]
# [1] alarms found, [0] of which are management affecting
# All kubernetes nodes are ready: [OK]
# All kubernetes control plane pods are ready: [OK]
#
# Sample output #2
# ================
# Multiple failed checks, management affecting alarms
#
# System Health:
# All hosts are provisioned: [OK]
# All hosts are unlocked/enabled: [OK]
# All hosts have current configurations: [OK]
# All hosts are patch current: [OK]
# Ceph Storage Healthy: [Fail]
# No alarms: [Fail]
# [7] alarms found, [2] of which are management affecting
# All kubernetes nodes are ready: [OK]
# All kubernetes control plane pods are ready: [OK]
if (len(upgrades) != 0 and subcloud_type == consts.SYSTEM_MODE_DUPLEX):
self.info_log(strategy_step, "Health check skipped for non-simplex subclouds.")
# TODO(teewrs): Update the sysinv API to allow a list of ignored alarms
# to be passed to the health check API. This would be much more efficient
# than having to retrieve the alarms in a separate step.
system_health = subcloud_sysinv_client.get_system_health()
fails = re.findall("\[Fail\]", system_health)
failed_alarm_check = re.findall("No alarms: \[Fail\]", system_health)
no_mgmt_alarms = re.findall("\[0\] of which are management affecting",
system_health)
# The health conditions acceptable for upgrade are:
# a) subcloud is completely healthy (i.e. no failed checks)
# b) subcloud only fails alarm check and it only has non-management
# affecting alarm(s)
# c) the management alarm(s) that subcloud has once upgrade has started
# are upgrade alarm itself and host locked alarm
if ((len(fails) == 0) or
(len(fails) == 1 and failed_alarm_check and no_mgmt_alarms)):
self.info_log(strategy_step, "Health check passed.")
elif ((len(fails) > 1) or (len(fails) == 1 and not failed_alarm_check)):
# Multiple failures or kubernetes related failure which has not been
# converted into an alarm condition.
details = "System health check failed due to multiple failures. " \
"Please run 'system health-query' command on the " \
"subcloud for more details."
self.error_log(strategy_step, "\n" + system_health)
raise PreCheckFailedException(
subcloud=strategy_step.subcloud.name,
details=details,
)
else:
# check system health
#
# Sample output #1
# ================
# Some non-management affecting alarms, all other checks passed
#
# System Health:
# All hosts are provisioned: [OK]
# All hosts are unlocked/enabled: [OK]
# All hosts have current configurations: [OK]
# All hosts are patch current: [OK]
# Ceph Storage Healthy: [OK]
# No alarms: [Fail]
# [1] alarms found, [0] of which are management affecting
# All kubernetes nodes are ready: [OK]
# All kubernetes control plane pods are ready: [OK]
#
# Sample output #2
# ================
# Multiple failed checks, management affecting alarms
#
# System Health:
# All hosts are provisioned: [OK]
# All hosts are unlocked/enabled: [OK]
# All hosts have current configurations: [OK]
# All hosts are patch current: [OK]
# Ceph Storage Healthy: [Fail]
# No alarms: [Fail]
# [7] alarms found, [2] of which are management affecting
# All kubernetes nodes are ready: [OK]
# All kubernetes control plane pods are ready: [OK]
system_health = subcloud_sysinv_client.get_system_health()
fails = re.findall("\[Fail\]", system_health)
failed_alarm_check = re.findall("No alarms: \[Fail\]", system_health)
no_mgmt_alarms = re.findall("\[0\] of which are management affecting",
system_health)
# The only 2 health conditions acceptable for simplex upgrade are:
# a) subcloud is completely healthy (i.e. no failed checks)
# b) subcloud only fails alarm check and it only has non-management
# affecting alarm(s)
if ((len(fails) == 0) or
(len(fails) == 1 and failed_alarm_check and no_mgmt_alarms)):
self.info_log(strategy_step, "Health check passed.")
else:
details = "System health check failed. " \
"Please run 'system health-query' " \
"command on the subcloud for more details."
self.error_log(strategy_step, "\n" + system_health)
raise PreCheckFailedException(
subcloud=strategy_step.subcloud.name,
details=details,
)
alarms = subcloud_fm_client.get_alarms()
for alarm in alarms:
if alarm.alarm_id not in alarm_ignore_list:
if alarm.mgmt_affecting == "True":
details = "System health check failed due to alarm %s. " \
"Please run 'system health-query' " \
"command on the subcloud for more details." % alarm.alarm_id
self.error_log(strategy_step, "\n" + system_health)
raise PreCheckFailedException(
subcloud=strategy_step.subcloud.name,
details=details,
)
# check scratch
host = subcloud_sysinv_client.get_host("controller-0")
scratch_fs = subcloud_sysinv_client.get_host_filesystem(
host.uuid, HOST_FS_NAME_SCRATCH)
if scratch_fs.size < MIN_SCRATCH_SIZE_REQUIRED_GB:
@@ -125,12 +143,12 @@ class PreCheckState(BaseState):
subcloud_sysinv_client = None
try:
subcloud_sysinv_client = self.get_sysinv_client(strategy_step.subcloud.name)
subcloud_fm_client = self.get_fm_client(strategy_step.subcloud.name)
except Exception:
# if getting the token times out, the orchestrator may have
# restarted and subcloud may be offline; so will attempt
# to use the persisted values
message = ("_perform_subcloud_online_checks subcloud %s "
"failed to get subcloud client" %
message = ("Subcloud %s failed to get subcloud client" %
strategy_step.subcloud.name)
self.error_log(strategy_step, message)
raise ManualRecoveryRequiredException(
@@ -160,7 +178,24 @@ class PreCheckState(BaseState):
self.override_next_state(consts.STRATEGY_STATE_UPGRADING_SIMPLEX)
return self.next_state
self._perform_subcloud_online_checks(strategy_step, subcloud_sysinv_client)
# We skip the subcloud online check if either the subcloud deploy status is
# "migrated" or the subcloud is a duplex subcloud and upgrade has started.
upgrades = subcloud_sysinv_client.get_upgrades()
if (subcloud.deploy_status == consts.DEPLOY_STATE_MIGRATED or
(len(upgrades) != 0 and subcloud_type == consts.SYSTEM_MODE_DUPLEX)):
self.info_log(strategy_step,
"Online subcloud checks skipped.")
else:
alarm_ignore_list = copy.copy(ALARM_IGNORE_LIST)
if (host.administrative == consts.ADMIN_LOCKED and len(upgrades) != 0):
alarm_ignore_list.append(HOST_ADMINISTRATIVELY_LOCKED_ALARM)
self._perform_subcloud_online_checks(strategy_step,
subcloud_sysinv_client,
subcloud_fm_client,
host,
alarm_ignore_list)
# If the subcloud has completed data migration and is online,
# advance directly to activating upgrade step. Otherwise, start
# from installing license step.


@@ -281,11 +281,16 @@ class UpgradingSimplexState(BaseState):
bmc_password = None
if subcloud_barbican_client:
bmc_password = subcloud_barbican_client.get_host_bmc_password(host.uuid)
if bmc_password:
# If the host is configured to store bmc in its barbican database,
# encode the password. Otherwise leave it as None and it will be
# replaced with the value retrieved from the dcmanager database.
bmc_password = b64encode(bmc_password)
volatile_data_install.update({
'bmc_address': host.bm_ip,
'bmc_username': host.bm_username,
'bmc_password': b64encode(bmc_password),
'bmc_password': bmc_password,
'install_type': install_type,
'boot_device': host.boot_device,
'rootfs_device': host.rootfs_device,


@@ -205,6 +205,11 @@ class FakePatchingClient(object):
pass
class FakeFmClient(object):
def __init__(self):
pass
class FakeSystem(object):
def __init__(self,
obj_id=1,
@@ -226,3 +231,11 @@ class FakeUpgrade(object):
self.from_release = from_release
self.to_release = to_release
self.links = []
class FakeAlarm(object):
def __init__(self,
alarm_id='12.34',
mgmt_affecting='False'):
self.alarm_id = alarm_id
self.mgmt_affecting = mgmt_affecting


@@ -7,10 +7,12 @@ import mock
from dcmanager.common import consts
from dcmanager.tests.unit.orchestrator.states.fakes import FakeAlarm
from dcmanager.tests.unit.orchestrator.states.fakes import FakeController
from dcmanager.tests.unit.orchestrator.states.fakes import FakeHostFilesystem
from dcmanager.tests.unit.orchestrator.states.fakes import FakeSubcloud
from dcmanager.tests.unit.orchestrator.states.fakes import FakeSystem
from dcmanager.tests.unit.orchestrator.states.fakes import FakeUpgrade
from dcmanager.tests.unit.orchestrator.states.upgrade.test_base \
import TestSwUpgradeState
@@ -66,6 +68,22 @@ SYSTEM_HEALTH_RESPONSE_MULTIPLE_FAILED_HEALTH_CHECKS = \
"All kubernetes control plane pods are ready: [Fail]\n" \
"Kubernetes control plane pods not ready: kube-apiserver-controller-0"
SYSTEM_HEALTH_RESPONSE_K8S_FAILED_HEALTH_CHECKS = \
"System Health:\n" \
"All hosts are provisioned: [OK]\n" \
"All hosts are unlocked/enabled: [OK]\n" \
"All hosts have current configurations: [OK]\n" \
"All hosts are patch current: [OK]\n" \
"Ceph Storage Healthy: [OK]\n" \
"No alarms: [OK]\n" \
"All kubernetes nodes are ready: [Fail]\n" \
"All kubernetes control plane pods are ready: [OK]"
UPGRADE_STARTED = FakeUpgrade(state='started')
UPGRADE_ALARM = FakeAlarm('900.005', 'True')
HOST_LOCKED_ALARM = FakeAlarm('200.001', 'True')
class TestSwUpgradePreCheckStage(TestSwUpgradeState):
@@ -92,6 +110,7 @@ class TestSwUpgradePreCheckStage(TestSwUpgradeState):
system_values.system_mode = consts.SYSTEM_MODE_SIMPLEX
self.sysinv_client.get_system.return_value = system_values
self.sysinv_client.get_upgrades = mock.MagicMock()
self.fm_client.get_alarms = mock.MagicMock()
def test_upgrade_pre_check_subcloud_online_fresh(self):
"""Test pre check step where the subcloud is online and running N load
@@ -172,34 +191,28 @@ class TestSwUpgradePreCheckStage(TestSwUpgradeState):
availability_status=consts.AVAILABILITY_ONLINE,
deploy_status=consts.DEPLOY_STATE_MIGRATED)
self.sysinv_client.get_host_filesystem.side_effect = \
[CONTROLLER_0_HOST_FS_SCRATCH_MIN_SIZED]
self.sysinv_client.get_system_health.return_value = \
SYSTEM_HEALTH_RESPONSE_SUCCESS
# invoke the strategy state operation on the orch thread
self.worker.perform_state_action(self.strategy_step)
# verify the DB query was invoked
self.mock_db_query.assert_called()
# verify the get system health API call was invoked
self.sysinv_client.get_system_health.assert_called()
# verify the get system health API call was not invoked
self.sysinv_client.get_system_health.assert_not_called()
# verify the get host filesystem API call was invoked
self.sysinv_client.get_host_filesystem.assert_called()
# verify the get host filesystem API call was not invoked
self.sysinv_client.get_host_filesystem.assert_not_called()
# Verify the expected next state happened (activating upgrade)
self.assert_step_updated(self.strategy_step.subcloud_id,
consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
def test_upgrade_pre_check_subcloud_online_management_alarm(self):
"""Test pre check step where the subcloud is online with a mgmt alarm
def test_upgrade_pre_check_subcloud_online_host_locked_upgrade_started_mgmt_alarms(self):
"""Test pre check step where the subcloud is online, locked and upgrade has started.
The pre-check should raise an exception and transition to the failed
state when the subcloud is not ready for upgrade due to the management
affecting alarm.
The pre-check should move to the next step as the upgrade alarm can
be ignored and the host locked alarm can also be ignored if upgrade has
started.
"""
# online subcloud running N+1 load
@@ -207,9 +220,20 @@ class TestSwUpgradePreCheckStage(TestSwUpgradeState):
availability_status=consts.AVAILABILITY_ONLINE,
deploy_status=consts.DEPLOY_STATE_DONE)
# subcloud is locked
self.sysinv_client.get_host.side_effect = [CONTROLLER_0_LOCKED]
# upgrade has started
self.sysinv_client.get_upgrades.return_value = [UPGRADE_STARTED, ]
self.sysinv_client.get_system_health.return_value = \
SYSTEM_HEALTH_RESPONSE_MGMT_AFFECTING_ALARM
self.fm_client.get_alarms.return_value = [UPGRADE_ALARM, HOST_LOCKED_ALARM, ]
self.sysinv_client.get_host_filesystem.side_effect = \
[CONTROLLER_0_HOST_FS_SCRATCH_MIN_SIZED]
# invoke the strategy state operation on the orch thread
self.worker.perform_state_action(self.strategy_step)
@@ -219,6 +243,50 @@ class TestSwUpgradePreCheckStage(TestSwUpgradeState):
# verify the get system health API call was invoked
self.sysinv_client.get_system_health.assert_called()
# verify the get alarms API call was invoked
self.fm_client.get_alarms.assert_called()
# verify the get host filesystem API call was invoked
self.sysinv_client.get_host_filesystem.assert_called()
# Verify the expected next state happened (installing license)
self.assert_step_updated(self.strategy_step.subcloud_id,
consts.STRATEGY_STATE_INSTALLING_LICENSE)
def test_upgrade_pre_check_subcloud_online_host_locked_no_upgrade_mgmt_alarms(self):
"""Test pre check step where subcloud is online, locked and upgrade has not started.
The pre-check should raise an exception and transition to the failed
state as host locked alarm cannot be skipped if upgrade has
not been started.
"""
# online subcloud running N+1 load
self.mock_db_query.return_value = FakeSubcloud(
availability_status=consts.AVAILABILITY_ONLINE,
deploy_status=consts.DEPLOY_STATE_DONE)
# subcloud is locked
self.sysinv_client.get_host.side_effect = [CONTROLLER_0_LOCKED]
self.sysinv_client.get_system_health.return_value = \
SYSTEM_HEALTH_RESPONSE_MGMT_AFFECTING_ALARM
self.fm_client.get_alarms.return_value = [HOST_LOCKED_ALARM, ]
# self.fm_client.get_alarms.return_value = \
# invoke the strategy state operation on the orch thread
self.worker.perform_state_action(self.strategy_step)
# verify the DB query was invoked
self.mock_db_query.assert_called()
# verify the get system health API call was invoked
self.sysinv_client.get_system_health.assert_called()
# verify the get alarms API call was invoked
self.fm_client.get_alarms.assert_called()
# Verify the exception caused the state to go to failed
self.assert_step_updated(self.strategy_step.subcloud_id,
consts.STRATEGY_STATE_FAILED)
@@ -252,6 +320,35 @@ class TestSwUpgradePreCheckStage(TestSwUpgradeState):
self.assert_step_updated(self.strategy_step.subcloud_id,
consts.STRATEGY_STATE_FAILED)
def test_upgrade_pre_check_subcloud_online_failed_health_checks_no_alarms(self):
"""Test pre check step where the subcloud is online but is unhealthy
The pre-check should raise an exception and transition to the failed
state when the subcloud is not ready for upgrade due to some failure
other than platform alarms.
"""
# online subcloud running N+1 load
self.mock_db_query.return_value = FakeSubcloud(
availability_status=consts.AVAILABILITY_ONLINE,
deploy_status=consts.DEPLOY_STATE_DONE)
self.sysinv_client.get_system_health.return_value = \
SYSTEM_HEALTH_RESPONSE_K8S_FAILED_HEALTH_CHECKS
# invoke the strategy state operation on the orch thread
self.worker.perform_state_action(self.strategy_step)
# verify the DB query was invoked
self.mock_db_query.assert_called()
# verify the get system health API call was invoked
self.sysinv_client.get_system_health.assert_called()
# Verify the exception caused the state to go to failed
self.assert_step_updated(self.strategy_step.subcloud_id,
consts.STRATEGY_STATE_FAILED)
def test_upgrade_pre_check_subcloud_online_scratch_undersized(self):
"""Test pre check step where the subcloud is online undersized scratch


@@ -30,6 +30,7 @@ from dcmanager.tests import base
from dcmanager.tests.unit.common import fake_strategy
from dcmanager.tests.unit.common import fake_subcloud
from dcmanager.tests.unit.fakes import FakeVimClient
from dcmanager.tests.unit.orchestrator.states.fakes import FakeFmClient
from dcmanager.tests.unit.orchestrator.states.fakes import FakeKeystoneClient
from dcmanager.tests.unit.orchestrator.states.fakes import FakePatchingClient
from dcmanager.tests.unit.orchestrator.states.fakes import FakeSysinvClient
@@ -84,6 +85,13 @@ class TestSwUpdate(base.DCManagerTestCase):
self.mock_vim_client.return_value = self.vim_client
self.addCleanup(p.stop)
# Mock the fm client defined in the base state class
self.fm_client = FakeFmClient()
p = mock.patch.object(BaseState, 'get_fm_client')
self.mock_fm_client = p.start()
self.mock_fm_client.return_value = self.fm_client
self.addCleanup(p.stop)
def setup_orch_worker(self, strategy_type):
worker = None
mock_strategy_lock = mock.Mock()