distcloud/distributedcloud/dcmanager/orchestrator/states/upgrade/pre_check.py

#
# Copyright (c) 2020-2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
import copy
import re

from dccommon import consts as dccommon_consts
from dccommon.drivers.openstack.sysinv_v1 import HOST_FS_NAME_SCRATCH
from dcmanager.common import consts
from dcmanager.common.exceptions import ManualRecoveryRequiredException
from dcmanager.common.exceptions import PreCheckFailedException
from dcmanager.db import api as db_api
from dcmanager.orchestrator.states.base import BaseState
from dcmanager.orchestrator.states.upgrade.cache.cache_specifications import \
    REGION_ONE_SYSTEM_INFO_CACHE_TYPE

# These deploy states should transition to the 'upgrading' state
VALID_UPGRADE_STATES = [consts.DEPLOY_STATE_PRE_INSTALL_FAILED,
                        consts.DEPLOY_STATE_INSTALL_FAILED,
                        consts.DEPLOY_STATE_DATA_MIGRATION_FAILED, ]

# These deploy states should transition to the 'migrating_data' state
VALID_MIGRATE_DATA_STATES = [consts.DEPLOY_STATE_INSTALLED, ]

# These deploy states should transition to the 'activating_upgrade' state
VALID_ACTIVATION_STATES = [consts.DEPLOY_STATE_MIGRATED, ]

MIN_SCRATCH_SIZE_REQUIRED_GB = 16

UPGRADE_IN_PROGRESS_ALARM = '900.005'
HOST_ADMINISTRATIVELY_LOCKED_ALARM = '200.001'

ALARM_IGNORE_LIST = [UPGRADE_IN_PROGRESS_ALARM, ]


class PreCheckState(BaseState):
    """This State performs entry checks and skips to the appropriate state"""

    def __init__(self, region_name):
        super(PreCheckState, self).__init__(
            next_state=consts.STRATEGY_STATE_INSTALLING_LICENSE, region_name=region_name)

    def _check_health(self, strategy_step, subcloud_sysinv_client, subcloud_fm_client,
                      host, upgrades):

        # Check system upgrade health
        #
        # Sample output #1
        # ================
        #     Some non-management affecting alarms, all other checks passed
        #
        # System Health:
        # All hosts are provisioned: [OK]
        # All hosts are unlocked/enabled: [OK]
        # All hosts have current configurations: [OK]
        # All hosts are patch current: [OK]
        # Ceph Storage Healthy: [OK]
        # No alarms: [Fail]
        # [1] alarms found, [0] of which are management affecting
        # All kubernetes nodes are ready: [OK]
        # All kubernetes control plane pods are ready: [OK]
        # Active kubernetes version is the latest supported version: [OK]
        # No imported load found. Unable to test further
        #
        # Sample output #2
        # ================
        #     Multiple failed checks, management affecting alarms
        #
        # System Health:
        # All hosts are provisioned: [OK]
        # All hosts are unlocked/enabled: [OK]
        # All hosts have current configurations: [OK]
        # All hosts are patch current: [OK]
        # Ceph Storage Healthy: [Fail]
        # No alarms: [Fail]
        # [7] alarms found, [2] of which are management affecting
        # All kubernetes nodes are ready: [OK]
        # All kubernetes control plane pods are ready: [OK]
        # Active kubernetes version is the latest supported version: [OK]
        # No imported load found. Unable to test further

        # TODO(teewrs): Update the sysinv API to allow a list of ignored alarms
        # to be passed to the health check API. This would be much more efficient
        # than having to retrieve the alarms in a separate step.
        system_health = subcloud_sysinv_client.get_system_health_upgrade()
        fails = re.findall("\[Fail\]", system_health)
        failed_alarm_check = re.findall("No alarms: \[Fail\]", system_health)
        no_mgmt_alarms = re.findall("\[0\] of which are management affecting",
                                    system_health)

        alarm_ignore_list = copy.copy(ALARM_IGNORE_LIST)
        if (host.administrative == consts.ADMIN_LOCKED and upgrades):
            alarm_ignore_list.append(HOST_ADMINISTRATIVELY_LOCKED_ALARM)

        # Clean old error messages
        db_api.subcloud_update(
            self.context, strategy_step.subcloud_id,
            error_description=consts.ERROR_DESC_EMPTY)
        # The health conditions acceptable for upgrade are:
        # a) subcloud is completely healthy (i.e. no failed checks)
        # b) subcloud only fails alarm check and it only has non-management
        #    affecting alarm(s)
        # c) the management alarm(s) that subcloud has once upgrade has started
        #    are upgrade alarm itself and host locked alarm
        if ((len(fails) == 0) or
                (len(fails) == 1 and failed_alarm_check and no_mgmt_alarms)):
            self.info_log(strategy_step, "Health check passed.")
            return

        if not failed_alarm_check:
            # Health check failure: no alarms involved
            #
            # These could be Kubernetes or other related failure(s) which has not been been
            # converted into an alarm condition.
            error_desc_msg = ("System upgrade health check failed. \n %s" %
                              fails)
            db_api.subcloud_update(
                self.context, strategy_step.subcloud_id,
                error_description=error_desc_msg[0:consts.ERROR_DESCRIPTION_LENGTH])
            details = ("System upgrade health check failed. Please run 'system health-query-upgrade' "
                       "command on the subcloud or %s on central for details"
                       % (consts.ERROR_DESC_CMD))
            self.error_log(strategy_step, "\n" + system_health)
            raise PreCheckFailedException(
                subcloud=strategy_step.subcloud.name,
                details=details,
                )
        else:
            # Health check failure: one or more alarms
            if (upgrades and (len(fails) == len(alarm_ignore_list))):
                # Upgrade has started, previous try failed either before or after
                # host lock.
                return
            elif len(fails) == 1:
                # Healthy check failure: exclusively alarms related
                alarms = subcloud_fm_client.get_alarms()
                for alarm in alarms:
                    if alarm.alarm_id not in alarm_ignore_list:
                        if alarm.mgmt_affecting == "True":
                            error_desc_msg = ("System upgrade health check failed due to alarm %s. "
                                              "System upgrade health: \n %s" %
                                              (alarm.alarm_id, system_health))
                            db_api.subcloud_update(
                                self.context, strategy_step.subcloud_id,
                                error_description=error_desc_msg[0:consts.ERROR_DESCRIPTION_LENGTH])
                            details = ("System upgrade health check failed due to alarm %s. "
                                       "Please run 'system health-query-upgrade' "
                                       "command on the subcloud or %s on central for details." %
                                       (alarm.alarm_id, consts.ERROR_DESC_CMD))
                            self.error_log(strategy_step, "\n" + system_health)
                            raise PreCheckFailedException(
                                subcloud=strategy_step.subcloud.name,
                                details=details,
                                )
            else:
                # Multiple failures
                error_desc_msg = ("System upgrade health check failed due to multiple failures. "
                                  "Health: \n %s" %
                                  (system_health))
                db_api.subcloud_update(
                    self.context, strategy_step.subcloud_id,
                    error_description=error_desc_msg[0:consts.ERROR_DESCRIPTION_LENGTH])
                details = ("System upgrade health check failed due to multiple failures. "
                           "Please run 'system health-query-upgrade' command on the "
                           "subcloud or %s on central for details." %
                           (consts.ERROR_DESC_CMD))
                self.error_log(strategy_step, "\n" + system_health)
                raise PreCheckFailedException(
                    subcloud=strategy_step.subcloud.name,
                    details=details,
                    )

    def _check_scratch(self, strategy_step, subcloud_sysinv_client, host):
        scratch_fs = subcloud_sysinv_client.get_host_filesystem(
            host.uuid, HOST_FS_NAME_SCRATCH)
        if scratch_fs.size < MIN_SCRATCH_SIZE_REQUIRED_GB:
            details = ("Scratch filesystem size of %s does not meet "
                       "minimum required %s" %
                       (scratch_fs.size, MIN_SCRATCH_SIZE_REQUIRED_GB))
            raise PreCheckFailedException(
                subcloud=strategy_step.subcloud.name,
                details=details,
                )

    def _perform_subcloud_online_checks(self, strategy_step, subcloud_sysinv_client,
                                        subcloud_fm_client, host, upgrades):

        self._check_health(strategy_step, subcloud_sysinv_client, subcloud_fm_client,
                           host, upgrades)

        self._check_scratch(strategy_step, subcloud_sysinv_client, host)

    def perform_state_action(self, strategy_step):
        """This state will check if the subcloud is offline:

        Check the deploy_status and transfer to the correct state.
        if an unsupported deploy_status is encountered, fail the upgrade
        """

        subcloud = db_api.subcloud_get(self.context, strategy_step.subcloud.id)

        if subcloud.availability_status == dccommon_consts.AVAILABILITY_ONLINE:
            subcloud_sysinv_client = None
            try:
                subcloud_sysinv_client = self.get_sysinv_client(strategy_step.subcloud.region_name)
                subcloud_fm_client = self.get_fm_client(strategy_step.subcloud.region_name)
            except Exception:
                # if getting the token times out, the orchestrator may have
                # restarted and subcloud may be offline; so will attempt
                # to use the persisted values
                message = ("Subcloud %s failed to get subcloud client" %
                           strategy_step.subcloud.name)
                self.error_log(strategy_step, message)
                error_message = "deploy state: %s" % subcloud.deploy_status
                raise ManualRecoveryRequiredException(
                    subcloud=strategy_step.subcloud.name,
                    error_message=error_message)

            host = subcloud_sysinv_client.get_host("controller-0")
            subcloud_type = self.get_sysinv_client(
                strategy_step.subcloud.region_name).get_system().system_mode

            upgrades = subcloud_sysinv_client.get_upgrades()
            if subcloud_type == consts.SYSTEM_MODE_SIMPLEX:
                # Check presence of data_install values.  These are managed
                # semantically on subcloud add or update
                if not subcloud.data_install:
                    details = ("Data install values are missing and must be updated "
                               "via dcmanager subcloud update")
                    raise PreCheckFailedException(
                        subcloud=strategy_step.subcloud.name,
                        details=details)

                if (host.administrative == consts.ADMIN_LOCKED and
                        (subcloud.deploy_status == consts.DEPLOY_STATE_INSTALL_FAILED or
                         subcloud.deploy_status == consts.DEPLOY_STATE_PRE_INSTALL_FAILED)):
                    # If the subcloud is online but its deploy state is pre-install-failed
                    # or install-failed and the subcloud host is locked, the upgrading
                    # simplex step must have failed early in the previous upgrade attempt.
                    # The pre-check should transition directly to upgrading simplex step in the
                    # retry.
                    self.override_next_state(consts.STRATEGY_STATE_UPGRADING_SIMPLEX)
                    return self.next_state

                # Skip subcloud online checks if the subcloud deploy status is
                # "migrated".
                if subcloud.deploy_status == consts.DEPLOY_STATE_MIGRATED:
                    self.info_log(strategy_step, "Online subcloud checks skipped.")
                else:
                    self._perform_subcloud_online_checks(strategy_step,
                                                         subcloud_sysinv_client,
                                                         subcloud_fm_client,
                                                         host, upgrades)

                if subcloud.deploy_status == consts.DEPLOY_STATE_MIGRATED:
                    # If the subcloud has completed data migration, advance directly
                    # to activating upgrade step.
                    self.override_next_state(consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
                elif subcloud.deploy_status == consts.DEPLOY_STATE_DATA_MIGRATION_FAILED:
                    # If the subcloud deploy status is data-migration-failed but
                    # it is online and has passed subcloud online checks, it must have
                    # timed out while waiting for the subcloud to reboot previously and
                    # has succesfully been unlocked since. Update the subcloud deploy
                    # status and advance to activating upgrade step.
                    db_api.subcloud_update(
                        self.context, strategy_step.subcloud_id,
                        deploy_status=consts.DEPLOY_STATE_MIGRATED)
                    self.override_next_state(consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
            else:
                # Duplex case
                if upgrades:
                    # If upgrade has started, skip subcloud online checks
                    self.info_log(strategy_step, "Online subcloud checks skipped.")
                    upgrade_state = upgrades[0].state
                    if(upgrade_state == consts.UPGRADE_STATE_DATA_MIGRATION_FAILED or
                       upgrade_state == consts.UPGRADE_STATE_DATA_MIGRATION):
                        error_message = "upgrade state: %s" % upgrade_state
                        raise ManualRecoveryRequiredException(
                            subcloud=strategy_step.subcloud.name,
                            error_message=error_message)
                    elif(upgrade_state == consts.UPGRADE_STATE_UPGRADING_CONTROLLERS or
                         upgrade_state == consts.UPGRADE_STATE_DATA_MIGRATION_COMPLETE):
                        # At this point the subcloud is duplex, deploy state is complete
                        # and "system upgrade-show" on the subcloud indicates that the
                        # upgrade state is "upgrading-controllers".
                        # If controller-1 is locked then we unlock it,
                        # if controller-0 is active we need to swact
                        # else we can proceed to create the VIM strategy.
                        controller_1_host = subcloud_sysinv_client.get_host("controller-1")
                        if controller_1_host.administrative == consts.ADMIN_LOCKED:
                            self.override_next_state(
                                consts.STRATEGY_STATE_UNLOCKING_CONTROLLER_1)
                        elif host.capabilities.get('Personality') == consts.PERSONALITY_CONTROLLER_ACTIVE:
                            self.override_next_state(
                                consts.STRATEGY_STATE_SWACTING_TO_CONTROLLER_1)
                        else:
                            self.override_next_state(
                                consts.STRATEGY_STATE_CREATING_VIM_UPGRADE_STRATEGY)
                    elif (upgrade_state == consts.UPGRADE_STATE_UPGRADING_HOSTS):
                        # At this point the subcloud is duplex, deploy state is complete
                        # and "system upgrade-show" on the subcloud indicates that the
                        # upgrade state is "upgrading-hosts".
                        # If both subcloud hosts are upgraded to the newer load,
                        # we resume the state machine from activate upgrade state.
                        # Otherwise, we resume from create the VIM strategy state.

                        # determine the version of the system controller in region one
                        target_version = \
                            self._read_from_cache(REGION_ONE_SYSTEM_INFO_CACHE_TYPE)\
                                .software_version

                        all_hosts_upgraded = True
                        subcloud_hosts = self.get_sysinv_client(
                            strategy_step.subcloud.region_name).get_hosts()
                        for subcloud_host in subcloud_hosts:
                            if(subcloud_host.software_load != target_version or
                               subcloud_host.administrative == consts.ADMIN_LOCKED or
                               subcloud_host.operational == consts.OPERATIONAL_DISABLED):
                                all_hosts_upgraded = False
                                self.override_next_state(
                                    consts.STRATEGY_STATE_CREATING_VIM_UPGRADE_STRATEGY)

                        if all_hosts_upgraded:
                            if host.capabilities.get('Personality') == consts.PERSONALITY_CONTROLLER_ACTIVE:
                                self.override_next_state(
                                    consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
                            else:
                                self.override_next_state(
                                    consts.STRATEGY_STATE_SWACTING_TO_CONTROLLER_0)
                    elif (upgrade_state == consts.UPGRADE_STATE_ACTIVATION_FAILED):
                        if(host.capabilities.get('Personality') == consts.PERSONALITY_CONTROLLER_ACTIVE):
                            self.override_next_state(
                                consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
                        else:
                            self.override_next_state(
                                consts.STRATEGY_STATE_SWACTING_TO_CONTROLLER_0)
                    elif (upgrade_state == consts.UPGRADE_STATE_ACTIVATION_COMPLETE):
                        self.override_next_state(consts.STRATEGY_STATE_COMPLETING_UPGRADE)

                else:
                    # Perform subcloud online check for duplex and proceed to the next step
                    # (i.e. installing license)
                    self._perform_subcloud_online_checks(strategy_step,
                                                         subcloud_sysinv_client,
                                                         subcloud_fm_client,
                                                         host, upgrades)
            return self.next_state

        # If it gets here, the subcloud must be offline and is a simplex
        if subcloud.deploy_status in VALID_UPGRADE_STATES:
            if not subcloud.data_install:
                details = ("Data install values are missing and must be updated "
                           "via dcmanager subcloud update")
                raise PreCheckFailedException(
                    subcloud=strategy_step.subcloud.name,
                    details=details)

            self.override_next_state(consts.STRATEGY_STATE_UPGRADING_SIMPLEX)
            return self.next_state

        elif subcloud.deploy_status in VALID_MIGRATE_DATA_STATES:
            self.override_next_state(consts.STRATEGY_STATE_MIGRATING_DATA)
            return self.next_state

        elif subcloud.deploy_status in VALID_ACTIVATION_STATES:
            self.override_next_state(consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
            return self.next_state

        # FAIL: We are offline and encountered an un-recoverable deploy status
        self.info_log(strategy_step,
                      "Un-handled deploy_status: %s" % subcloud.deploy_status)
        error_message = "deploy state: %s" % subcloud.deploy_status
        raise ManualRecoveryRequiredException(
            subcloud=strategy_step.subcloud.name,
            error_message=error_message)