#
# Copyright (c) 2020-2024 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#

import copy
import re

from dccommon import consts as dccommon_consts
from dccommon.drivers.openstack.sysinv_v1 import HOST_FS_NAME_SCRATCH
from dcmanager.common import consts
from dcmanager.common.exceptions import ManualRecoveryRequiredException
from dcmanager.common.exceptions import PreCheckFailedException
from dcmanager.db import api as db_api
from dcmanager.orchestrator.states.base import BaseState
from dcmanager.orchestrator.states.upgrade.cache.cache_specifications import \
    REGION_ONE_SYSTEM_INFO_CACHE_TYPE

# These deploy states should transition to the 'upgrading' state
VALID_UPGRADE_STATES = [consts.DEPLOY_STATE_PRE_INSTALL_FAILED,
                        consts.DEPLOY_STATE_INSTALL_FAILED,
                        consts.DEPLOY_STATE_DATA_MIGRATION_FAILED, ]

# These deploy states should transition to the 'migrating_data' state
VALID_MIGRATE_DATA_STATES = [consts.DEPLOY_STATE_INSTALLED, ]

# These deploy states should transition to the 'activating_upgrade' state
VALID_ACTIVATION_STATES = [consts.DEPLOY_STATE_MIGRATED, ]

MIN_SCRATCH_SIZE_REQUIRED_GB = 16

UPGRADE_IN_PROGRESS_ALARM = '900.005'
HOST_ADMINISTRATIVELY_LOCKED_ALARM = '200.001'

ALARM_IGNORE_LIST = [UPGRADE_IN_PROGRESS_ALARM, ]


class PreCheckState(BaseState):
    """This State performs entry checks and skips to the appropriate state"""
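
    # The default next state is installing the license; perform_state_action
    # may override this based on the subcloud's deploy and upgrade state.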
    def __init__(self, region_name):
        super(PreCheckState, self).__init__(
            next_state=consts.STRATEGY_STATE_INSTALLING_LICENSE,
            region_name=region_name)

    def _check_health(
            self, strategy_step, subcloud_sysinv_client, subcloud_fm_client,
            host, upgrades):

        # Check system upgrade health
        #
        # Sample output #1
        # ================
        # Some non-management affecting alarms, all other checks passed
        #
        # System Health:
        # All hosts are provisioned: [OK]
        # All hosts are unlocked/enabled: [OK]
        # All hosts have current configurations: [OK]
        # All hosts are patch current: [OK]
        # Ceph Storage Healthy: [OK]
        # No alarms: [Fail]
        # [1] alarms found, [0] of which are management affecting
        # All kubernetes nodes are ready: [OK]
        # All kubernetes control plane pods are ready: [OK]
        # Active kubernetes version is the latest supported version: [OK]
        # No imported load found. Unable to test further
        #
        # Sample output #2
        # ================
        # Multiple failed checks, management affecting alarms
        #
        # System Health:
        # All hosts are provisioned: [OK]
        # All hosts are unlocked/enabled: [OK]
        # All hosts have current configurations: [OK]
        # All hosts are patch current: [OK]
        # Ceph Storage Healthy: [Fail]
        # No alarms: [Fail]
        # [7] alarms found, [2] of which are management affecting
        # All kubernetes nodes are ready: [OK]
        # All kubernetes control plane pods are ready: [OK]
        # Active kubernetes version is the latest supported version: [OK]
        # No imported load found. Unable to test further

        # TODO(teewrs): Update the sysinv API to allow a list of ignored
        # alarms to be passed to the health check API. This would be much
        # more efficient than having to retrieve the alarms in a separate
        # step.
        system_health = subcloud_sysinv_client.get_system_health_upgrade()
        fails = re.findall(r"\[Fail\]", system_health)
        failed_alarm_check = re.findall(r"No alarms: \[Fail\]", system_health)
        no_mgmt_alarms = re.findall(r"\[0\] of which are management affecting",
                                    system_health)
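
        # For example, against sample output #1 above: 'fails' is
        # ['[Fail]'] (a single failed check), 'failed_alarm_check' is
        # non-empty (that one failure is the alarm check itself) and
        # 'no_mgmt_alarms' is non-empty (none of the alarms are management
        # affecting), which satisfies acceptable condition (b) below.
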
        alarm_ignore_list = copy.copy(ALARM_IGNORE_LIST)
        if (host.administrative == consts.ADMIN_LOCKED and upgrades):
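            # A locked controller is expected partway through an upgrade, so
            # the host locked alarm must not cause the pre-check to fail.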
            alarm_ignore_list.append(HOST_ADMINISTRATIVELY_LOCKED_ALARM)

        # Clean old error messages
        db_api.subcloud_update(
            self.context, strategy_step.subcloud_id,
            error_description=consts.ERROR_DESC_EMPTY)

        # The health conditions acceptable for upgrade are:
        # a) the subcloud is completely healthy (i.e. no failed checks)
        # b) the subcloud only fails the alarm check and it only has
        #    non-management affecting alarm(s)
        # c) once the upgrade has started, the only management affecting
        #    alarms the subcloud has are the upgrade alarm itself and the
        #    host locked alarm
        if ((len(fails) == 0) or
                (len(fails) == 1 and failed_alarm_check and no_mgmt_alarms)):
            self.info_log(strategy_step, "Health check passed.")
            return

        if not failed_alarm_check:
            # Health check failure: no alarms involved
            #
            # These could be Kubernetes or other related failure(s) which
            # have not been converted into an alarm condition.
error_desc_msg = ("System upgrade health check failed. \n %s" %
|
|
fails)
|
|
db_api.subcloud_update(
|
|
self.context, strategy_step.subcloud_id,
|
|
error_description=error_desc_msg[0:consts.ERROR_DESCRIPTION_LENGTH])
|
|
details = (
|
|
"System upgrade health check failed. Please run "
|
|
"'system health-query-upgrade' command on the subcloud or %s "
|
|
"on central for details" % (consts.ERROR_DESC_CMD))
|
|
self.error_log(strategy_step, "\n" + system_health)
|
|
raise PreCheckFailedException(
|
|
subcloud=strategy_step.subcloud.name,
|
|
details=details,
|
|
)
|
|
else:
|
|
# Health check failure: one or more alarms
|
|
if (upgrades and (len(fails) == len(alarm_ignore_list))):
|
|
# Upgrade has started, previous try failed either before or after
|
|
# host lock.
|
|
return
|
|
            elif len(fails) == 1:
                # Health check failure: exclusively alarm related
                alarms = subcloud_fm_client.get_alarms()
                for alarm in alarms:
                    if alarm.alarm_id not in alarm_ignore_list:
                        if alarm.mgmt_affecting == "True":
                            error_desc_msg = (
                                "System upgrade health check failed due to "
                                "alarm %s. System upgrade health: \n %s" %
                                (alarm.alarm_id, system_health))
                            db_api.subcloud_update(
                                self.context, strategy_step.subcloud_id,
                                error_description=error_desc_msg[
                                    0:consts.ERROR_DESCRIPTION_LENGTH])
                            details = (
                                "System upgrade health check failed due to "
                                "alarm %s. Please run 'system "
                                "health-query-upgrade' command on the "
                                "subcloud or %s on central for details." %
                                (alarm.alarm_id, consts.ERROR_DESC_CMD))
                            self.error_log(strategy_step,
                                           "\n" + system_health)
                            raise PreCheckFailedException(
                                subcloud=strategy_step.subcloud.name,
                                details=details,
                            )
            else:
                # Multiple failures
                error_desc_msg = (
                    "System upgrade health check failed due to multiple "
                    "failures. Health: \n %s" % system_health)
                db_api.subcloud_update(
                    self.context, strategy_step.subcloud_id,
                    error_description=error_desc_msg[
                        0:consts.ERROR_DESCRIPTION_LENGTH])
                details = (
                    "System upgrade health check failed due to multiple "
                    "failures. Please run 'system health-query-upgrade' "
                    "command on the subcloud or %s on central for "
                    "details." % consts.ERROR_DESC_CMD)
                self.error_log(strategy_step, "\n" + system_health)
                raise PreCheckFailedException(
                    subcloud=strategy_step.subcloud.name,
                    details=details,
                )

    def _check_scratch(self, strategy_step, subcloud_sysinv_client, host):
        scratch_fs = subcloud_sysinv_client.get_host_filesystem(
            host.uuid, HOST_FS_NAME_SCRATCH)
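        # Note: sizes are compared in gigabytes, consistent with
        # MIN_SCRATCH_SIZE_REQUIRED_GB; presumably the upgrade needs at
        # least this much /scratch space to stage its files.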
        if scratch_fs.size < MIN_SCRATCH_SIZE_REQUIRED_GB:
            details = ("Scratch filesystem size of %s does not meet "
                       "minimum required %s" %
                       (scratch_fs.size, MIN_SCRATCH_SIZE_REQUIRED_GB))
            raise PreCheckFailedException(
                subcloud=strategy_step.subcloud.name,
                details=details,
            )

    def _perform_subcloud_online_checks(self, strategy_step,
                                        subcloud_sysinv_client,
                                        subcloud_fm_client, host, upgrades):
        self._check_health(strategy_step, subcloud_sysinv_client,
                           subcloud_fm_client, host, upgrades)
        self._check_scratch(strategy_step, subcloud_sysinv_client, host)

    def perform_state_action(self, strategy_step):
        """Check the subcloud state and skip to the appropriate next state.

        Check the deploy_status and transfer to the correct state.
        If an unsupported deploy_status is encountered, fail the upgrade.
        """
        subcloud = db_api.subcloud_get(self.context, strategy_step.subcloud.id)

        if subcloud.availability_status == dccommon_consts.AVAILABILITY_ONLINE:
            subcloud_sysinv_client = None
            try:
                subcloud_sysinv_client = \
                    self.get_sysinv_client(strategy_step.subcloud.region_name)
                subcloud_fm_client = \
                    self.get_fm_client(strategy_step.subcloud.region_name)
            except Exception:
                # If getting the token times out, the orchestrator may have
                # restarted and the subcloud may be offline; fall back to
                # the persisted deploy status and require manual recovery.
                message = ("Subcloud %s failed to get subcloud client" %
                           strategy_step.subcloud.name)
                self.error_log(strategy_step, message)
                error_message = "deploy state: %s" % subcloud.deploy_status
                raise ManualRecoveryRequiredException(
                    subcloud=strategy_step.subcloud.name,
                    error_message=error_message)
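
            # controller-0's state drives the simplex handling below, while
            # system_mode distinguishes simplex from duplex subclouds.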
host = subcloud_sysinv_client.get_host("controller-0")
|
|
subcloud_type = self.get_sysinv_client(
|
|
strategy_step.subcloud.region_name).get_system().system_mode
|
|
|
|
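            # A non-empty upgrades list means 'system upgrade-start' has
            # already been run on this subcloud.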
            upgrades = subcloud_sysinv_client.get_upgrades()
            if subcloud_type == consts.SYSTEM_MODE_SIMPLEX:
                # Check presence of data_install values. These are managed
                # semantically on subcloud add or update.
                if not subcloud.data_install:
                    details = ("Data install values are missing and must be "
                               "updated via dcmanager subcloud update")
                    raise PreCheckFailedException(
                        subcloud=strategy_step.subcloud.name,
                        details=details)

                sc_status = subcloud.deploy_status
                if (host.administrative == consts.ADMIN_LOCKED and
                        (sc_status == consts.DEPLOY_STATE_INSTALL_FAILED or
                         sc_status == consts.DEPLOY_STATE_PRE_INSTALL_FAILED)):
                    # If the subcloud is online but its deploy state is
                    # pre-install-failed or install-failed and the subcloud
                    # host is locked, the upgrading simplex step must have
                    # failed early in the previous upgrade attempt. The
                    # pre-check should transition directly to the upgrading
                    # simplex step in the retry.
                    self.override_next_state(
                        consts.STRATEGY_STATE_UPGRADING_SIMPLEX)
                    return self.next_state

                # Skip the subcloud online checks if the subcloud deploy
                # status is either "migrated" or "upgrade-activated".
                if subcloud.deploy_status in [
                        consts.DEPLOY_STATE_MIGRATED,
                        consts.DEPLOY_STATE_UPGRADE_ACTIVATED]:
                    self.info_log(strategy_step,
                                  "Online subcloud checks skipped.")
                else:
                    self._perform_subcloud_online_checks(
                        strategy_step, subcloud_sysinv_client,
                        subcloud_fm_client, host, upgrades)

                if subcloud.deploy_status == \
                        consts.DEPLOY_STATE_UPGRADE_ACTIVATED:
                    # If the subcloud has completed upgrade activation,
                    # advance directly to the completing step.
                    self.override_next_state(
                        consts.STRATEGY_STATE_COMPLETING_UPGRADE)
                elif subcloud.deploy_status == \
                        consts.DEPLOY_STATE_DATA_MIGRATION_FAILED:
                    # If the subcloud deploy status is data-migration-failed
                    # but it is online and has passed the subcloud online
                    # checks, it must have timed out while waiting for the
                    # subcloud to unlock previously and has successfully been
                    # unlocked since. Update the subcloud deploy status and
                    # advance to the activating upgrade step.
                    db_api.subcloud_update(
                        self.context, strategy_step.subcloud_id,
                        deploy_status=consts.DEPLOY_STATE_MIGRATED)
                    self.override_next_state(
                        consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
                elif subcloud.deploy_status == consts.DEPLOY_STATE_MIGRATED:
                    # If the subcloud deploy status is migrated but it is
                    # online, it must have undergone 2 upgrade attempts:
                    # - in the 1st upgrade attempt: the strategy timed out
                    #   while waiting for the subcloud to unlock
                    # - in the 2nd upgrade attempt: the subcloud was unlocked
                    #   successfully (with or without manual interventions)
                    #   but failed to activate.
                    # Advance to the activating upgrade step so activation
                    # can be retried after the manual intervention.
                    self.override_next_state(
                        consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
            else:
                # Duplex case
                if upgrades:
                    # If the upgrade has started, skip the subcloud online
                    # checks.
                    self.info_log(strategy_step,
                                  "Online subcloud checks skipped.")
                    upgrade_state = upgrades[0].state
                    controllers_state = \
                        consts.UPGRADE_STATE_UPGRADING_CONTROLLERS
                    migration_complete = \
                        consts.UPGRADE_STATE_DATA_MIGRATION_COMPLETE

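                    # Dispatch on the subcloud's reported upgrade state to
                    # resume orchestration at the appropriate step.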
                    if (upgrade_state ==
                            consts.UPGRADE_STATE_DATA_MIGRATION_FAILED or
                            upgrade_state ==
                            consts.UPGRADE_STATE_DATA_MIGRATION):
                        error_message = "upgrade state: %s" % upgrade_state
                        raise ManualRecoveryRequiredException(
                            subcloud=strategy_step.subcloud.name,
                            error_message=error_message)
                    elif (upgrade_state == controllers_state or
                            upgrade_state == migration_complete):
                        # At this point the subcloud is duplex, the deploy
                        # state is complete and "system upgrade-show" on the
                        # subcloud indicates that the upgrade state is
                        # "upgrading-controllers".
                        # If controller-1 is locked then we unlock it;
                        # if controller-0 is active we need to swact;
                        # else we can proceed to create the VIM strategy.
                        controller_1_host = subcloud_sysinv_client.get_host(
                            "controller-1")
                        if controller_1_host.administrative == \
                                consts.ADMIN_LOCKED:
                            self.override_next_state(
                                consts.STRATEGY_STATE_UNLOCKING_CONTROLLER_1)
                        elif host.capabilities.get('Personality') == \
                                consts.PERSONALITY_CONTROLLER_ACTIVE:
                            self.override_next_state(
                                consts.STRATEGY_STATE_SWACTING_TO_CONTROLLER_1)
                        else:
                            self.override_next_state(
                                consts.STRATEGY_STATE_CREATING_VIM_UPGRADE_STRATEGY)
                    elif upgrade_state == consts.UPGRADE_STATE_UPGRADING_HOSTS:
                        # At this point the subcloud is duplex, the deploy
                        # state is complete and "system upgrade-show" on the
                        # subcloud indicates that the upgrade state is
                        # "upgrading-hosts".
                        # If both subcloud hosts are upgraded to the newer
                        # load, we resume the state machine from the activate
                        # upgrade state. Otherwise, we resume from the create
                        # VIM strategy state.

                        # Determine the version of the system controller in
                        # RegionOne.
                        target_version = self._read_from_cache(
                            REGION_ONE_SYSTEM_INFO_CACHE_TYPE).software_version

                        all_hosts_upgraded = True
                        subcloud_hosts = self.get_sysinv_client(
                            strategy_step.subcloud.region_name).get_hosts()
                        for subcloud_host in subcloud_hosts:
                            is_locked = (subcloud_host.administrative ==
                                         consts.ADMIN_LOCKED)
                            is_disabled = (subcloud_host.operational ==
                                           consts.OPERATIONAL_DISABLED)
                            create_vim_state = \
                                consts.STRATEGY_STATE_CREATING_VIM_UPGRADE_STRATEGY
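                            # A host that is still locked or disabled has not
                            # finished upgrading, even if it already reports
                            # the target load.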
                            if (subcloud_host.software_load != target_version
                                    or is_locked or is_disabled):
                                all_hosts_upgraded = False
                                self.override_next_state(create_vim_state)

                        if all_hosts_upgraded:
                            if host.capabilities.get('Personality') == \
                                    consts.PERSONALITY_CONTROLLER_ACTIVE:
                                self.override_next_state(
                                    consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
                            else:
                                self.override_next_state(
                                    consts.STRATEGY_STATE_SWACTING_TO_CONTROLLER_0)
                    elif upgrade_state == \
                            consts.UPGRADE_STATE_ACTIVATION_FAILED:
                        if (host.capabilities.get('Personality') ==
                                consts.PERSONALITY_CONTROLLER_ACTIVE):
                            self.override_next_state(
                                consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
                        else:
                            self.override_next_state(
                                consts.STRATEGY_STATE_SWACTING_TO_CONTROLLER_0)
                    elif upgrade_state == \
                            consts.UPGRADE_STATE_ACTIVATION_COMPLETE:
                        self.override_next_state(
                            consts.STRATEGY_STATE_COMPLETING_UPGRADE)

                else:
                    # Perform the subcloud online checks for duplex and
                    # proceed to the next step (i.e. installing license).
                    self._perform_subcloud_online_checks(
                        strategy_step, subcloud_sysinv_client,
                        subcloud_fm_client, host, upgrades)
            return self.next_state

        # If it gets here, the subcloud must be offline and a simplex
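        # (An offline duplex subcloud will not match any of the valid states
        # below and falls through to the manual recovery exception.)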
        if subcloud.deploy_status in VALID_UPGRADE_STATES:
            if not subcloud.data_install:
                details = ("Data install values are missing and must be "
                           "updated via dcmanager subcloud update")
                raise PreCheckFailedException(
                    subcloud=strategy_step.subcloud.name,
                    details=details)

            self.override_next_state(consts.STRATEGY_STATE_UPGRADING_SIMPLEX)
            return self.next_state

        elif subcloud.deploy_status in VALID_MIGRATE_DATA_STATES:
            self.override_next_state(consts.STRATEGY_STATE_MIGRATING_DATA)
            return self.next_state

        elif subcloud.deploy_status in VALID_ACTIVATION_STATES:
            self.override_next_state(consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
            return self.next_state

        # FAIL: We are offline and encountered an unrecoverable deploy status
        self.info_log(strategy_step,
                      "Unhandled deploy_status: %s" % subcloud.deploy_status)
        error_message = "deploy state: %s" % subcloud.deploy_status
        raise ManualRecoveryRequiredException(
            subcloud=strategy_step.subcloud.name,
            error_message=error_message)