Merge "Enhance error reporting on subcloud upgrade"

This commit is contained in:
Zuul 2022-12-07 14:17:15 +00:00 committed by Gerrit Code Review
commit b6badeaa4d
9 changed files with 120 additions and 40 deletions

View File

@ -31,6 +31,7 @@ from dccommon.drivers.openstack.sysinv_v1 import SysinvClient
from dccommon import exceptions
from dccommon import install_consts
from dccommon import utils as common_utils
from dcmanager.common import consts as common_consts
from dcmanager.common import utils
LOG = logging.getLogger(__name__)
@ -618,7 +619,7 @@ class SubcloudInstall(object):
# for cleanup on process restart/SWACT.
common_utils.run_playbook(log_file, install_command)
except exceptions.PlaybookExecutionFailed:
msg = ("Failed to install the subcloud %s, check individual "
"log at %s for detailed output."
% (self.name, log_file))
msg = ("Failed to install %s, check individual "
"log at %s or run %s for details"
% (self.name, log_file, common_consts.ERROR_DESC_CMD))
raise Exception(msg)

View File

@ -184,6 +184,7 @@ DEPLOY_STATE_DONE = 'complete'
# Subcloud errors
ERROR_DESC_EMPTY = 'No errors present'
ERROR_DESC_CMD = 'dcmanager subcloud errors <subcloud-name>'
# error_description max length
ERROR_DESCRIPTION_LENGTH = 2048

View File

@ -178,7 +178,7 @@ class CertificateUploadError(DCManagerException):
class LicenseInstallError(DCManagerException):
message = _("Error while installing license on subcloud: %(subcloud_id)s")
message = _("Error while installing license on subcloud: %(subcloud_id)s. %(error_message)s")
class LicenseMissingError(DCManagerException):

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2020-2021 Wind River Systems, Inc.
# Copyright (c) 2020-2022 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -97,8 +97,15 @@ class CompletingUpgradeState(BaseState):
# invoke the API 'upgrade-complete'
# This is a partially blocking call that raises exception on failure.
# We will re-attempt even if that failure is encountered
self._upgrade_complete(strategy_step)
try:
message = self._upgrade_complete(strategy_step)
except Exception as e:
msg = ("Failed to complete upgrade. %s" %
str(e))
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
error_description=msg[0:consts.ERROR_DESCRIPTION_LENGTH])
raise
# 'completion' deletes the upgrade. Need to loop until it is deleted
counter = 0
while True:
@ -112,7 +119,12 @@ class CompletingUpgradeState(BaseState):
break
counter += 1
if counter >= self.max_queries:
raise Exception("Timeout waiting for completion to complete")
msg = ("Timeout waiting for completion to complete: %s:" %
message)
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
error_description=msg[0:consts.ERROR_DESCRIPTION_LENGTH])
raise Exception(msg)
time.sleep(self.sleep_duration)
# When we return from this method without throwing an exception, the

View File

@ -11,6 +11,7 @@ from dcmanager.common import utils
from dcmanager.common.exceptions import StrategyStoppedException
from dcmanager.common.exceptions import VaultLoadMissingError
from dcmanager.db import api as db_api
from dcmanager.orchestrator.states.base import BaseState
from dcmanager.orchestrator.states.upgrade.cache.cache_specifications import \
REGION_ONE_SYSTEM_INFO_CACHE_TYPE
@ -155,11 +156,20 @@ class ImportingLoadState(BaseState):
# Send only the required fields
creation_keys = ['software_version', 'compatible_version', 'required_patches']
target_load = {key: target_load[key] for key in creation_keys}
load = self.get_sysinv_client(
strategy_step.subcloud.name).import_load_metadata(target_load)
self.info_log(strategy_step,
"Load: %s is now: %s" % (
load.software_version, load.state))
try:
load = self.get_sysinv_client(
strategy_step.subcloud.name).import_load_metadata(target_load)
self.info_log(strategy_step,
"Load: %s is now: %s" % (
load.software_version, load.state))
except Exception as e:
msg = ("Failed to import load metadata. %s" %
str(e))
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
error_description=msg[0:consts.ERROR_DESCRIPTION_LENGTH])
self.error_log(strategy_step, msg)
raise
else:
while True:
# If event handler stop has been triggered, fail the state

View File

@ -6,6 +6,7 @@
from dccommon import consts as dccommon_consts
from dcmanager.common import consts
from dcmanager.common import exceptions
from dcmanager.db import api as db_api
from dcmanager.orchestrator.states.base import BaseState
from dcmanager.orchestrator.states.upgrade.cache.cache_specifications import \
REGION_ONE_LICENSE_CACHE_TYPE
@ -51,8 +52,15 @@ class InstallingLicenseState(BaseState):
return self.next_state
else:
# An unexpected error occurred querying the license
message = ('An unexpected error occurred querying the license %s. Detail: %s' %
(dccommon_consts.SYSTEM_CONTROLLER_NAME,
target_error))
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
error_description=message[0:consts.ERROR_DESCRIPTION_LENGTH])
raise exceptions.LicenseInstallError(
subcloud_id=dccommon_consts.SYSTEM_CONTROLLER_NAME)
subcloud_id=dccommon_consts.SYSTEM_CONTROLLER_NAME,
error_message=target_error)
# retrieve the keystone session for the subcloud and query its license
subcloud_sysinv_client = \
@ -76,8 +84,17 @@ class InstallingLicenseState(BaseState):
install_rc = subcloud_sysinv_client.install_license(target_license)
install_error = install_rc.get('error')
if len(install_error) != 0:
# Save the error response from sysinv into the subcloud error description, and
# pass the same response to the exception so it appears in the strategy_step details.
message = ('Error installing license on subcloud %s. Detail: %s' %
(strategy_step.subcloud.name,
install_error))
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
error_description=message[0:consts.ERROR_DESCRIPTION_LENGTH])
raise exceptions.LicenseInstallError(
subcloud_id=strategy_step.subcloud_id)
subcloud_id=strategy_step.subcloud_id,
error_message=install_error)
# The license has been successfully installed. Move to the next stage
self.info_log(strategy_step, "License installed.")

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2020-2021 Wind River Systems, Inc.
# Copyright (c) 2020-2022 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -10,6 +10,7 @@ from dccommon.exceptions import PlaybookExecutionFailed
from dccommon.utils import run_playbook
from dcmanager.common import consts
from dcmanager.common.exceptions import StrategyStoppedException
from dcmanager.common import utils
from dcmanager.db import api as db_api
from dcmanager.orchestrator.states.base import BaseState
@ -32,16 +33,14 @@ DEFAULT_API_SLEEP = 60
DEFAULT_ANSIBLE_SLEEP = 180
def migrate_subcloud_data(subcloud_name, migrate_command):
log_file = os.path.join(consts.DC_ANSIBLE_LOG_DIR, subcloud_name) + \
'_playbook_output.log'
def migrate_subcloud_data(migrate_command, log_file):
try:
run_playbook(log_file, migrate_command)
except PlaybookExecutionFailed:
msg = ("Failed to migrate data for subcloud %s, check individual "
"log at %s for detailed output."
% (subcloud_name, log_file))
raise Exception(msg)
msg_orch = ("Failed to migrate data, check individual "
"log at %s or run %s for details"
% (log_file, consts.ERROR_DESC_CMD))
raise Exception(msg_orch)
class MigratingDataState(BaseState):
@ -142,7 +141,8 @@ class MigratingDataState(BaseState):
ansible_subcloud_inventory_file = os.path.join(
consts.ANSIBLE_OVERRIDES_PATH,
strategy_step.subcloud.name + consts.INVENTORY_FILE_POSTFIX)
log_file = os.path.join(consts.DC_ANSIBLE_LOG_DIR, subcloud.name) + \
'_playbook_output.log'
# Send skip_patching=true to prevent the playbook from applying any patches present in the
# upgrade_data. All the required patches will be included in the generated install iso.
data_migrating_cmd = [
@ -152,12 +152,17 @@ class MigratingDataState(BaseState):
% (consts.TEMP_SYSADMIN_PASSWORD, consts.TEMP_SYSADMIN_PASSWORD)]
try:
migrate_subcloud_data(strategy_step.subcloud.name,
data_migrating_cmd)
migrate_subcloud_data(data_migrating_cmd, log_file)
except Exception as e:
# Two error messages: one for subcloud error description and logs and
# one for orchestrator strategy_step detail (shorter than the previous).
msg_subcloud = utils.find_ansible_error_msg(
strategy_step.subcloud.name, log_file, consts.DEPLOY_STATE_MIGRATING_DATA)
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
deploy_status=consts.DEPLOY_STATE_DATA_MIGRATION_FAILED)
deploy_status=consts.DEPLOY_STATE_DATA_MIGRATION_FAILED,
error_description=msg_subcloud[0:consts.ERROR_DESCRIPTION_LENGTH])
self.error_log(strategy_step, msg_subcloud)
self.error_log(strategy_step, str(e))
raise

View File

@ -90,6 +90,10 @@ class PreCheckState(BaseState):
if (host.administrative == consts.ADMIN_LOCKED and upgrades):
alarm_ignore_list.append(HOST_ADMINISTRATIVELY_LOCKED_ALARM)
# Clean old error messages
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
error_description=consts.ERROR_DESC_EMPTY)
# The health conditions acceptable for upgrade are:
# a) subcloud is completely healthy (i.e. no failed checks)
# b) subcloud only fails alarm check and it only has non-management
@ -106,8 +110,14 @@ class PreCheckState(BaseState):
#
# These could be Kubernetes or other related failure(s) which have not been
# converted into an alarm condition.
details = "System health check failed. Please run 'system health-query' " \
"command on the subcloud for more details."
error_desc_msg = ("System health check failed. \n %s" %
fails)
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
error_description=error_desc_msg[0:consts.ERROR_DESCRIPTION_LENGTH])
details = ("System health check failed. Please run 'system health-query' "
"command on the subcloud or %s on central for details"
% (consts.ERROR_DESC_CMD))
self.error_log(strategy_step, "\n" + system_health)
raise PreCheckFailedException(
subcloud=strategy_step.subcloud.name,
@ -125,9 +135,16 @@ class PreCheckState(BaseState):
for alarm in alarms:
if alarm.alarm_id not in alarm_ignore_list:
if alarm.mgmt_affecting == "True":
details = "System health check failed due to alarm %s. " \
"Please run 'system health-query' " \
"command on the subcloud for more details." % alarm.alarm_id
error_desc_msg = ("System health check failed due to alarm %s. "
"System health: \n %s" %
(alarm.alarm_id, system_health))
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
error_description=error_desc_msg[0:consts.ERROR_DESCRIPTION_LENGTH])
details = ("System health check failed due to alarm %s. "
"Please run 'system health-query' "
"command on the subcloud or %s on central for details." %
(alarm.alarm_id, consts.ERROR_DESC_CMD))
self.error_log(strategy_step, "\n" + system_health)
raise PreCheckFailedException(
subcloud=strategy_step.subcloud.name,
@ -135,9 +152,16 @@ class PreCheckState(BaseState):
)
else:
# Multiple failures
details = "System health check failed due to multiple failures. " \
"Please run 'system health-query' command on the " \
"subcloud for more details."
error_desc_msg = ("System health check failed due to multiple failures. "
"Health: \n %s" %
(system_health))
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
error_description=error_desc_msg[0:consts.ERROR_DESCRIPTION_LENGTH])
details = ("System health check failed due to multiple failures. "
"Please run 'system health-query' command on the "
"subcloud or %s on central for details." %
(consts.ERROR_DESC_CMD))
self.error_log(strategy_step, "\n" + system_health)
raise PreCheckFailedException(
subcloud=strategy_step.subcloud.name,

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2020-2021 Wind River Systems, Inc.
# Copyright (c) 2020-2022 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -201,11 +201,12 @@ class UpgradingSimplexState(BaseState):
if not subcloud.data_install:
# Set the deploy status to pre-install-failed so it can be
# handled accordingly in pre check step.
message = ("Failed to get upgrade data from install")
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
deploy_status=consts.DEPLOY_STATE_PRE_INSTALL_FAILED)
deploy_status=consts.DEPLOY_STATE_PRE_INSTALL_FAILED,
error_description=message)
message = ("Failed to get upgrade data from install")
self.warn_log(strategy_step, message)
raise Exception(message)
@ -337,6 +338,8 @@ class UpgradingSimplexState(BaseState):
def perform_subcloud_install(self, strategy_step, session, install_values):
log_file = os.path.join(consts.DC_ANSIBLE_LOG_DIR, strategy_step.subcloud.name) + \
'_playbook_output.log'
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
deploy_status=consts.DEPLOY_STATE_PRE_INSTALL)
@ -350,7 +353,8 @@ class UpgradingSimplexState(BaseState):
except Exception as e:
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
deploy_status=consts.DEPLOY_STATE_PRE_INSTALL_FAILED)
deploy_status=consts.DEPLOY_STATE_PRE_INSTALL_FAILED,
error_description=str(e)[0:consts.ERROR_DESCRIPTION_LENGTH])
self.error_log(strategy_step, str(e))
# TODO(jkung): cleanup to be implemented within SubcloudInstall
install.cleanup()
@ -379,9 +383,15 @@ class UpgradingSimplexState(BaseState):
try:
install.install(consts.DC_ANSIBLE_LOG_DIR, install_command)
except Exception as e:
# Detailed error message for subcloud error description field.
# Exception message for strategy_step detail.
msg = utils.find_ansible_error_msg(
strategy_step.subcloud.name, log_file, consts.DEPLOY_STATE_INSTALLING)
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
deploy_status=consts.DEPLOY_STATE_INSTALL_FAILED)
deploy_status=consts.DEPLOY_STATE_INSTALL_FAILED,
error_description=msg[0:consts.ERROR_DESCRIPTION_LENGTH])
self.error_log(strategy_step, msg)
self.error_log(strategy_step, str(e))
install.cleanup()
raise